# Compare tokens built from a character vector with tokens built from a
# corpus subset; identical() reports whether the two results match
tokensFromChar <- tokenize(inaugTexts[1:3])
tokensFromCorp <- tokenize(subset(inaugCorpus, Year < 1798))
identical(tokensFromChar, tokensFromCorp)
str(tokensFromChar)

# Result returned as a list: show the first 10 tokens of its first element
head(tokenize(inaugTexts[57])[[1L]], n = 10)

# Result returned as a character vector when simplify = TRUE
head(tokenize(inaugTexts[57], simplify = TRUE), n = 10)

# First 30 tokens with cpp = TRUE ...
head(tokenize(inaugTexts[57], simplify = TRUE, cpp = TRUE), n = 30)
## ... NOTE: not the same as with cpp = FALSE
head(tokenize(inaugTexts[57], simplify = TRUE, cpp = FALSE), n = 30)

## More cpp = TRUE versus cpp = FALSE comparisons
# Twitter-style text, keeping Twitter characters (removeTwitter = FALSE)
tokenize(
  "this is MY <3 4U @myhandle gr8 stuff :-)",
  removeTwitter = FALSE, cpp = TRUE
)
tokenize(
  "this is MY <3 4U @myhandle gr8 stuff :-)",
  removeTwitter = FALSE, cpp = FALSE
)

# Text containing a URL, with removeURL = FALSE
tokenize("great website http://textasdata.com", removeURL = FALSE, cpp = TRUE)
tokenize("great website http://textasdata.com", removeURL = FALSE, cpp = FALSE)

# Same text, with removeURL = TRUE
tokenize("great website http://textasdata.com", removeURL = TRUE, cpp = TRUE)
tokenize("great website http://textasdata.com", removeURL = TRUE, cpp = FALSE)
# Run the code above in your browser using DataLab