tokenize(x, ...)
## S3 method for class 'character':
tokenize(x, what = c("word", "sentence", "character",
"fastestword", "fasterword"), removeNumbers = FALSE, removePunct = FALSE,
removeSeparators = TRUE, removeTwitter = FALSE, removeHyphens = FALSE,
ngrams = 1L, skip = 0L, concatenator = "_", simplify = FALSE,
verbose = FALSE, ...)
## S3 method for class 'corpus':
tokenize(x, ...)
is.tokenizedTexts(x)
"word"
.
Available alternatives are c("character", "word", "line_break",
"sentence")
. See stringi-search-boundaries.
removePunct=FALSE
. Only
applicable for what = "character"
(when you wish to keep the Twitter characters @
and #); set to
FALSE
if you wish to eliminate these.
TRUE
, split words that are connected by
hyphenation and hyphenation-like characters in bet