
Last chance! 50% off unlimited learning
Sale ends in
The most common usage for removeFeatures
will be to eliminate stop words from a text
or text-based object, or to select only features from a list of regular
expressions.
selectFeatures(x, features, ...)
## S3 method for class 'dfm':
selectFeatures(x, features, selection = c("keep", "remove"),
valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
verbose = TRUE, ...)
## S3 method for class 'tokenizedTexts':
selectFeatures(x, features, selection = c("keep",
"remove"), valuetype = c("glob", "regex", "fixed"),
case_insensitive = TRUE, verbose = TRUE, ...)
## S3 method for class 'collocations':
selectFeatures(x, features, selection = c("keep",
"remove"), valuetype = c("glob", "regex", "fixed"),
case_insensitive = TRUE, verbose = TRUE, pos = 1:3, ...)
...: further arguments passed to stri_detect_regex. (This is how
case_insensitive is passed, but you may wish to pass others.)
valuetype: "fixed" for words as is; "regex" for regular expressions; or
"glob" for "glob"-style wildcards
TRUE
If TRUE, print message about how many features were removed
pos is a stopword
features
is a dfm-class object, then the returned object
will be identical in its feature set to the dfm supplied as the
features argument. This means that any features in x not in features
will be discarded, and that any features found in the dfm supplied as
features but not found in x will be added with all zero counts. This is
useful when you have trained a model on one dfm, and need to project this
onto a test set whose features must be identical.
removeFeatures, trim
# Build a small dfm from two texts, keeping the original letter case.
myDfm <- dfm(
  c(
    "My Christmas was ruined by your opposition tax plan.",
    "Does the United_States or Sweden have more progressive taxation?"
  ),
  toLower = FALSE,
  verbose = FALSE
)

# Dictionary whose entries serve as the features to select on; the
# "notintext" key holds a word that appears in neither text.
mydict <- dictionary(
  list(
    countries = c("United_States", "Sweden", "France"),
    wordsEndingInY = c("by", "my"),
    notintext = "blahblah"
  )
)

# Dictionary-based selection, first with the default case handling and
# then with case-sensitive matching.
selectFeatures(myDfm, mydict)
selectFeatures(myDfm, mydict, case_insensitive = FALSE)

# The same two patterns: once without an explicit valuetype, then
# explicitly as regular expressions (keeping, then removing, matches).
selectFeatures(myDfm, c("s$", ".y"), selection = "keep")
selectFeatures(myDfm, c("s$", ".y"), selection = "keep", valuetype = "regex")
selectFeatures(myDfm, c("s$", ".y"), selection = "remove", valuetype = "regex")

# English stop words matched as fixed strings: keep them, then drop them.
selectFeatures(
  myDfm,
  stopwords("english"),
  selection = "keep",
  valuetype = "fixed"
)
selectFeatures(
  myDfm,
  stopwords("english"),
  selection = "remove",
  valuetype = "fixed"
)

# selecting on a dfm
textVec1 <- c(
  "This is text one.",
  "This, the second text.",
  "Here: the third text."
)
textVec2 <- c("Here are new words.", "New words in this text.")

dfm1 <- dfm(textVec1)
features(dfm1)

dfm2a <- dfm(textVec2)
features(dfm2a)

# Passing a dfm as `features` selects on the feature set of dfm1.
dfm2b <- selectFeatures(dfm2a, dfm1)
dfm2b
identical(features(dfm1), features(dfm2b))

# Selection on tokenized texts.
toks <- tokenize(
  c(
    "This is some example text from me.",
    "More of the example text."
  ),
  removePunct = TRUE
)
selectFeatures(toks, stopwords("english"), selection = "remove")
selectFeatures(toks, "ex", selection = "keep", valuetype = "regex")

## example for collocations
myCollocs <- collocations(inaugTexts[1:3], n = 20)
myCollocs
selectFeatures(myCollocs, stopwords("english"), selection = "remove")
Run the code above in your browser using DataLab