selectFeatures: select features from a text-based object

This function selects or removes features from a variety of text-based objects, such as a dfm, tokenized texts, or collocations. The most common usage for removeFeatures will be to eliminate stop words from a text or text-based object, or to select only features from a list of regular expressions.
Usage

selectFeatures(x, features, ...)

# S3 method for dfm
selectFeatures(x, features, selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  verbose = FALSE, ...)

# S3 method for tokenizedTexts
selectFeatures(x, features, selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  padding = FALSE, indexing = FALSE, verbose = FALSE, ...)

# S3 method for tokens
selectFeatures(x, features, selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  padding = FALSE, ...)

# S3 method for collocations
selectFeatures(x, features, selection = c("keep", "remove"),
  valuetype = c("glob", "regex", "fixed"), case_insensitive = TRUE,
  verbose = TRUE, pos = 1:3, ...)
Arguments

x: the object (a dfm, tokenizedTexts, tokens, or collocations object) whose features will be selected.

features: the features to select or remove: a character vector of patterns, or a dfm whose feature names will supply the features for selection.

selection: whether to "keep" or "remove" the matched features.

valuetype: how to interpret the feature patterns: "glob" for "glob"-style wildcard expressions; "regex" for regular expressions; or "fixed" for exact matching. See valuetype for details.

case_insensitive: ignore case when matching, if TRUE.

verbose: if TRUE, print a message about how many features were removed.

padding: (for tokenizedTexts objects) if TRUE, leave an empty string where the removed tokens previously existed. This is useful if a positional match is needed between the pre- and post-selected features, for instance if a window of adjacency needs to be computed.

pos: (for collocations objects) the word position(s) within each collocation to be tested, for instance when removing a collocation because the word at one of these positions is a stopword.

...: supplementary arguments passed to the underlying functions in stri_detect_regex. (This is how case_insensitive is passed, but you may wish to pass others.)
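To illustrate how the valuetype options differ, here is a short sketch, again assuming a quanteda version that provides tokenize() and selectFeatures():

library(quanteda)  # assumes quanteda ~0.9.x API

toks <- tokenize("Sentence one has 3 words; sentence two has 44.", removePunct = TRUE)

# "glob": wildcard patterns, e.g. keep any feature beginning with "sent"
selectFeatures(toks, "sent*", selection = "keep", valuetype = "glob")

# "regex": regular expressions, e.g. remove any feature containing a digit
selectFeatures(toks, "[0-9]", selection = "remove", valuetype = "regex")

# "fixed": exact matches only; padding = TRUE leaves "" where tokens were removed
selectFeatures(toks, c("sentence", "has"), selection = "remove",
               valuetype = "fixed", padding = TRUE)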
Details

When features is a dfm object, the returned object will be identical in its feature set to the dfm supplied as the features argument. This means that any features in x not in features will be discarded, and that any features found in the dfm supplied as features but not found in x will be added with all zero counts. This is useful when you have trained a model on one dfm, and need to project this onto a test set whose features must be identical.
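A minimal sketch of this "project onto a test set" use, assuming a quanteda version in which dfm() and selectFeatures() behave as documented here:

library(quanteda)  # assumes quanteda ~0.9.x API

# training and test dfms with deliberately different vocabularies
dfmTrain <- dfm(c(d1 = "a b c d", d2 = "b c d e"), verbose = FALSE)
dfmTest  <- dfm(c(d3 = "a b c x", d4 = "b c y z"), verbose = FALSE)

# passing a dfm as 'features' aligns the test dfm to the training feature set:
# features only in dfmTest ("x", "y", "z") are dropped, and features only in
# dfmTrain ("d", "e") are added with zero counts
dfmTestAligned <- selectFeatures(dfmTest, dfmTrain)
colnames(dfmTestAligned)   # identical to colnames(dfmTrain)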
See Also

removeFeatures, dfm_trim

Examples
## Not run: ## performance comparisons
# data(SOTUCorpus, package = "quantedaData")
# toks <- tokenize(SOTUCorpus, removePunct = TRUE)
# # toks <- tokenize(tokenize(SOTUCorpus, what='sentence', simplify = TRUE), removePunct = TRUE)
# # head to head, old v. new
# system.time(selectFeaturesOLD(toks, stopwords("english"), "remove", verbose = FALSE))
# system.time(selectFeatures(toks, stopwords("english"), "remove", verbose = FALSE))
# system.time(selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"))
# system.time(selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"))
# microbenchmark::microbenchmark(
# old = selectFeaturesOLD(toks, stopwords("english"), "remove", verbose = FALSE),
# new = selectFeatures(toks, stopwords("english"), "remove", verbose = FALSE),
# times = 5, unit = "relative")
# microbenchmark::microbenchmark(
# old = selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"),
# new = selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"),
# times = 2, unit = "relative")
#
# types <- unique(unlist(toks))
# numbers <- types[stringi::stri_detect_regex(types, '[0-9]')]
# microbenchmark::microbenchmark(
# old = selectFeaturesOLD(toks, numbers, "remove", verbose = FALSE, valuetype = "fixed"),
# new = selectFeatures(toks, numbers, "remove", verbose = FALSE, valuetype = "fixed"),
# times = 2, unit = "relative")
#
# # removing tokens before dfm, versus after
# microbenchmark::microbenchmark(
# pre = dfm(selectFeaturesOLD(toks, stopwords("english"), "remove"), verbose = FALSE),
# post = dfm(toks, remove = stopwords("english"), verbose = FALSE),
# times = 5, unit = "relative")
# ## End(Not run)
## with simple examples
toks <- tokenize(c("This is a sentence.", "This is a second sentence."),
                 removePunct = TRUE)
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "fixed", padding = TRUE, case_insensitive = TRUE)
# how case_insensitive works
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "fixed", padding = TRUE, case_insensitive = FALSE)
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "fixed", padding = TRUE, case_insensitive = TRUE)
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "glob", padding = TRUE, case_insensitive = TRUE)
selectFeatures(toks, c("is", "a", "this"), selection = "remove",
               valuetype = "glob", padding = TRUE, case_insensitive = FALSE)
# with longer texts
txts <- data_char_inaugural[1:2]
toks <- tokenize(txts)
selectFeatures(toks, stopwords("english"), "remove")
selectFeatures(toks, stopwords("english"), "keep")
selectFeatures(toks, stopwords("english"), "remove", padding = TRUE)
selectFeatures(toks, stopwords("english"), "keep", padding = TRUE)
selectFeatures(tokenize(data_char_inaugural[2]), stopwords("english"), "remove", padding = TRUE)
toksh <- tokens(c(doc1 = "This is a SAMPLE text", doc2 = "this sample text is better"))
feats <- c("this", "sample", "is")
# keeping features
selectFeatures(toksh, feats, selection = "keep")
selectFeatures(toksh, feats, selection = "keep", padding = TRUE)
selectFeatures(toksh, feats, selection = "keep", case_insensitive = FALSE)
selectFeatures(toksh, feats, selection = "keep", padding = TRUE, case_insensitive = FALSE)
# removing features
selectFeatures(toksh, feats, selection = "remove")
selectFeatures(toksh, feats, selection = "remove", padding = TRUE)
selectFeatures(toksh, feats, selection = "remove", case_insensitive = FALSE)
selectFeatures(toksh, feats, selection = "remove", padding = TRUE, case_insensitive = FALSE)
## example for collocations
(myCollocs <- collocations(data_char_inaugural[1:3], n=20))
selectFeatures(myCollocs, stopwords("english"), "remove")