## Not run: ------------------------------------
# data(SOTUCorpus, package = "quantedaData")
# toks <- tokenize(SOTUCorpus, remove_punct = TRUE)
# # toks <- tokenize(tokenize(SOTUCorpus, what='sentence', simplify = TRUE), remove_punct = TRUE)
# # head to head, old v. new
# system.time(selectFeaturesOLD(toks, stopwords("english"), "remove", verbose = FALSE))
# system.time(selectFeatures(toks, stopwords("english"), "remove", verbose = FALSE))
# system.time(selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"))
# system.time(selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"))
# microbenchmark::microbenchmark(
#     old = selectFeaturesOLD(toks, stopwords("english"), "remove", verbose = FALSE),
#     new = selectFeatures(toks, stopwords("english"), "remove", verbose = FALSE),
#     times = 5, unit = "relative")
# microbenchmark::microbenchmark(
#     old = selectFeaturesOLD(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"),
#     new = selectFeatures(toks, c("and", "of"), "remove", verbose = FALSE, valuetype = "regex"),
#     times = 2, unit = "relative")
#     
# types <- unique(unlist(toks))
# numbers <- types[stringi::stri_detect_regex(types, '[0-9]')]
# microbenchmark::microbenchmark(
#     old = selectFeaturesOLD(toks, numbers, "remove", verbose = FALSE, valuetype = "fixed"),
#     new = selectFeatures(toks, numbers, "remove", verbose = FALSE, valuetype = "fixed"),
#     times = 2, unit = "relative")  
#     
# # removing tokens before dfm, versus after
# microbenchmark::microbenchmark(
#     pre = dfm(selectFeaturesOLD(toks, stopwords("english"), "remove"), verbose = FALSE),
#     post = dfm(toks, remove = stopwords("english"), verbose = FALSE),
#     times = 5, unit = "relative")
# 
# 
# ## with simple examples
# toks <- tokenize(c("This is a sentence.", "This is a second sentence."), 
#                  remove_punct = TRUE)
# selectFeatures(toks, c("is", "a", "this"), selection = "remove", 
#                 valuetype = "fixed", padding = TRUE, case_insensitive = TRUE)
# 
# # how case_insensitive works
# selectFeatures(toks, c("is", "a", "this"), selection = "remove", 
#                valuetype = "fixed", padding = TRUE, case_insensitive = FALSE)
# selectFeatures(toks, c("is", "a", "this"), selection = "remove", 
#                valuetype = "fixed", padding = TRUE, case_insensitive = TRUE)
# selectFeatures(toks, c("is", "a", "this"), selection = "remove", 
#                valuetype = "glob", padding = TRUE, case_insensitive = TRUE)
# selectFeatures(toks, c("is", "a", "this"), selection = "remove", 
#                valuetype = "glob", padding = TRUE, case_insensitive = FALSE)
# 
# # with longer texts
# toks <- tokenize(data_corpus_inaugural[1:2])
# selectFeatures(toks, stopwords("english"), "remove")
# selectFeatures(toks, stopwords("english"), "keep")
# selectFeatures(toks, stopwords("english"), "remove", padding = TRUE)
# selectFeatures(toks, stopwords("english"), "keep", padding = TRUE)
# selectFeatures(tokenize(data_corpus_inaugural[2]), stopwords("english"), "remove", padding = TRUE)
## ---------------------------------------------
## Not run: ------------------------------------
# toksh <- tokens(c(doc1 = "This is a SAMPLE text", doc2 = "this sample text is better"))
# feats <- c("this", "sample", "is")
# # keeping features
# selectFeatures(toksh, feats, selection = "keep")
# selectFeatures(toksh, feats, selection = "keep", padding = TRUE)
# selectFeatures(toksh, feats, selection = "keep", case_insensitive = FALSE)
# selectFeatures(toksh, feats, selection = "keep", padding = TRUE, case_insensitive = FALSE)
# # removing features
# selectFeatures(toksh, feats, selection = "remove")
# selectFeatures(toksh, feats, selection = "remove", padding = TRUE)
# selectFeatures(toksh, feats, selection = "remove", case_insensitive = FALSE)
# selectFeatures(toksh, feats, selection = "remove", padding = TRUE, case_insensitive = FALSE)
## ---------------------------------------------
 
## Not run: ------------------------------------
# ## example for collocations
# (myCollocs <- collocations(data_corpus_inaugural[1:3], n=20))
# selectFeatures(myCollocs, stopwords("english"), "remove")
## ---------------------------------------------
# Run the code above in your browser using DataLab