Last chance! 50% off unlimited learning
Sale ends in
dfm(x, tolower = TRUE, stem = FALSE, select = NULL, remove = NULL, thesaurus = NULL, dictionary = NULL, valuetype = c("glob", "regex", "fixed"), groups = NULL, verbose = FALSE, ...)
If TRUE, stem words.
select = "@*"
and make
sure that removeTwitter = FALSE
as an additional argument passed to
tokenize. Note: select = "^@\\w+\\b"
would be the regular
expression version of this matching pattern. The pattern matching type
will be set by valuetype.
stopwords()
. The pattern matching type will be set by
valuetype
. For behaviour of remove
with ngrams > 1
,
see Details.
dfm_lookup
after creating the
dfm.
"glob"
for
"glob"-style wildcard expressions; "regex"
for regular expressions;
or "fixed"
for exact matching. See valuetype for details.
If x is a dfm object, groups
provides a convenient
and fast method of combining and refactoring the documents of the dfm
according to the groups.
TRUE
The default behaviour of remove/select when constructing ngrams using dfm(x, ngrams > 1) is to remove/select any ngram constructed from a matching feature. If you wish to remove these features before constructing ngrams, you will need to first tokenize the texts, then remove the features to be ignored, then form the ngrams from the remaining tokens, and finally construct the dfm from this modified tokens object. See the code examples for an illustration.
dfm_select, dfm-class
## for a corpus
# Subset data_corpus_inaugural to the documents whose Year docvar exceeds 1980.
corpus_post80inaug <- corpus_subset(
  data_corpus_inaugural,
  Year > 1980
)
# Default construction, then the same corpus without lowercasing.
dfm(corpus_post80inaug)
dfm(corpus_post80inaug, tolower = FALSE)

# grouping documents by docvars in a corpus
dfm(
  corpus_post80inaug,
  groups = "President",
  verbose = TRUE
)

# with English stopwords and stemming
dfm(
  corpus_post80inaug,
  remove = stopwords("english"),
  stem = TRUE,
  verbose = TRUE
)

# stemming combined with bigrams (ngrams = 2) on a plain character input
dfm(
  "Banking industry",
  stem = TRUE,
  ngrams = 2,
  verbose = FALSE
)
# with dictionaries
# Subset data_corpus_inaugural to the documents whose Year docvar exceeds 1900.
corpus_post1900inaug <- corpus_subset(data_corpus_inaugural, Year > 1900)
# One dictionary key per entry; "tax*" is a wildcard pattern.
mydict <- dictionary(
  list(
    christmas = c("Christmas", "Santa", "holiday"),
    opposition = c("Opposition", "reject", "notincorpus"),
    taxing = "taxing",
    taxation = "taxation",
    taxregex = "tax*",
    country = "states"
  )
)
dfm(corpus_post1900inaug, dictionary = mydict)

# with the thesaurus feature
mytexts <- c(
  "The new law included a capital gains tax, and an inheritance tax.",
  "New York City has raised a taxes: an income tax and a sales tax."
)
# NOTE: this overwrites the mydict created for the dictionary example.
mydict <- dictionary(
  list(tax = c("tax", "income tax", "capital gains tax", "inheritance tax"))
)
# The thesaurus values are the dictionary phrases with whitespace replaced by "_".
dfm(
  phrasetotoken(mytexts, mydict),
  thesaurus = lapply(mydict, function(phrase) gsub("\\s", "_", phrase))
)

# pick up "taxes" with "tax" as a regex
dfm(
  phrasetotoken(mytexts, mydict),
  thesaurus = list(anytax = "tax"),
  valuetype = "regex"
)
# removing stopwords
# The sentence is written across two source lines, so the string literal
# contains an embedded newline.
testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with
the newspaper from a boy named Seamus, in his mouth."
testCorpus <- corpus(testText)
# Pass the stopword list to `remove`, not `select`: `select` keeps only the
# matching features, which would retain the stopwords instead of dropping them.
# note: "also" is not in the default stopwords("english"), so it is not removed
featnames(dfm(testCorpus, remove = stopwords("english")))
# for ngrams: removal applies to any ngram constructed from a matching feature
featnames(dfm(testCorpus, ngrams = 2, remove = stopwords("english"), removePunct = TRUE))
featnames(dfm(testCorpus, ngrams = 1:2, remove = stopwords("english"), removePunct = TRUE))
## removing stopwords before constructing ngrams
# Step 1: lowercase the raw text and tokenize it without punctuation.
tokensAll <- tokens(
  char_tolower(testText),
  removePunct = TRUE
)
# Step 2: drop the English stopwords from the unigram tokens.
tokensNoStopwords <- removeFeatures(
  tokensAll,
  stopwords("english")
)
# Step 3: form bigrams from the tokens that remain.
tokensNgramsNoStopwords <- tokens_ngrams(
  tokensNoStopwords,
  2
)
# Step 4: build the dfm from the bigram tokens and list its feature names.
featnames(
  dfm(tokensNgramsNoStopwords, verbose = FALSE)
)
# keep only certain words
# keep only words ending in "s" (default pattern matching)
dfm(
  testCorpus,
  select = "*s",
  verbose = FALSE
)
# the same selection written as a regular expression
dfm(
  testCorpus,
  select = "s$",
  valuetype = "regex",
  verbose = FALSE
)
# testing Twitter functions
testTweets <- c(
  "My homie @justinbieber #justinbieber shopping in #LA yesterday #beliebers",
  "2all the ha8ers including my bro #justinbieber #emabiggestfansjustinbieber",
  "Justin Bieber #justinbieber #belieber #fetusjustin #EMABiggestFansJustinBieber"
)
# keep only hashtags; removeTwitter = FALSE is passed along with the pattern
dfm(
  testTweets,
  select = "#*",
  removeTwitter = FALSE
)
# the same hashtag selection written as a regular expression
dfm(
  testTweets,
  select = "^#.*$",
  valuetype = "regex",
  removeTwitter = FALSE
)
# for a dfm
# Build a dfm from the corpus first, then pass that dfm back to dfm() with groups.
dfm1 <- dfm(data_corpus_irishbudget2010)
# Documents whose "party" docvar is "FF" or "Green" are labelled "Govt";
# every other document is labelled "Opposition".
dfm2 <- dfm(
  dfm1,
  groups = ifelse(
    docvars(data_corpus_irishbudget2010, "party") %in% c("FF", "Green"),
    "Govt",
    "Opposition"
  ),
  tolower = FALSE,
  verbose = TRUE
)
Run the code above in your browser using DataLab