dfm
and return a standard dist
object.
textstat_dist(x, selection = character(0), n = NULL, margin = c("documents", "features"), method = "euclidean", upper = TRUE, diag = FALSE, p = 2)
textstat_simil(x, selection = character(0), n = NULL, margin = c("documents", "features"), method = "correlation", upper = FALSE, diag = FALSE)
n: the n highest-ranking items will be returned. If n is NULL, return all items.
margin: "documents" for documents or "features" for word/term features.
textstat_dist
options are: "euclidean" (default), "canberra", "Chisquared", "Chisquared2", "hamming", "kullback", "manhattan", "maximum", and "minkowski".
textstat_simil
options are: "correlation" (default), "cosine", "jaccard", "eJaccard", "dice", "eDice", "simple matching", "hamann", and "faith".
# Build a document-feature matrix from the inaugural addresses given after
# 1980 (Reagan onwards), removing English stopwords and stemming the terms.
recent_inaug <- corpus_subset(inaugCorpus, Year > 1980)
pres_dfm <- dfm(recent_inaug, remove = stopwords("english"), stem = TRUE)

# Document names reused as the selection in several comparisons below.
obama_docs <- c("2009-Obama", "2013-Obama")

## Distances ----

# All pairwise document distances (default method: "euclidean").
doc_dist <- textstat_dist(pres_dfm, margin = "documents")
doc_dist

# Distances from selected documents only; `n` keeps the top-ranking items.
textstat_dist(pres_dfm, "1985-Reagan", n = 5, margin = "documents")
textstat_dist(pres_dfm, obama_docs, n = 5, margin = "documents")
textstat_dist(pres_dfm, obama_docs, margin = "documents")
# NOTE(review): "eJaccard" is listed among the textstat_simil methods, not the
# textstat_dist ones -- confirm that textstat_dist accepts it.
textstat_dist(pres_dfm, "2005-Bush", margin = "documents", method = "eJaccard")

## Similarities ----

# All pairwise document similarities (default method: "correlation").
doc_simil <- textstat_simil(pres_dfm, margin = "documents")
doc_simil

# The same result converted to a list; show the first two elements.
as.list(doc_simil)[1:2]

# Similarities to selected documents only.
textstat_simil(pres_dfm, "1985-Reagan", n = 5, margin = "documents")
textstat_simil(pres_dfm, obama_docs, n = 5, margin = "documents")
textstat_simil(pres_dfm, obama_docs, margin = "documents")
textstat_simil(pres_dfm, obama_docs, margin = "documents", method = "cosine")

# Term (feature) similarities: the 20 top-ranking items for each chosen term.
textstat_simil(
  pres_dfm,
  c("fair", "health", "terror"),
  n = 20,
  margin = "features",
  method = "cosine"
)
Run the code above in your browser using DataLab