dfm. Uses the similarity measures defined in simil. See pr_DB for available
distance measures, or how to create your own.
similarity(x, selection = NULL, n = NULL, margin = c("documents", "features"), method = "correlation", sorted = TRUE, normalize = FALSE)
"similarity"(x, selection = NULL, n = NULL, margin = c("documents", "features"), method = "correlation", sorted = TRUE, normalize = FALSE)
"as.matrix"(x, ...)
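n: the n most similar items will be returned, sorted in descending order. If n is NULL, return all items.
margin: "documents" for documents or "features" for word/term features.
method: a similarity method available in pr_DB.
sorted: if TRUE, results are sorted in descending order.
normalize: to compute similarity on a normalized dfm (e.g. x), wrap it in weight(x, "relFreq").
# create a dfm from inaugural addresses from Reagan onwards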
# restrict the corpus to addresses delivered after 1980, then stem the terms
# and drop English stopwords while building the document-feature matrix
recent_inaugurals <- corpus_subset(data_corpus_inaugural, Year > 1980)
presDfm <- dfm(
  recent_inaugurals,
  stem = TRUE,
  remove = stopwords("english")
)

# pairwise document similarities; keep the result and print it
tmp <- similarity(presDfm, margin = "documents")
tmp

# the same result as a matrix
as.matrix(tmp)

# similarities for selected documents only
obama_docs <- c("2009-Obama", "2013-Obama")
similarity(presDfm, "1985-Reagan", n = 5, margin = "documents")
similarity(presDfm, obama_docs, n = 5, margin = "documents")
similarity(presDfm, obama_docs, margin = "documents")
similarity(presDfm, obama_docs, margin = "documents", method = "cosine")
similarity(
  presDfm,
  "2005-Bush",
  margin = "documents",
  method = "eJaccard",
  sorted = FALSE
)

# term similarities: up to 20 most similar features for the selected terms
similarity(
  presDfm,
  c("fair", "health", "terror"),
  n = 20,
  margin = "features",
  method = "cosine"
)
Run the code above in your browser using DataLab