
DocumentTermMatrix/TermDocumentMatrix
Convenience functions to convert objects from different packages into either a tm::DocumentTermMatrix or tm::TermDocumentMatrix object. Grouping variables are used as the row/column names for the DocumentTermMatrix/TermDocumentMatrix.
as_dtm(x, weighting = tm::weightTf, docs = NULL, pos = TRUE, ...)
as_tdm(x, weighting = tm::weightTf, docs = NULL, pos = TRUE, ...)
x: A data object.
weighting: A weighting function capable of handling a tm::DocumentTermMatrix. It defaults to weightTf for term frequency weighting. Available weighting functions shipped with the tm package are weightTf, weightTfIdf, weightBin, and weightSMART.
docs: The vector of integers or character strings denoting document columns.
pos: logical. If TRUE, parts of speech will be used. If FALSE, the corresponding tokens will be used.
...: ignored.
Returns a tm::DocumentTermMatrix or tm::TermDocumentMatrix object.
# NOT RUN {
# Text input with a grouping vector. The grouping vector is passed as
# `docs =` by name: per the documented signature the second formal argument
# is `weighting`, so passing it positionally would bind it to the wrong
# argument.
# NOTE(review): assumes `dialogue` is a text column of
# partial_republican_debates_2015 -- confirm against the dataset.
with(
    partial_republican_debates_2015,
    as_dtm(
        dialogue,
        docs = paste(location, element_id, sentence_id, sep = "_")
    )
)

# Data frame input; `docs` names the columns that identify documents.
as_dtm(mtcars)
as_dtm(CO2, docs = c("Plant", "Type", "Treatment"))
# }
# NOT RUN {
## termco object to DTM/TDM
library(termco)

# Default weighting (tm::weightTf, i.e. term frequency)
as_dtm(markers)

# Same conversion with TF-IDF weighting supplied explicitly
as_dtm(markers, weighting = tm::weightTfIdf)

# TermDocumentMatrix counterpart
as_tdm(markers)
# Cosine distance between the rows of `x`, returned as a `dist` object.
#
# x    An object coercible by slam::as.simple_triplet_matrix().
# ...  Accepted but not used.
cosine_distance <- function (x, ...) {
    # Transpose so that the rows of the input become columns.
    by_column <- t(slam::as.simple_triplet_matrix(x))

    # Pairwise dot products between columns.
    dot_products <- slam::crossprod_simple_triplet_matrix(by_column)

    # Outer product of the squared column norms; its square root is the
    # product of the two norms for every pair of columns.
    squared_norms <- slam::col_sums(by_column^2)
    norm_products <- sqrt(squared_norms %*% t(squared_norms))

    # 1 - cosine similarity
    stats::as.dist(1 - dot_products / norm_products)
}
# Hierarchical clustering on the cosine distances of the markers DTM
mod <- hclust(
    cosine_distance(
        as_dtm(markers)
    )
)

# Dendrogram with the five-cluster solution outlined in red
plot(mod)
rect.hclust(mod, k = 5, border = "red")

# Cluster membership for the five-cluster cut (assigned, then printed)
clusters <- cutree(mod, k = 5)
clusters
## Parts of speech to DTM/TDM
library(tagger)
library(dplyr)
data(presidential_debates_2012_pos)

# Keep only the selected tags. The result is stored as `pos_text`, the name
# the calls below use (the original assigned it to `pos`, which left
# `pos_text` undefined).
pos_text <- presidential_debates_2012_pos %>%
    select_tags(c("NN", "NNP", "NNPS", "NNS"))

# pos = TRUE (the default) uses the parts of speech;
# pos = FALSE uses the corresponding tokens.
as_dtm(pos_text)
as_dtm(pos_text, pos = FALSE)
as_tdm(pos_text)
as_tdm(pos_text, pos = FALSE)

# Convert the full tagged data set after as_basic()
presidential_debates_2012_pos %>%
    as_basic() %>%
    as_dtm()
# }
Run the code above in your browser using DataLab