# Example: build a document-term matrix (DTM) from movie reviews and compute
# TF-IDF weights two ways, then check that both give the same result.
# NOTE(review): itoken(), vocabulary(), prune_vocabulary(), etc. are presumably
# provided by the text2vec package, which must be attached before running.

# Load the dataset used below (the name must match the object read next).
data("movie_review")
txt <- movie_review[["review"]][1:1000]

# First pass over the text: collect the vocabulary.
it <- itoken(txt, tolower, word_tokenizer)
vocab <- vocabulary(it)

# Remove very common and very uncommon words.
pruned_vocab <- prune_vocabulary(
  vocab,
  term_count_min = 10,
  doc_proportion_max = 0.8,
  doc_proportion_min = 0.001,
  max_number_of_terms = 20000
)

# Second pass: a fresh iterator is created before building the corpus
# (presumably because the first one was consumed by vocabulary() -- confirm).
it <- itoken(txt, tolower, word_tokenizer)
corpus <- create_vocab_corpus(it, pruned_vocab)
dtm <- get_dtm(corpus, type = "dgCMatrix")

# Term-frequency scaling: left-multiply the DTM by the TF scaling matrix.
tf_scale_matrix <- dtm_get_tf(dtm, type = "tf")
dtm_tf <- tf_scale_matrix %*% dtm

# TF-IDF: right-multiply the TF-scaled DTM by the IDF matrix.
# (The original line multiplied the function object `dtm_get_tf` by an
# undefined variable `m`, which fails at runtime.)
dtm_tf_idf <- dtm_tf %*% dtm_get_idf(dtm)

# The same result can be obtained with transform() and type = "tfidf".
dtm_tf_idf_2 <- transform(dtm, type = "tfidf")
identical(dtm_tf_idf, dtm_tf_idf_2)
# Run the code above in your browser using DataLab