tdm: tm Package Compatibility Tools: Apply to or Convert to/from Term Document Matrix or Document Term Matrix

Description

tdm - Create term document matrices from raw text or wfm for use with other text analysis packages. dtm - Create document term matrices from raw text or wfm for use with other text analysis packages. tm2qdap - Convert the tm package's TermDocumentMatrix/DocumentTermMatrix to wfm. apply_as_tm - Apply functions intended to be used on the tm package's TermDocumentMatrix to a wfm object. tm_corpus2df - Convert a tm package corpus to a dataframe. tm_corpus2wfm - Convert a Corpus package corpus to a wfm. df2tm_corpus - Convert a qdap dataframe to a tm package Corpus. Apply a tm Corpus as a qdap dataframe.

Usage

tdm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)

dtm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)

tm2qdap(x)

apply_as_tm(wfm.obj, tmfun, ..., to.qdap = TRUE)

tm_corpus2df(tm.corpus, col1 = "docs", col2 = "text")

tm_corpus2wfm(tm.corpus, col1 = "docs", col2 = "text", ...)

df2tm_corpus(text.var, grouping.var = NULL, ...)

apply_as_df(tm.corpus, qdapfun, ...)

Arguments

text.var

The text variable or a wfm object.

grouping.var

The grouping variables. Default NULL generates one word list for all text. Also takes a single grouping variable or a list of 1 or more grouping variables.

...

If tdm or dtm - Other arguments passed to wfm. If apply_as_tm - Other arguments passed to functions used on the tm package's "TermDocumentMatrix". If df2tm_corpus

vowel.check

logical. Should terms without vowels be remove?

A TermDocumentMatrix/DocumentTermMatrix.

wfm.obj

A wfm object.

tmfun

A function applied to a TermDocumentMatrix object.

to.qdap

logical. If TRUE should wfm try to coerce the output back to a qdap object.

tm.corpus

A Corpus object.

col1

Name for column 1 (the vector elements).

col2

Name for column 2 (the names of the vectors).

qdapfun

A qdap function that is usually used on text.variable ~ grouping variable.

Value

tdm - Returns a TermDocumentMatrix. dtm - Returns a DocumentTermMatrix. tm2qdap - Returns a wfm object or weight object. apply_as_tm - Applies a tm oriented function to a wfm and attempts to simplify back to a wfm or weight format. tm_corpus2df - Converts a Corpus and returns a qdap oriented dataframe. df2tm_wfm - Converts a qdap oriented dataframe and returns a wfm. df2tm_corpus - Converts a qdap oriented dataframe and returns a Corpus.

Details

Produces output that is identical to the tm package's TermDocumentMatrix, DocumentTermMatrix, Corpus or allows convenient interface between the qdap and tm packages.

Examples

Run this code

dtm(DATA$state, DATA$person)
tdm(DATA$state, DATA$person)

x <- wfm(DATA$state, DATA$person)
tdm(x)
dtm(x)
library(tm)
plot(tdm(x))

pres <- tdm(pres_debates2012$dialogue, pres_debates2012$person)
plot(pres, corThreshold = 0.8)
pres
(pres2 <- removeSparseTerms(pres, .3))
plot(pres2, corThreshold = 0.95)

## Latent Semantic Analysis
library(lsa)
lsa(tdm(x), dims=dimcalc_share())
lsa(tdm(DATA$state, DATA$person), dims=dimcalc_share())

shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
    pres_debates2012)[,1]) < 4]

SW <- c(shorts, qdapDictionaries::contractions[, 1],
    qdapDictionaries::Top200Words,
    "governor", "president", "mister", "obama","romney")

DocTermMat2 <- with(pres_debates2012, dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,]

out <- lsa(DocTermMat2, 6)
out$tk
out2 <- colsplit2df(matrix2df(out$tk), new.names = qcv(Person, Time))
out2$Person <- factor(out2$Person,
    levels = names(sort(colSums(with(pres_debates2012,
        wfm(dialogue, person, stopwords = SW))), TRUE))
)
colnames(out2) <- gsub("X", "Topic ", colnames(out2))

qheat(out2, facet.vars = "Time", high="darkgreen", plot=FALSE) +
    theme(legend.title=element_blank()) +
    guides(fill = guide_colorbar(barwidth = .5, barheight = 12))

## Correspondence Analysis
library(ca)

dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]

speech <- stemmer(dat$dialogue)
mytable1 <- with(dat, tdm(speech, list(person, time), stopwords = Top25Words))

fit <- ca(mytable1)
summary(fit)
plot(fit)
plot3d.ca(fit, labels=1)


mytable2 <- with(dat, tdm(speech, list(person, time), stopwords = Top200Words))

fit2 <- ca(mytable2)
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels=1)

## Topic Models
# Example 1 #
library(topicmodels); library(tm)

# Generate stop words based on short words, frequent words and contractions
shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
    pres_debates2012)[,1]) < 4]

SW <- c(shorts, qdapDictionaries::contractions[, 1],
    qdapDictionaries::Top200Words,
    "governor", "president", "mister", "obama","romney")

DocTermMat <- with(pres_debates2012, dtm(dialogue, person, stopwords = SW))
DocTermMat <- removeSparseTerms(DocTermMat,0.999)
DocTermMat <- DocTermMat[rowSums(as.matrix(DocTermMat))> 0,]

lda.model <- LDA(DocTermMat, 5)

(topics <- posterior(lda.model, DocTermMat)$topics)
terms(lda.model,20)

# Plot the Topics Per Person
topic.dat <- matrix2df(topics, "Person")
colnames(topic.dat)[-1] <- paste2(t(terms(lda.model,20)), sep=", ")

library(reshape2)
mtopic <- melt(topic.dat, variable="Topic", value.name="Proportion")
ggplot(mtopic, aes(weight=Proportion, x=Topic, fill=Topic)) +
    geom_bar() +
    coord_flip() +
    facet_grid(Person~.) +
    guides(fill=FALSE)

# Example 2 #
DocTermMat2 <- with(pres_debates2012, dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,]

lda.model2 <- LDA(DocTermMat2, 6)

(topics2 <- posterior(lda.model2, DocTermMat2)$topics)
terms(lda.model2,20)
qheat(topics2, high="blue", low="yellow", by.col=FALSE)

# Example 3 #
lda.model3 <- LDA(DocTermMat2, 10)

(topics3 <- posterior(lda.model3, DocTermMat2)$topics)
terms(lda.model3, 20)
qheat(topics3, high="blue", low="yellow", by.col=FALSE)

# Plot the Topics Per Person
topic.dat3 <- matrix2df(topics3, "Person&Time")
colnames(topic.dat3)[-1] <- paste2(t(terms(lda.model3, 10)), sep=", ")
topic.dat3 <- colsplit2df(topic.dat3)

library(reshape2)
library(scales)
mtopic3 <- melt(topic.dat3, variable="Topic", value.name="Proportion")
(p1 <- ggplot(mtopic3, aes(weight=Proportion, x=Topic, fill=Topic)) +
    geom_bar() +
    coord_flip() +
    facet_grid(Person~Time) +
    guides(fill=FALSE) +
    scale_y_continuous(labels = percent) +
    theme(plot.margin = unit(c(1, 0, 0.5, .5), "lines")) +
    ylab("Proportion"))

mtopic3.b <- mtopic3
mtopic3.b[, "Topic"] <- factor(as.numeric(mtopic3.b[, "Topic"]), levels = 1:10)
mtopic3.b[, "Time"] <- factor(gsub("time ", "", mtopic3.b[, "Time"]))

p2 <- ggplot(mtopic3.b, aes(x=Time, y=Topic, fill=Proportion)) +
    geom_tile(color = "white") +
    scale_fill_gradient(low = "grey70", high = "red") +
    facet_grid(Person~Time, scales = "free") +
    theme(axis.title.y = element_blank(),
        axis.text.x= element_text(colour="white"),
        axis.ticks.x= element_line(colour="white"),
        axis.ticks.y = element_blank(),
        axis.text.y= element_blank(),
        plot.margin = unit(c(1, -.5, .5, -.9), "lines")
)

library(gridExtra)
grid.arrange(p1, p2, nrow=1, widths = c(.85, .15))

## tm Matrices to wfm
library(tm)
data(crude)

## A Term Document Matrix Conversion
(tm_in <- TermDocumentMatrix(crude, control = list(stopwords = TRUE)))
converted <- tm2qdap(tm_in)
head(converted)
summary(converted)

## A Document Term Matrix Conversion
(dtm_in <- DocumentTermMatrix(crude, control = list(stopwords = TRUE)))
summary(tm2qdap(dtm_in))

## `apply_as_tm` Examples
## Create a wfm
a <- with(DATA, wfm(state, list(sex, adult)))
summary(a)

## Apply functions meant for a tm TermDocumentMatrix
out <- apply_as_tm(a, tm:::removeSparseTerms, sparse=0.6)
summary(out)

apply_as_tm(a, tm:::dissimilarity, method = "cosine")
apply_as_tm(a, tm:::findAssocs, "computer", .8)
apply_as_tm(a, tm:::findFreqTerms, 2, 3)
apply_as_tm(a, tm:::Zipf_plot)
apply_as_tm(a, tm:::Heaps_plot)
apply_as_tm(a, tm:::plot.TermDocumentMatrix, corThreshold = 0.4)

library(proxy)
apply_as_tm(a, tm:::weightBin)
apply_as_tm(a, tm:::weightBin, to.qdap = FALSE)
apply_as_tm(a, tm:::weightSMART)
apply_as_tm(a, tm:::weightTfIdf)

## Convert tm Corpus to Dataframe
## A tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
    readerControl = list(reader = readReut21578XML))

## Convert to dataframe
corp_df <- tm_corpus2df(reuters)
htruncdf(corp_df)

## Apply a qdap function
out <- formality(corp_df$text, corp_df$docs)
plot(out)

## Convert a qdap dataframe to tm package Corpus
(x <- with(DATA2, df2tm_corpus(state, list(person, class, day))))
library(tm)
inspect(x)
class(x)

(y <- with(pres_debates2012, df2tm_corpus(dialogue, list(person, time))))

## Apply qdap functions meant for dataframes from sentSplit to tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
    readerControl = list(reader = readReut21578XML))

apply_as_df(reuters, word_stats)
apply_as_df(reuters, formality)
apply_as_df(reuters, word_list)
apply_as_df(reuters, polarity)
apply_as_df(reuters, Dissimilarity)
apply_as_df(reuters, diversity)
apply_as_df(reuters, pos_by)
apply_as_df(reuters, flesch_kincaid)
apply_as_df(reuters, trans_venn)
apply_as_df(reuters, gantt_plot)
apply_as_df(reuters, rank_freq_mplot)
apply_as_df(reuters, termco,
    match.list = list(
        oil = qcv(oil, Texas, crude),
        money = c("economic", "money")
    ))
plot(apply_as_df(reuters, termco,
    match.list = list(
        oil = qcv(oil, Texas, crude),
        money = c("economic", "money")
    ), elim.old = FALSE), values = TRUE, high="red")
apply_as_df(reuters, word_cor,
    word = qcv(oil, Texas, crude, economic, money)
)
plot(apply_as_df(reuters, word_cor,
    word = qcv(oil, Texas, crude, economic, money)
))

Run the code above in your browser using DataLab