Learn R Programming

qdap (version 1.2.0)

tdm: tm Package Compatibility Tools: Apply to or Convert to/from Term Document Matrix or Document Term Matrix

Description

tdm - Create term document matrices from raw text or wfm for use with other text analysis packages. dtm - Create document term matrices from raw text or wfm for use with other text analysis packages. tm2qdap - Convert the tm package's TermDocumentMatrix/DocumentTermMatrix to wfm. apply_as_tm - Apply functions intended to be used on the tm package's TermDocumentMatrix to a wfm object. tm_corpus2df - Convert a tm package corpus to a dataframe. tm_corpus2wfm - Convert a Corpus package corpus to a wfm. df2tm_corpus - Convert a qdap dataframe to a tm package Corpus. Apply a tm Corpus as a qdap dataframe.

Usage

tdm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)

dtm(text.var, grouping.var = NULL, vowel.check = TRUE, ...)

tm2qdap(x)

apply_as_tm(wfm.obj, tmfun, ..., to.qdap = TRUE)

tm_corpus2df(tm.corpus, col1 = "docs", col2 = "text")

tm_corpus2wfm(tm.corpus, col1 = "docs", col2 = "text", ...)

df2tm_corpus(text.var, grouping.var = NULL, ...)

apply_as_df(tm.corpus, qdapfun, ...)

Arguments

text.var
The text variable or a wfm object.
grouping.var
The grouping variables. Default NULL generates one word list for all text. Also takes a single grouping variable or a list of 1 or more grouping variables.
...
If tdm or dtm - Other arguments passed to wfm. If apply_as_tm - Other arguments passed to functions used on the tm package's "TermDocumentMatrix". If df2tm_corpus
vowel.check
logical. Should terms without vowels be remove?
wfm.obj
A wfm object.
tmfun
A function applied to a TermDocumentMatrix object.
to.qdap
logical. If TRUE should wfm try to coerce the output back to a qdap object.
tm.corpus
A Corpus object.
col1
Name for column 1 (the vector elements).
col2
Name for column 2 (the names of the vectors).
qdapfun
A qdap function that is usually used on text.variable ~ grouping variable.

Value

  • tdm - Returns a TermDocumentMatrix. dtm - Returns a DocumentTermMatrix. tm2qdap - Returns a wfm object or weight object. apply_as_tm - Applies a tm oriented function to a wfm and attempts to simplify back to a wfm or weight format. tm_corpus2df - Converts a Corpus and returns a qdap oriented dataframe. df2tm_wfm - Converts a qdap oriented dataframe and returns a wfm. df2tm_corpus - Converts a qdap oriented dataframe and returns a Corpus.

Details

Produces output that is identical to the tm package's TermDocumentMatrix, DocumentTermMatrix, Corpus or allows convenient interface between the qdap and tm packages.

See Also

DocumentTermMatrix, Corpus, TermDocumentMatrix

Examples

Run this code
dtm(DATA$state, DATA$person)
tdm(DATA$state, DATA$person)

x <- wfm(DATA$state, DATA$person)
tdm(x)
dtm(x)
library(tm)
plot(tdm(x))

pres <- tdm(pres_debates2012$dialogue, pres_debates2012$person)
plot(pres, corThreshold = 0.8)
pres
(pres2 <- removeSparseTerms(pres, .3))
plot(pres2, corThreshold = 0.95)

## Latent Semantic Analysis
library(lsa)
lsa(tdm(x), dims=dimcalc_share())
lsa(tdm(DATA$state, DATA$person), dims=dimcalc_share())

shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
    pres_debates2012)[,1]) < 4]

SW <- c(shorts, qdapDictionaries::contractions[, 1],
    qdapDictionaries::Top200Words,
    "governor", "president", "mister", "obama","romney")

DocTermMat2 <- with(pres_debates2012, dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,]

out <- lsa(DocTermMat2, 6)
out$tk
out2 <- colsplit2df(matrix2df(out$tk), new.names = qcv(Person, Time))
out2$Person <- factor(out2$Person,
    levels = names(sort(colSums(with(pres_debates2012,
        wfm(dialogue, person, stopwords = SW))), TRUE))
)
colnames(out2) <- gsub("X", "Topic ", colnames(out2))

qheat(out2, facet.vars = "Time", high="darkgreen", plot=FALSE) +
    theme(legend.title=element_blank()) +
    guides(fill = guide_colorbar(barwidth = .5, barheight = 12))

## Correspondence Analysis
library(ca)

dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]

speech <- stemmer(dat$dialogue)
mytable1 <- with(dat, tdm(speech, list(person, time), stopwords = Top25Words))

fit <- ca(mytable1)
summary(fit)
plot(fit)
plot3d.ca(fit, labels=1)


mytable2 <- with(dat, tdm(speech, list(person, time), stopwords = Top200Words))

fit2 <- ca(mytable2)
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels=1)

## Topic Models
# Example 1 #
library(topicmodels); library(tm)

# Generate stop words based on short words, frequent words and contractions
shorts <- all_words(pres_debates2012)[,1][nchar(all_words(
    pres_debates2012)[,1]) < 4]

SW <- c(shorts, qdapDictionaries::contractions[, 1],
    qdapDictionaries::Top200Words,
    "governor", "president", "mister", "obama","romney")

DocTermMat <- with(pres_debates2012, dtm(dialogue, person, stopwords = SW))
DocTermMat <- removeSparseTerms(DocTermMat,0.999)
DocTermMat <- DocTermMat[rowSums(as.matrix(DocTermMat))> 0,]

lda.model <- LDA(DocTermMat, 5)

(topics <- posterior(lda.model, DocTermMat)$topics)
terms(lda.model,20)

# Plot the Topics Per Person
topic.dat <- matrix2df(topics, "Person")
colnames(topic.dat)[-1] <- paste2(t(terms(lda.model,20)), sep=", ")

library(reshape2)
mtopic <- melt(topic.dat, variable="Topic", value.name="Proportion")
ggplot(mtopic, aes(weight=Proportion, x=Topic, fill=Topic)) +
    geom_bar() +
    coord_flip() +
    facet_grid(Person~.) +
    guides(fill=FALSE)

# Example 2 #
DocTermMat2 <- with(pres_debates2012, dtm(dialogue, list(person, time), stopwords = SW))
DocTermMat2 <- removeSparseTerms(DocTermMat2,0.95)
DocTermMat2 <- DocTermMat2[rowSums(as.matrix(DocTermMat2))> 0,]

lda.model2 <- LDA(DocTermMat2, 6)

(topics2 <- posterior(lda.model2, DocTermMat2)$topics)
terms(lda.model2,20)
qheat(topics2, high="blue", low="yellow", by.col=FALSE)

# Example 3 #
lda.model3 <- LDA(DocTermMat2, 10)

(topics3 <- posterior(lda.model3, DocTermMat2)$topics)
terms(lda.model3, 20)
qheat(topics3, high="blue", low="yellow", by.col=FALSE)

# Plot the Topics Per Person
topic.dat3 <- matrix2df(topics3, "Person&Time")
colnames(topic.dat3)[-1] <- paste2(t(terms(lda.model3, 10)), sep=", ")
topic.dat3 <- colsplit2df(topic.dat3)

library(reshape2)
library(scales)
mtopic3 <- melt(topic.dat3, variable="Topic", value.name="Proportion")
(p1 <- ggplot(mtopic3, aes(weight=Proportion, x=Topic, fill=Topic)) +
    geom_bar() +
    coord_flip() +
    facet_grid(Person~Time) +
    guides(fill=FALSE) +
    scale_y_continuous(labels = percent) +
    theme(plot.margin = unit(c(1, 0, 0.5, .5), "lines")) +
    ylab("Proportion"))

mtopic3.b <- mtopic3
mtopic3.b[, "Topic"] <- factor(as.numeric(mtopic3.b[, "Topic"]), levels = 1:10)
mtopic3.b[, "Time"] <- factor(gsub("time ", "", mtopic3.b[, "Time"]))

p2 <- ggplot(mtopic3.b, aes(x=Time, y=Topic, fill=Proportion)) +
    geom_tile(color = "white") +
    scale_fill_gradient(low = "grey70", high = "red") +
    facet_grid(Person~Time, scales = "free") +
    theme(axis.title.y = element_blank(),
        axis.text.x= element_text(colour="white"),
        axis.ticks.x= element_line(colour="white"),
        axis.ticks.y = element_blank(),
        axis.text.y= element_blank(),
        plot.margin = unit(c(1, -.5, .5, -.9), "lines")
)

library(gridExtra)
grid.arrange(p1, p2, nrow=1, widths = c(.85, .15))

## tm Matrices to wfm
library(tm)
data(crude)

## A Term Document Matrix Conversion
(tm_in <- TermDocumentMatrix(crude, control = list(stopwords = TRUE)))
converted <- tm2qdap(tm_in)
head(converted)
summary(converted)

## A Document Term Matrix Conversion
(dtm_in <- DocumentTermMatrix(crude, control = list(stopwords = TRUE)))
summary(tm2qdap(dtm_in))

## `apply_as_tm` Examples
## Create a wfm
a <- with(DATA, wfm(state, list(sex, adult)))
summary(a)

## Apply functions meant for a tm TermDocumentMatrix
out <- apply_as_tm(a, tm:::removeSparseTerms, sparse=0.6)
summary(out)

apply_as_tm(a, tm:::dissimilarity, method = "cosine")
apply_as_tm(a, tm:::findAssocs, "computer", .8)
apply_as_tm(a, tm:::findFreqTerms, 2, 3)
apply_as_tm(a, tm:::Zipf_plot)
apply_as_tm(a, tm:::Heaps_plot)
apply_as_tm(a, tm:::plot.TermDocumentMatrix, corThreshold = 0.4)

library(proxy)
apply_as_tm(a, tm:::weightBin)
apply_as_tm(a, tm:::weightBin, to.qdap = FALSE)
apply_as_tm(a, tm:::weightSMART)
apply_as_tm(a, tm:::weightTfIdf)

## Convert tm Corpus to Dataframe
## A tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
    readerControl = list(reader = readReut21578XML))

## Convert to dataframe
corp_df <- tm_corpus2df(reuters)
htruncdf(corp_df)

## Apply a qdap function
out <- formality(corp_df$text, corp_df$docs)
plot(out)

## Convert a qdap dataframe to tm package Corpus
(x <- with(DATA2, df2tm_corpus(state, list(person, class, day))))
library(tm)
inspect(x)
class(x)

(y <- with(pres_debates2012, df2tm_corpus(dialogue, list(person, time))))

## Apply qdap functions meant for dataframes from sentSplit to tm Corpus
library(tm)
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- Corpus(DirSource(reut21578),
    readerControl = list(reader = readReut21578XML))

apply_as_df(reuters, word_stats)
apply_as_df(reuters, formality)
apply_as_df(reuters, word_list)
apply_as_df(reuters, polarity)
apply_as_df(reuters, Dissimilarity)
apply_as_df(reuters, diversity)
apply_as_df(reuters, pos_by)
apply_as_df(reuters, flesch_kincaid)
apply_as_df(reuters, trans_venn)
apply_as_df(reuters, gantt_plot)
apply_as_df(reuters, rank_freq_mplot)
apply_as_df(reuters, termco,
    match.list = list(
        oil = qcv(oil, Texas, crude),
        money = c("economic", "money")
    ))
plot(apply_as_df(reuters, termco,
    match.list = list(
        oil = qcv(oil, Texas, crude),
        money = c("economic", "money")
    ), elim.old = FALSE), values = TRUE, high="red")
apply_as_df(reuters, word_cor,
    word = qcv(oil, Texas, crude, economic, money)
)
plot(apply_as_df(reuters, word_cor,
    word = qcv(oil, Texas, crude, economic, money)
))

Run the code above in your browser using DataLab