pos
- Apply part of speech tagger to transcript(s).
pos_by
- Apply part of speech tagger to transcript(s) by zero or more
grouping variable(s).
pos_tags
- Useful for interpreting the parts of speech tags created by
pos and pos_by.
pos(text.var, parallel = FALSE, cores = detectCores()/2,
progress.bar = TRUE, na.omit = FALSE, digits = 1, percent = TRUE,
zero.replace = 0, gc.rate = 10)
pos_by(text.var, grouping.var = NULL, digits = 1, percent = TRUE,
zero.replace = 0, ...)
pos_tags(type = "pretty")
parallel
- If TRUE attempts to run the function on multiple cores. Note that this may
not mean a speed boost if you have one core or if the data set is smaller, as
the cluster takes time to create.
cores
- The number of cores to use if parallel = TRUE. Default is half the number
of available cores.
progress.bar
- If TRUE attempts to provide an OS appropriate progress bar. If parallel is
TRUE this argument is ignored. Note that setting this argument to TRUE may
slow down the function.
na.omit
- If TRUE missing values (NA) will be omitted.
percent
- If TRUE output is given as percent. If FALSE the output is proportion.
gc.rate
- [...] the function that pos wraps. Consider adjusting this argument upward
if the error java.lang.OutOfMemoryError occurs.
grouping.var
- Default NULL generates one word list for all text. Also takes a single
grouping variable or a list of 1 or more grouping variables.
type
- One of "pretty" (a left justified version of the output optimized for
viewing but not good for export), "matrix" (a matrix version of the
output), [...]
...
- Other arguments supplied to pos.
pos
- returns a list of 4: [...]
pos_by
- returns a list of 6: [...]
See also: Maxent_POS_Tag_Annotator, colcomb2class
## Tag the example transcript and inspect the resulting object
posdat <- pos(DATA$state)
ltruncdf(posdat, 7, 4)
## str(posdat)
names(posdat)
posdat$text #original text
## Methods
preprocessed(posdat) #words replaced with parts of speech
counts(posdat) #frequency of parts of speech by row
proportions(posdat) #proportion of parts of speech by row
## Methods Plotting
plot(preprocessed(posdat))
plot(counts(posdat))
plot(proportions(posdat))
plot(posdat)
## Same tagging, but asking pos to attempt a multi-core run
out1 <- pos(DATA$state, parallel = TRUE) # not always useful
ltruncdf(out1, 7, 4)
#use pos_tags to interpret part of speech tags used by pos & pos_by
## Each call below requests a different output type; the default is "pretty"
pos_tags()[1:10, ]
pos_tags("matrix")[1:10, ]
pos_tags("dataframe")[1:10, ]
pos_tags("df")[1:10, ]
ltruncdf(pos_tags("all"), 3)
## Part of speech tagging by a single grouping variable
posbydat <- with(DATA, pos_by(state, sex))
names(posbydat)
## Methods
scores(posbydat)
preprocessed(posbydat)
counts(posbydat)
proportions(posbydat)
## Methods Plotting
plot(preprocessed(posbydat))
plot(counts(posbydat))
plot(proportions(posbydat))
plot(posbydat)
ltruncdf(posbydat, 7, 4)
truncdf(posbydat$pos.by.prop, 4)
## Part of speech tagging by a list of grouping variables
POSby <- with(DATA, pos_by(state, list(adult, sex)))
plot(POSby, values = TRUE, digits = 2)
#or more quickly - reuse the output from before
## (the already tagged pos_by object is passed in place of the raw text)
out2 <- with(DATA, pos_by(posbydat, list(adult, sex)))
## Definite/Indefinite Noun
## 2 approaches compared...
## The latter is more efficient but less accurate
## ------------------------##
## Part of speech tagging  ##
## ------------------------##
## Find, for every tagged text element, the first word carrying one of the
## requested part of speech tags that occurs after each target word.
##
## text.var  An object whose text.var[["POStagged"]][["POStagged"]] element
##           holds whitespace separated "word/TAG" tokens, one string per
##           text element.
## words     Character vector of target words to look for (matched exactly).
## pos       Character vector of part of speech tags to search for after each
##           target word (note: this argument shadows the pos() function
##           inside the body).
##
## Returns a list with one element per text element: a character vector of
## the words found (duplicates arising from the same position are reported
## once), NA_character_ if none of `words` occurs in the element, and an NA
## entry for a target word that has no following word with a tag in `pos`.
pos_after <- function(text.var, words, pos){
    tagged <- as.character(text.var[["POStagged"]][["POStagged"]])
    namespos <- lapply(strsplit(tagged, "\\s+"), function(tokens) {
        ## Keep only well formed word/TAG tokens; empty strings produced by
        ## leading whitespace (and NA input) carry no tag and are dropped.
        tokens <- tokens[grepl("/", tokens, fixed = TRUE)]
        ## Split on the LAST slash only, so that words which themselves
        ## contain a slash (e.g. "1/2/CD") do not misalign word/tag pairs.
        setNames(sub("/[^/]*$", "", tokens), sub("^.*/", "", tokens))
    })
    lapply(namespos, function(x, thewords = words, thepos = pos){
        locs <- which(x %in% thewords)
        if (length(locs) == 0L) return(NA_character_)
        ## which() returns ascending positions, so the first position beyond
        ## a target word is the nearest following match.
        nounlocs <- which(names(x) %in% thepos)
        nextloc <- vapply(locs, function(i) {
            following <- nounlocs[nounlocs > i]
            ## No tagged word after the target: report NA instead of calling
            ## min() on an empty vector (which warns and returns Inf).
            if (length(following) == 0L) NA_integer_ else following[1L]
        }, integer(1))
        unname(x[unique(nextloc)])
    })
}
## For each article group ("a"/"an" and "the"), collect the nouns that follow
## the article according to the part of speech tags and keep the nouns that
## occur more than 3 times.
## NOTE(review): rajPOS is not defined in this script; presumably a tagged
## data set supplied by the package - confirm.
## NOTE(review): on recent R versions data.frame() of a 1-D table may yield
## two columns (freq.Var1/freq.Freq) rather than a single freq column -
## verify that m$freq still resolves.
out2 <- setNames(lapply(list(a=c("a", "an"), the="the"), function(x) {
    o <- pos_after(rajPOS, x, c("NN", "NNS", "NNP", "NNPS"))
    m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word")
    m[m$freq> 3, ]
}), c("a", "the"))
## Full outer join of the two frequency tables on the noun, one frequency
## column per article.
dat2 <- setNames(Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)}, out2), c("Word", "A", "THE"))
## Long format: one row per Word/Article combination.
dat2 <- reshape2::melt(dat2, id="Word", variable.name="Article", value.name="freq")
dat2 <- dat2[order(dat2$freq, dat2$Word), ]
## Order the Word factor levels by total frequency across both articles so
## the plot axis is sorted.
ord2 <- aggregate(freq ~ Word, dat2, sum)
dat2$Word <- factor(dat2$Word, levels=ord2[order(ord2[[2]]), 1])
rownames(dat2) <- NULL
## ggplot2 functions are namespace qualified because library(ggplot2) is only
## attached further down in this script.
ggplot2::ggplot(dat2, ggplot2::aes(x=freq, y=Word)) +
    ggplot2::geom_point() + ggplot2::facet_grid(~Article) +
    ggplot2::ggtitle("Part Of Speech Parsing Approach")
## Open a new graphics device before the next plot is drawn.
dev.new()
## --------------------##
## Regular Expressions ##
## --------------------##
library(qdapRegex);library(ggplot2);library(reshape2)
## For each article pattern, extract the matches from the lower-cased
## dialogue and keep the words that occur more than 3 times.
## NOTE(review): "@after_a" / "@after_the" are presumably named patterns from
## the qdapRegex regular expression dictionary, and raj a data set supplied
## by the package - confirm.
## NOTE(review): on recent R versions data.frame() of a 1-D table may yield
## two columns (freq.Var1/freq.Freq) rather than a single freq column -
## verify that m$freq still resolves.
out <- setNames(lapply(c("@after_a", "@after_the"), function(x) {
    ## stri_trans_tolower is exported from stringi, so `::` is used rather
    ## than the internal-access operator `:::`.
    o <- rm_default(stringi::stri_trans_tolower(raj$dialogue),
        pattern = x, extract=TRUE)
    m <- qdapTools::matrix2df(data.frame(freq=sort(table(unlist(o)), TRUE)), "word")
    m[m$freq> 3, ]
}), c("a", "the"))
## Full outer join of the two frequency tables on the word, one frequency
## column per article.
dat <- setNames(Reduce(function(x, y) {
    merge(x, y, by = "word", all = TRUE)}, out), c("Word", "A", "THE"))
## Long format: one row per Word/Article combination.
dat <- reshape2::melt(dat, id="Word", variable.name="Article", value.name="freq")
dat <- dat[order(dat$freq, dat$Word), ]
## Order the Word factor levels by total frequency across both articles so
## the plot axis is sorted.
ord <- aggregate(freq ~ Word, dat, sum)
dat$Word <- factor(dat$Word, levels=ord[order(ord[[2]]), 1])
rownames(dat) <- NULL
ggplot(dat, aes(x=freq, y=Word)) +
    geom_point()+ facet_grid(~Article) +
    ggtitle("Regex Approach")
Run the code above in your browser using DataLab