library(torch)
library(topicmodels.etm)
library(word2vec)
library(udpipe)
data(brussels_reviews_anno, package = "udpipe")
##
## Toy example with pretrained embeddings
##
## a. build word2vec model
x <- subset(brussels_reviews_anno, language %in% "nl")
x <- paste.data.frame(x, term = "lemma", group = "doc_id")
set.seed(4321)
w2v <- word2vec(x = x$lemma, dim = 15, iter = 20, type = "cbow", min_count = 5)
embeddings <- as.matrix(w2v)
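## Optional check, not part of the original example: inspect the embedding
## matrix and look up nearest neighbours of a lemma ("appartement" is just an
## illustration, pick any term that occurs in the vocabulary)
dim(embeddings)
predict(w2v, newdata = "appartement", type = "nearest", top_n = 5)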
## b. build a document-term matrix on nouns + adjectives, aligned with the embedding vocabulary
dtm <- subset(brussels_reviews_anno, language %in% "nl" & upos %in% c("NOUN", "ADJ"))
dtm <- document_term_frequencies(dtm, document = "doc_id", term = "lemma")
dtm <- document_term_matrix(dtm)
dtm <- dtm_conform(dtm, columns = rownames(embeddings))
dtm <- dtm[dtm_rowsums(dtm) > 0, ]
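## Optional sanity checks, not part of the original example: every dtm column
## should be covered by the embeddings and no document may be empty
dim(dtm)
stopifnot(all(colnames(dtm) %in% rownames(embeddings)))
stopifnot(all(dtm_rowsums(dtm) > 0))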
## c. create and fit an embedding topic model: 8 topics, theta 100-dimensional
if (torch::torch_is_installed()) {
set.seed(4321)
torch_manual_seed(4321)
model <- ETM(k = 8, dim = 100, embeddings = embeddings, dropout = 0.5)
optimizer <- optim_adam(params = model$parameters, lr = 0.005, weight_decay = 0.0000012)
epochs <- 40
overview <- model$fit(data = dtm, optimizer = optimizer, epoch = epochs, batch_size = 1000)
scores <- predict(model, dtm, type = "topics")
lastbatch <- subset(overview$loss, overview$loss$batch_is_last == TRUE)
plot(lastbatch$epoch, lastbatch$loss)
plot(overview$loss_test)
## show top words in each topic
terminology <- predict(model, type = "terms", top_n = 7)
terminology
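## Optional, not part of the original example: scores holds the document-topic
## mixture theta; assuming a documents x topics matrix of proportions, the
## dominant topic per document can be tabulated as follows
topics_per_doc <- apply(scores, MARGIN = 1, FUN = which.max)
table(topics_per_doc)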
##
## Toy example without pretrained word embeddings
##
set.seed(4321)
torch_manual_seed(4321)
model <- ETM(k = 8, dim = 100, embeddings = 15, dropout = 0.5, vocab = colnames(dtm))
optimizer <- optim_adam(params = model$parameters, lr = 0.005, weight_decay = 0.0000012)
epochs <- 40
overview <- model$fit(data = dtm, optimizer = optimizer, epoch = epochs, batch_size = 1000)
terminology <- predict(model, type = "terms", top_n = 7)
terminology
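## Optional, not part of the original example: without pretrained embeddings,
## ETM learns the 15-dimensional word embeddings itself; they can be extracted
## with as.matrix, as also shown in the fit_original example below
learned_embeddings <- as.matrix(model, type = "embedding", which = "words")
dim(learned_embeddings)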
##
## Another example, using fit_original on the ng20 (20 newsgroups) data
##
data(ng20, package = "topicmodels.etm")
vocab <- ng20$vocab
tokens <- ng20$bow_tr$tokens
counts <- ng20$bow_tr$counts
torch_manual_seed(123456789)
model <- ETM(k = 4, vocab = vocab, dim = 5, embeddings = 25)
model
optimizer <- optim_adam(params = model$parameters, lr = 0.005, weight_decay = 0.0000012)
traindata <- list(tokens = tokens, counts = counts, vocab = vocab)
test1 <- list(tokens = ng20$bow_ts_h1$tokens, counts = ng20$bow_ts_h1$counts, vocab = vocab)
test2 <- list(tokens = ng20$bow_ts_h2$tokens, counts = ng20$bow_ts_h2$counts, vocab = vocab)
out <- model$fit_original(data = traindata, test1 = test1, test2 = test2, epoch = 4,
                          optimizer = optimizer, batch_size = 1000,
                          lr_anneal_factor = 4, lr_anneal_nonmono = 10)
test <- subset(out$loss, out$loss$batch_is_last == TRUE)
plot(test$epoch, test$loss)
topic.centers <- as.matrix(model, type = "embedding", which = "topics")
word.embeddings <- as.matrix(model, type = "embedding", which = "words")
topic.terminology <- as.matrix(model, type = "beta")
terminology <- predict(model, type = "terms", top_n = 4)
terminology
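## Optional shape checks, not part of the original example: topic centers live
## in the same embedding space as the words, beta maps topics to the vocabulary
dim(topic.centers)
dim(word.embeddings)
dim(topic.terminology)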
}