library(torch)
library(topicmodels.etm)
library(word2vec)
library(udpipe)
data(brussels_reviews_anno, package = "udpipe")
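
## optional: peek at the annotated data, a data.frame of tokens containing
## among others the doc_id, language, lemma and upos fields used below
str(brussels_reviews_anno)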
##
## Toy example with pretrained embeddings
##
## a. build word2vec model
x <- subset(brussels_reviews_anno, language %in% "nl")
## collapse the lemmas of each document into one string per doc_id
x <- paste.data.frame(x, term = "lemma", group = "doc_id")
set.seed(4321)
w2v <- word2vec(x = x$lemma, dim = 15, iter = 20, type = "cbow", min_count = 5)
embeddings <- as.matrix(w2v)
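
## optional sanity check, not part of the original example: look up the
## nearest terms of an in-vocabulary lemma with the word2vec predict method
## ("appartement" is just an illustrative choice of lemma)
predict(w2v, newdata = "appartement", type = "nearest", top_n = 5)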
## b. build document term matrix on nouns + adjectives, align with the embedding terms
dtm <- subset(brussels_reviews_anno, language %in% "nl" & upos %in% c("NOUN", "ADJ"))
dtm <- document_term_frequencies(dtm, document = "doc_id", term = "lemma")
dtm <- document_term_matrix(dtm)
## keep only terms for which a pretrained embedding exists and drop documents which become empty
dtm <- dtm_conform(dtm, columns = rownames(embeddings))
dtm <- dtm[dtm_rowsums(dtm) > 0, ]
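
## optional sanity check, not part of the original example: after dtm_conform
## every term left in the dtm should have a pretrained embedding
dim(dtm)
stopifnot(all(colnames(dtm) %in% rownames(embeddings)))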
## create and fit an embedding topic model - 8 topics, theta 100-dimensional
if (torch::torch_is_installed()) {
set.seed(4321)
torch_manual_seed(4321)
model <- ETM(k = 8, dim = 100, embeddings = embeddings, dropout = 0.5)
optimizer <- optim_adam(params = model$parameters, lr = 0.005, weight_decay = 0.0000012)
overview <- model$fit(data = dtm, optimizer = optimizer, epoch = 40, batch_size = 1000)
scores <- predict(model, dtm, type = "topics")
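
## scores is assumed to contain one row per document with the topic
## probabilities; this optional check (not in the original example) assigns
## each document to its most likely topic
topic_assignment <- apply(scores, MARGIN = 1, FUN = which.max)
table(topic_assignment)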
## plot the evolution of the training loss (last batch of each epoch) and the test loss
lastbatch <- subset(overview$loss, batch_is_last == TRUE)
plot(lastbatch$epoch, lastbatch$loss, xlab = "epoch", ylab = "loss")
plot(overview$loss_test, ylab = "test loss")
## show top words in each topic
terminology <- predict(model, type = "terms", top_n = 7)
terminology
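
## optional inspection, not part of the original example: assuming the
## package's as.matrix method for ETM objects works as documented, the
## embeddings of the 8 topics can be extracted as a matrix
topic_embeddings <- as.matrix(model, type = "embedding", which = "topics")
dim(topic_embeddings)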
##
## Toy example without pretrained word embeddings
##
set.seed(4321)
torch_manual_seed(4321)
## passing an integer as embeddings lets ETM learn 15-dimensional word embeddings itself
model <- ETM(k = 8, dim = 100, embeddings = 15, dropout = 0.5, vocab = colnames(dtm))
optimizer <- optim_adam(params = model$parameters, lr = 0.005, weight_decay = 0.0000012)
overview <- model$fit(data = dtm, optimizer = optimizer, epoch = 40, batch_size = 1000)
terminology <- predict(model, type = "terms", top_n = 7)
terminology
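
## as no pretrained embeddings were given, the model has learned 15-dimensional
## word embeddings itself; assuming as.matrix.ETM works as documented, they can
## be extracted and compared with word2vec_similarity from the word2vec package
word_embeddings <- as.matrix(model, type = "embedding", which = "words")
word2vec_similarity(word_embeddings, word_embeddings, top_n = 3)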
}