
topicmodels.etm (version 0.1.1)

ETM: Topic Modelling in Semantic Embedding Spaces

Description

ETM is a generative topic model combining traditional topic models (LDA) with word embeddings (word2vec).

  • It models each word with a categorical distribution whose natural parameter is the inner product between a word embedding and an embedding of its assigned topic (see the sketch after this list).

  • The model is fitted using an amortized variational inference algorithm on top of libtorch.
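
As a concrete illustration of the first bullet, the following toy sketch (plain R, not using the package API) shows how a topic's distribution over the vocabulary arises as the softmax of the inner products between the word embeddings and a topic embedding; the matrices below are made up purely for illustration.

## toy word embeddings: V words x D dimensions, words as row names
rho    <- matrix(rnorm(5 * 3), nrow = 5,
                 dimnames = list(c("hotel", "kamer", "bed", "ontbijt", "locatie"), NULL))
## one toy topic embedding of dimension D
alpha  <- rnorm(3)
## probability of each word under this topic: softmax of the inner products
logits <- as.vector(rho %*% alpha)
beta   <- exp(logits) / sum(exp(logits))
round(beta, 3)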

Usage

ETM(
  k = 20,
  embeddings,
  dim = 800,
  activation = c("relu", "tanh", "softplus", "rrelu", "leakyrelu", "elu", "selu", "glu"),
  dropout = 0.5,
  vocab = rownames(embeddings)
)

Value

an object of class ETM which is a torch nn_module containing, among others, the following fields (see the sketch after this list):

  • num_topics: the number of topics

  • vocab: character vector with the terminology used in the model

  • vocab_size: the number of words in vocab

  • rho: The word embeddings

  • alphas: The topic embeddings
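
A minimal sketch of inspecting these fields on a freshly created (not yet fitted) model; the vocabulary below is a toy example and torch needs to be installed.

if (torch::torch_is_installed()) {
  library(topicmodels.etm)
  model <- ETM(k = 4, embeddings = 25, vocab = c("hotel", "kamer", "bed", "ontbijt"))
  model$num_topics    ## number of topics: 4
  model$vocab         ## the toy vocabulary
  model$vocab_size    ## number of words in the vocabulary
  model$rho           ## the word embeddings
  model$alphas        ## the topic embeddings
}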

Arguments

k

the number of topics to extract

embeddings

either a matrix with pretrained word embeddings (one row per word, with the words as row names) or an integer giving the dimension of the word embeddings, in which case the embeddings are learned while fitting the model. Defaults to 50 if not provided. See the sketch at the end of this section.

dim

dimension of the variational inference hyperparameter theta (passed on to nn_linear). Defaults to 800.

activation

character string with the activation function of theta. Either one of 'relu', 'tanh', 'softplus', 'rrelu', 'leakyrelu', 'elu', 'selu', 'glu'. Defaults to 'relu'.

dropout

dropout percentage on the variational distribution for theta (passed on to nn_dropout). Defaults to 0.5.

vocab

a character vector with the words from the vocabulary. Defaults to the rownames of the embeddings argument.
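
The two ways of passing the embeddings argument are sketched below; the embedding matrix and vocabulary are hypothetical and the snippet only constructs the models, no fitting is done.

if (torch::torch_is_installed()) {
  library(topicmodels.etm)
  words <- c("hotel", "kamer", "bed", "ontbijt")
  emb   <- matrix(rnorm(length(words) * 10), nrow = length(words),
                  dimnames = list(words, NULL))
  ## 1. pretrained word embeddings: vocab defaults to rownames(emb)
  m1 <- ETM(k = 2, embeddings = emb, dim = 100)
  ## 2. only a dimension: the embeddings are learned, vocab must be supplied
  m2 <- ETM(k = 2, embeddings = 10, dim = 100, vocab = words)
}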

Methods

fit(data, optimizer, epoch, batch_size, normalize = TRUE, clip = 0, lr_anneal_factor = 4, lr_anneal_nonmono = 10)

Fit the model on a document term matrix by splitting the data into a 70/30 training/test set and updating the model weights (see the sketch after the argument list).

Arguments

data

bag of words document term matrix in dgCMatrix format

optimizer

object of class torch_Optimizer

epoch

integer with the number of iterations to train

batch_size

integer with the size of the batch

normalize

logical indicating whether to normalize the bag of words data

clip

number between 0 and 1 used for gradient clipping; passed on to nn_utils_clip_grad_norm_

lr_anneal_factor

divide the learning rate by this factor when the loss on the test set does not decrease for at least lr_anneal_nonmono training iterations

lr_anneal_nonmono

number of iterations after which learning rate annealing is executed if the loss does not decrease
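
A sketch of a typical call to fit, assuming a model and a document term matrix dtm (a dgCMatrix) built as in the Examples section below; the argument values are illustrative only.

if (torch::torch_is_installed()) {
  library(torch)
  optimizer <- optim_adam(params = model$parameters, lr = 0.005)
  overview  <- model$fit(data = dtm, optimizer = optimizer,
                         epoch = 20, batch_size = 1000,
                         normalize = TRUE, clip = 0,
                         lr_anneal_factor = 4, lr_anneal_nonmono = 10)
  ## overview$loss holds the loss per batch (columns epoch, loss, batch_is_last),
  ## overview$loss_test the loss evolution on the 30% test set
  plot(overview$loss_test)
}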

References

Adji B. Dieng, Francisco J. R. Ruiz and David M. Blei (2019). Topic Modeling in Embedding Spaces. https://arxiv.org/pdf/1907.04907.pdf

Examples

library(torch)
library(topicmodels.etm)
library(word2vec)
library(udpipe)
data(brussels_reviews_anno, package = "udpipe")
##
## Toy example with pretrained embeddings
##

## a. build word2vec model
x          <- subset(brussels_reviews_anno, language %in% "nl")
x          <- paste.data.frame(x, term = "lemma", group = "doc_id") 
set.seed(4321)
w2v        <- word2vec(x = x$lemma, dim = 15, iter = 20, type = "cbow", min_count = 5)
embeddings <- as.matrix(w2v)

## b. build document term matrix on nouns + adjectives, align with the embedding terms
dtm <- subset(brussels_reviews_anno, language %in% "nl" & upos %in% c("NOUN", "ADJ"))
dtm <- document_term_frequencies(dtm, document = "doc_id", term = "lemma")
dtm <- document_term_matrix(dtm)
dtm <- dtm_conform(dtm, columns = rownames(embeddings))
dtm <- dtm[dtm_rowsums(dtm) > 0, ]

## create and fit an embedding topic model - 8 topics, theta 100-dimensional
if (torch::torch_is_installed()) {

set.seed(4321)
torch_manual_seed(4321)
model       <- ETM(k = 8, dim = 100, embeddings = embeddings, dropout = 0.5)
optimizer   <- optim_adam(params = model$parameters, lr = 0.005, weight_decay = 0.0000012)
epochs      <- 40
overview    <- model$fit(data = dtm, optimizer = optimizer, epoch = epochs, batch_size = 1000)
scores      <- predict(model, dtm, type = "topics")

lastbatch   <- subset(overview$loss, overview$loss$batch_is_last == TRUE)
plot(lastbatch$epoch, lastbatch$loss)
plot(overview$loss_test)

## show top words in each topic
terminology <- predict(model, type = "terms", top_n = 7)
terminology

##
## Toy example without pretrained word embeddings
##
set.seed(4321)
torch_manual_seed(4321)
model       <- ETM(k = 8, dim = 100, embeddings = 15, dropout = 0.5, vocab = colnames(dtm))
optimizer   <- optim_adam(params = model$parameters, lr = 0.005, weight_decay = 0.0000012)
epochs      <- 40
overview    <- model$fit(data = dtm, optimizer = optimizer, epoch = epochs, batch_size = 1000)
terminology <- predict(model, type = "terms", top_n = 7)
terminology


##
## Another example using fit_original
##
data(ng20, package = "topicmodels.etm")
vocab  <- ng20$vocab
tokens <- ng20$bow_tr$tokens
counts <- ng20$bow_tr$counts

torch_manual_seed(123456789)
model     <- ETM(k = 4, vocab = vocab, dim = 5, embeddings = 25)
model
optimizer <- optim_adam(params = model$parameters, lr = 0.005, weight_decay = 0.0000012)

traindata <- list(tokens = tokens, counts = counts, vocab = vocab)
test1     <- list(tokens = ng20$bow_ts_h1$tokens, counts = ng20$bow_ts_h1$counts, vocab = vocab)
test2     <- list(tokens = ng20$bow_ts_h2$tokens, counts = ng20$bow_ts_h2$counts, vocab = vocab)

out <- model$fit_original(data = traindata, test1 = test1, test2 = test2, epoch = 4, 
                          optimizer = optimizer, batch_size = 1000, 
                          lr_anneal_factor = 4, lr_anneal_nonmono = 10)
test <- subset(out$loss, out$loss$batch_is_last == TRUE)
plot(test$epoch, test$loss)

topic.centers     <- as.matrix(model, type = "embedding", which = "topics")
word.embeddings   <- as.matrix(model, type = "embedding", which = "words")
topic.terminology <- as.matrix(model, type = "beta")

terminology <- predict(model, type = "terms", top_n = 4)
terminology

}
