if (require(tokenizers.bpe) && require(word2vec)) {
library(doc2vec)
library(tokenizers.bpe)
## Take data and standardise it a bit
data(belgium_parliament, package = "tokenizers.bpe")
str(belgium_parliament)
x <- subset(belgium_parliament, language %in% "french")
x$text <- tolower(x$text)
x$text <- gsub("[^[:alpha:]]", " ", x$text)
x$text <- gsub("[[:space:]]+", " ", x$text)
x$text <- trimws(x$text)
x$nwords <- txt_count_words(x$text)
x <- subset(x, nwords < 1000 & nchar(text) > 0)
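## Minimal illustration (assumed, not part of the original example) of the cleaning above:
## lowercase, strip non-letters, collapse whitespace, trim
trimws(gsub("[[:space:]]+", " ", gsub("[^[:alpha:]]", " ", tolower(" The King, 2021!! "))))
# -> "the king"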
## Build the model: first a quick PV-DM model with a small embedding dimension
model <- paragraph2vec(x = x, type = "PV-DM", dim = 15, iter = 5)
# \donttest{
## A larger PV-DBOW model (overwrites the model above, takes longer to train)
model <- paragraph2vec(x = x, type = "PV-DBOW", dim = 100, iter = 20)
# }
str(model)
## Get the embedding of the words and of the documents
embedding <- as.matrix(model, which = "words")
embedding <- as.matrix(model, which = "docs")
head(embedding)
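## Hedged add-on (not in the original example), assuming paragraph2vec_similarity()
## accepts two embedding matrices: the 5 documents closest to the first document
paragraph2vec_similarity(embedding[1, , drop = FALSE], embedding, top_n = 5)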
## Get the vocabulary (document identifiers and words)
vocab <- summary(model, type = "vocabulary", which = "docs")
vocab <- summary(model, type = "vocabulary", which = "words")
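## Hedged add-on, assuming predict.paragraph2vec supports type = "nearest" with
## which = "word2word": words closest to the first word in the vocabulary
predict(model, newdata = vocab[1], type = "nearest", which = "word2word", top_n = 5)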
# \donttest{
## Transfer learning using existing word embeddings
library(word2vec)
w2v <- word2vec(x$text, dim = 50, type = "cbow", iter = 20, min_count = 5)
emb <- as.matrix(w2v)
model <- paragraph2vec(x = x, dim = 50, type = "PV-DM", iter = 20, min_count = 5,
                       embeddings = emb)
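## Hedged sanity checks (assumptions, not from the original example): the pretrained
## dimension should equal `dim`, and the pretrained vocabulary should cover the model words
ncol(emb)
sum(summary(model, type = "vocabulary", which = "words") %in% rownames(emb))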
# }
## Transfer learning - proof of concept without training (iter = 0; increase iter to actually learn)
## Supply random 15-dimensional starting vectors for the words 'en' and 'met'
emb <- matrix(rnorm(30), nrow = 2, dimnames = list(c("en", "met")))
model <- paragraph2vec(x = x, type = "PV-DM", dim = 15, iter = 0, embeddings = emb)
embedding <- as.matrix(model, which = "words", normalize = FALSE)
embedding[c("en", "met"), ]
emb
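## Quick check (assumption: with iter = 0 the supplied vectors are passed through
## unchanged, up to floating point storage precision)
all.equal(embedding[c("en", "met"), ], emb, check.attributes = FALSE, tolerance = 1e-5)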
} # End of main if statement running only if the required packages are installed