# NOT RUN {
library(tokenizers.bpe)
## Take data and standardise it a bit
data(belgium_parliament, package = "tokenizers.bpe")
str(belgium_parliament)
x <- subset(belgium_parliament, language %in% "french")
x$text <- tolower(x$text)
x$text <- gsub("[^[:alpha:]]", " ", x$text)
x$text <- gsub("[[:space:]]+", " ", x$text)
x$text <- trimws(x$text)
x$nwords <- txt_count_words(x$text)
x <- subset(x, nwords < 1000 & nchar(text) > 0)
## Build the model
model <- paragraph2vec(x = x, type = "PV-DM", dim = 15, iter = 5)
# }
# NOT RUN {
model <- paragraph2vec(x = x, type = "PV-DBOW", dim = 100, iter = 20)
# }
# NOT RUN {
str(model)
embedding <- as.matrix(model, which = "words")
embedding <- as.matrix(model, which = "docs")
head(embedding)
## Get vocabulary
vocab <- summary(model, type = "vocabulary", which = "docs")
vocab <- summary(model, type = "vocabulary", which = "words")
# }
Run the code above in your browser using DataLab