library(doc2vec)
library(tokenizers.bpe)
## Take the Dutch texts and standardise them a bit
data(belgium_parliament, package = "tokenizers.bpe")
x <- subset(belgium_parliament, language %in% "dutch")
x <- subset(x, nchar(text) > 0 & txt_count_words(text) < 1000)
x$doc_id <- sprintf("doc_%s", 1:nrow(x))
x$text   <- tolower(x$text)
x$text   <- gsub("[^[:alpha:]]", " ", x$text)    # keep letters only
x$text   <- gsub("[[:space:]]+", " ", x$text)    # collapse repeated whitespace
x$text   <- trimws(x$text)
## Build a small PV-DM model (distributed memory), kept small for speed
model <- paragraph2vec(x = x, type = "PV-DM", dim = 15, iter = 5)
model <- paragraph2vec(x = x, type = "PV-DBOW", dim = 100, iter = 20)
sentences <- list(
  example = c("geld", "diabetes"),
  hi      = c("geld", "diabetes", "koning"),
  test    = c("geld"),
  nothing = character(),
  repr    = c("geld", "diabetes", "koning"))
## Get embeddings (type = 'embedding')
predict(model, newdata = c("geld", "koning", "unknownword", NA, "</s>", ""),
type = "embedding", which = "words")
predict(model, newdata = c("doc_1", "doc_10", "unknowndoc", NA, "</s>"),
type = "embedding", which = "docs")
predict(model, sentences, type = "embedding")
## Get most similar items (type = 'nearest')
predict(model, newdata = c("doc_1", "doc_10"), type = "nearest", which = "doc2doc")
predict(model, newdata = c("geld", "koning"), type = "nearest", which = "word2doc")
predict(model, newdata = c("geld", "koning"), type = "nearest", which = "word2word")
predict(model, newdata = sentences, type = "nearest", which = "sent2doc", top_n = 7)
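## Sketch: 'nearest' queries return one ranked result per input, with
## similarity scores; the exact column layout is assumed here, not guaranteed
nn <- predict(model, newdata = "geld", type = "nearest", which = "word2word", top_n = 5)
str(nn)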
## Alternatively, extract the embeddings yourself and compute the similarities
emb      <- predict(model, sentences, type = "embedding")
emb_docs <- as.matrix(model, which = "docs")
paragraph2vec_similarity(emb, emb_docs, top_n = 3)
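## Sketch: paragraph2vec_similarity works on any two embedding matrices,
## e.g. comparing the sentences above against each other; rows with NA
## embeddings (the empty 'nothing' sentence) will yield NA similarities
paragraph2vec_similarity(emb, emb, top_n = 2)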