# \donttest{
# Full top2vec walk-through on the Belgian parliament questions of 2020.
# The whole example is skipped unless every package it calls into can be
# attached. doc2vec is part of the guard (and listed first) because
# paragraph2vec(), top2vec() and the be_parliament_2020 dataset are used below;
# without it the script fails when run outside a session that already has
# doc2vec attached.
if(require(doc2vec) && require(word2vec) && require(uwot) &&
   require(dbscan) && require(udpipe)){
  # require() above already attached these; the explicit library() calls are
  # kept so the dependencies of the example remain visible at a glance.
  library(word2vec)
  library(uwot)
  library(dbscan)

  ## Build a doc_id/text data.frame from the Dutch text column
  data(be_parliament_2020, package = "doc2vec")
  x <- data.frame(doc_id = be_parliament_2020$doc_id,
                  text = be_parliament_2020$text_nl,
                  stringsAsFactors = FALSE)
  x$text <- txt_clean_word2vec(x$text)
  # Keep only documents for which txt_count_words() reports less than 1000
  x <- subset(x, txt_count_words(text) < 1000)

  ## Train the paragraph2vec (doc2vec) model: 50 dimensions, 10 iterations,
  ## single-threaded
  d2v <- paragraph2vec(x, type = "PV-DBOW", dim = 50,
                       lr = 0.05, iter = 10,
                       window = 15, hs = TRUE, negative = 0,
                       sample = 0.00001, min_count = 5,
                       threads = 1)
  # Optionally persist / reload the trained model:
  # write.paragraph2vec(d2v, "d2v.bin")
  # d2v <- read.paragraph2vec("d2v.bin")

  ## Fit top2vec twice to show two ways of configuring the UMAP step.
  ## NOTE: the second call overwrites `model`; only its result is used below.
  # 1) no `umap` argument given, 4 UMAP components
  model <- top2vec(d2v, data = x,
                   control.dbscan = list(minPts = 50),
                   control.umap = list(n_neighbors = 15L, n_components = 4),
                   trace = TRUE)
  # 2) tumap passed explicitly as the reduction function, 3 components
  model <- top2vec(d2v, data = x,
                   control.dbscan = list(minPts = 50),
                   control.umap = list(n_neighbors = 15L, n_components = 3),
                   umap = tumap,
                   trace = TRUE)

  ## Inspect the topics: top 7 words and documents
  info <- summary(model, top_n = 7)
  info$topwords
  info$topdocs

  ## Same summary with type = "c-tfidf"
  # udpipe is attached right before this call; presumably the c-tfidf
  # summary relies on it -- confirm against the doc2vec documentation.
  library(udpipe)
  info <- summary(model, top_n = 7, type = "c-tfidf")
  info$topwords

  ## Change the model: reduce doc2vec model to 2D
  model <- update(model, type = "umap",
                  n_neighbors = 100, n_components = 2, metric = "cosine",
                  umap = tumap,
                  trace = TRUE)
  info <- summary(model, top_n = 7)
  info$topwords
  info$topdocs

  ## Change the model: have minimum 200 points for the core elements
  ## in the hdbscan density
  model <- update(model, type = "hdbscan", minPts = 200, trace = TRUE)
  info <- summary(model, top_n = 7)
  info$topwords
  info$topdocs
} # End of main if statement running only if the required packages are installed
# }
##
## Example on a small sample
## with unrealistic hyperparameter settings especially regarding dim / iter / n_epochs
## in order to have a basic example finishing < 5 secs
##
# Small, fast variant of the top2vec example.
# doc2vec is part of the guard (and listed first) because paragraph2vec(),
# top2vec() and the be_parliament_2020 dataset are used below; without it the
# script fails when run outside a session that already has doc2vec attached.
if(require(doc2vec) && require(word2vec) && require(uwot) && require(dbscan)){
  # require() above already attached these; the explicit library() calls are
  # kept so the dependencies of the example remain visible at a glance.
  library(uwot)
  library(dbscan)
  library(word2vec)

  ## Build a doc_id/text data.frame from the Dutch text column
  data(be_parliament_2020, package = "doc2vec")
  x <- data.frame(doc_id = be_parliament_2020$doc_id,
                  text = be_parliament_2020$text_nl,
                  stringsAsFactors = FALSE)
  # Only the first 1000 rows are used to keep the example quick
  x <- head(x, 1000)
  x$text <- txt_clean_word2vec(x$text)
  # Keep only documents for which txt_count_words() reports less than 1000
  x <- subset(x, txt_count_words(text) < 1000)

  ## Deliberately unrealistic training settings (dim = 10, iter = 0):
  ## the goal is a short runtime, not a useful embedding
  d2v <- paragraph2vec(x, type = "PV-DBOW", dim = 10,
                       lr = 0.05, iter = 0,
                       window = 5, hs = TRUE, negative = 0,
                       sample = 0.00001, min_count = 5)

  ## Here top2vec is given a list of embedding matrices (docs and words)
  ## extracted from the model, instead of the model object itself
  emb <- list(docs = as.matrix(d2v, which = "docs"),
              words = as.matrix(d2v, which = "words"))
  model <- top2vec(emb,
                   data = x,
                   control.dbscan = list(minPts = 50),
                   control.umap = list(n_neighbors = 15, n_components = 2,
                                       init = "spectral"),
                   umap = tumap, trace = TRUE)

  ## Summarise with top_n = 7, then print with top_n = c(5, 2)
  info <- summary(model, top_n = 7)
  print(info, top_n = c(5, 2))
} # End of main if statement running only if the required packages are installed
# Run the code above in your browser using DataLab