# Load the `dekamer` dataset that ships with the ruimtehol package.
data(dekamer, package = "ruimtehol")

# Build a cleaned `text` column from the raw questions:
#   1. split every question on non-word characters,
#   2. drop the empty tokens left behind by consecutive separators,
#   3. glue the remaining tokens back together with single spaces.
dekamer$text <- strsplit(dekamer$question, "\\W")
dekamer$text <- lapply(dekamer$text, FUN = function(x) x[x != ""])
# vapply() pins the result to exactly one string per question; sapply() would
# silently hand back an empty list instead of character(0) on zero-length
# input.
dekamer$text <- vapply(dekamer$text,
                       FUN = function(x) paste(x, collapse = " "),
                       FUN.VALUE = character(1))
# Seed the RNG before training.
set.seed(123456789)

# Fit a tagspace model: lower-cased question text as input, the main question
# theme as label, dot-product similarity, unigrams only, p = 0.5.
# NOTE(review): assumes the ruimtehol package is already attached -- confirm.
model <- embed_tagspace(
  x = tolower(dekamer$text),
  y = dekamer$question_theme_main,
  early_stopping = 0.8,
  similarity = "dot",
  ngram = 1,
  p = 0.5,
  dim = 10,
  minCount = 5
)

# Document-level embedding of a two-word text, as computed by the model.
embedding <- starspace_embedding(
  model,
  "federale politie",
  type = "document"
)

# Matrix form of the model; rows are looked up by term name below.
embedding_dictionary <- as.matrix(model)

# Show the model's document embedding ...
embedding

# ... next to a manual counterpart: the two word vectors summed and divided
# by 2^0.5 (two words, p = 0.5).
# NOTE(review): presumably both outputs are meant to agree -- confirm against
# the StarSpace normalisation rule.
colSums(embedding_dictionary[c("federale", "politie"), ]) / 2^0.5
# Disabled examples: the condition is the constant FALSE, so nothing inside
# this block is ever executed.
# NOTE(review): looks like the rendered form of a "do not run" example
# section -- confirm.
if (FALSE) {
  # --- Cosine similarity -------------------------------------------------
  set.seed(123456789)
  model <- embed_tagspace(
    x = tolower(dekamer$text),
    y = dekamer$question_theme_main,
    early_stopping = 0.8,
    similarity = "cosine",
    ngram = 1,
    dim = 10,
    minCount = 5
  )
  embedding <- starspace_embedding(
    model,
    "federale politie",
    type = "document"
  )
  embedding_dictionary <- as.matrix(model)

  # Length of a numeric vector: square root of the sum of squares.
  euclidean_norm <- function(x) {
    sqrt(sum(x^2))
  }

  # Manual counterpart: sum of the two word vectors, rescaled to unit length,
  # shown next to the model's own document embedding.
  manual <- colSums(embedding_dictionary[c("federale", "politie"), ])
  manual / euclidean_norm(manual)
  embedding

  # --- Dot similarity with n-grams up to 3, p = 0 and a single bucket ----
  set.seed(123456789)
  model <- embed_tagspace(
    x = tolower(dekamer$text),
    y = dekamer$question_theme_main,
    early_stopping = 0.8,
    similarity = "dot",
    ngram = 3,
    p = 0,
    dim = 10,
    minCount = 5,
    bucket = 1
  )

  # Same text, requested once as a document and once as an n-gram embedding.
  starspace_embedding(model, "federale politie", type = "document")
  starspace_embedding(model, "federale politie", type = "ngram")
}
# Run the code above in your browser using DataLab