# NOT RUN {
# Example: generate textrank candidates with minhash + locality-sensitive
# hashing (LSH) on the nouns and adjectives of the joboffer data.
library(textreuse) # lsh_probability(), minhash_generator()
library(textrank)  # joboffer dataset, textrank_candidates_lsh()

# With 1000 hashes split over 500 bands,
# a 10 percent Jaccard overlap will be detected well
lsh_probability(h = 1000, b = 500, s = 0.1)

# n matches h above; the fixed seed keeps the example reproducible
minhash <- minhash_generator(n = 1000, seed = 123456789)

data(joboffer, package = "textrank")
# Keep only nouns and adjectives, together with the sentence they occur in
terminology <- subset(
  joboffer,
  upos %in% c("NOUN", "ADJ"),
  select = c("sentence_id", "lemma")
)
candidates <- textrank_candidates_lsh(
  x = terminology$lemma,
  sentence_id = terminology$sentence_id,
  minhashFUN = minhash,
  bands = 500 # same band count as b in lsh_probability() above
)
head(candidates)
# }
# Run the code above in your browser using DataLab