library(tokenizers.bpe)
# Load the example corpus from the tokenizers.bpe package.
data(belgium_parliament, package = "tokenizers.bpe")

# Keep only the rows whose `language` column is "dutch".
x <- belgium_parliament[belgium_parliament$language %in% "dutch", , drop = FALSE]

# Fit the embedder on the Dutch subset: "bpe" tokenizer with a vocabulary
# size of 1000, "cbow" model type, 20 dimensions and 10 iterations.
# NOTE(review): no library() call visible here obviously provides
# BPEembedder -- presumably it comes from the sentencepiece package;
# confirm that package is attached before this runs.
model <- BPEembedder(
  x,
  tokenizer = "bpe",
  args = list(vocab_size = 1000),
  type = "cbow",
  dim = 20,
  iter = 10
)

# Show the fitted model object.
model

# Run predict() with type = "encode" on one example sentence.
txt <- "De eigendomsoverdracht aan de deelstaten is ingewikkeld."
values <- predict(model, txt, type = "encode")
# Run the code above in your browser using DataLab