library(sentencepiece)
library(tokenizers.bpe)
data(belgium_parliament, package = "tokenizers.bpe")
# Location of the training data and the folder where the model will be saved
path <- "traindata.txt"
folder <- getwd()
# \dontshow{
path <- tempfile("traindata_", fileext = ".txt")
folder <- tempdir()
# }
# Write the corpus to disk, one document per line
writeLines(belgium_parliament$text, con = path)
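# Optional sanity check: inspect the first few lines of the corpus
head(belgium_parliament$text, n = 2)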
# \dontshow{
model <- sentencepiece(path, type = "char", vocab_size = 30, model_dir = folder)
model <- sentencepiece(path, type = "unigram", vocab_size = 50, model_dir = folder)
model <- sentencepiece(path, type = "bpe", vocab_size = 200, model_dir = folder)
# }
# \donttest{
model <- sentencepiece(path, type = "char",
model_dir = folder, verbose = TRUE)
model <- sentencepiece(path, type = "unigram", vocab_size = 20000,
model_dir = folder, verbose = TRUE)
model <- sentencepiece(path, type = "bpe", vocab_size = 4000,
model_dir = folder, verbose = TRUE)
txt <- c("De eigendomsoverdracht aan de deelstaten is ingewikkeld.",
         "On est d'accord sur le prix de la biere?")
# Tokenise into subwords and into subword ids
sentencepiece_encode(model, x = txt, type = "subwords")
sentencepiece_encode(model, x = txt, type = "ids")
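# For illustration only: the trained model object can be inspected with str(),
# which shows, among other things, where the model file lives and its vocabulary size
str(model)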
# Reload the model from the file written during training
model <- sentencepiece_load_model(file.path(folder, "sentencepiece.model"))
sentencepiece_encode(model, x = txt, type = "subwords")
sentencepiece_encode(model, x = txt, type = "ids")
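# A sketch of the reverse mapping: sentencepiece_decode() from the same package
# turns encoded output back into text
ids <- sentencepiece_encode(model, x = txt, type = "ids")
sentencepiece_decode(model, x = ids)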
# }
# \dontshow{
# clean up for CRAN
file.remove(file.path(folder, "sentencepiece.model"))
file.remove(file.path(folder, "sentencepiece.vocab"))
file.remove(path)
# }