path <- getwd()
# \dontshow{
path <- tempdir()
# }
# \donttest{
##
## Download only the tokeniser model
##
dl <- sentencepiece_download_model("Russian", vocab_size = 50000, model_dir = path)
dl <- sentencepiece_download_model("English", vocab_size = 100000, model_dir = path)
dl <- sentencepiece_download_model("French", vocab_size = 25000, model_dir = path)
dl <- sentencepiece_download_model("multi", vocab_size = 320000, model_dir = path)
dl <- sentencepiece_download_model("Vlaams", vocab_size = 1000, model_dir = path)
dl <- sentencepiece_download_model("Dutch", vocab_size = 25000, model_dir = path)
dl <- sentencepiece_download_model("nl", vocab_size = 25000, model_dir = path)
str(dl)
model <- sentencepiece_load_model(dl$file_model)
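##
## Not in the original example: a minimal sketch of using the loaded Dutch
## tokeniser via sentencepiece_encode from this package, which returns the
## subword pieces or their vocabulary ids
##
txt <- "De kat zit op de mat."
sentencepiece_encode(model, x = txt, type = "subwords")
sentencepiece_encode(model, x = txt, type = "ids")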
##
## Download the tokeniser model + Glove embeddings of Byte Pairs
##
dl <- sentencepiece_download_model("nl", vocab_size = 1000, dim = 50, model_dir = path)
str(dl)
model <- sentencepiece_load_model(dl$file_model)
embedding <- read_word2vec(dl$glove$file_model)
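##
## A hedged sketch, not in the original example: look up the Glove vectors
## of the subwords of a word, assuming read_word2vec returns a matrix with
## the subword tokens as its rownames
##
subwords <- sentencepiece_encode(model, x = "confituur", type = "subwords")[[1]]
embedding[rownames(embedding) %in% subwords, ]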
# }
dl <- sentencepiece_download_model("nl", vocab_size = 1000, dim = 25,
model_dir = tempdir())
str(dl)
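##
## A sketch under assumptions: the package also provides BPEembed, which
## wraps the tokeniser file and the embedding file into one model; its
## predict method encodes text into subword vectors and decodes vectors
## back to subwords. Argument order and the "encode"/"decode" types are
## assumptions here, so double-check them against the BPEembed docs.
##
embedder <- BPEembed(dl$file_model, dl$glove$file_model)
vectors  <- predict(embedder, newdata = "telefoon", type = "encode")
predict(embedder, newdata = vectors, type = "decode")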
# \dontshow{
# clean up for CRAN
f <- list.files(tempdir(), pattern = "\\.vocab$|\\.model$", full.names = TRUE)
invisible(file.remove(f))
# }