# Unzip the sample files shipped with quanteda into the session's temporary
# directory. system.file() only locates the installed package's file, so the
# package does not need to be attached for this step.
FILEDIR <- tempdir()
unzip(
  system.file("extdata", "encodedTextFiles.zip", package = "quanteda"),
  exdir = FILEDIR
)

# Get the encoding from each file name: list the extracted .txt files ...
filenames <- list.files(FILEDIR, "\\.txt$")
# ... strip the extension (the dot is escaped so that only a literal ".txt"
# suffix is removed, matching the pattern used for list.files() above) ...
filenames <- gsub("\\.txt$", "", filenames)
# ... and take the third underscore-separated field of each name.
parts <- strsplit(filenames, "_")
# vapply() guarantees a character vector (NA where a name has fewer than three
# fields), whereas sapply() could silently return a list.
fileencodings <- vapply(parts, "[", character(1), 3)
fileencodings

# Find out which conversions are unavailable (through iconv()).
# The trailing newline keeps the heading on its own line, separate from the
# vector printed next.
cat("Encoding conversions not available for this platform:\n")
notAvailableIndex <- which(!(fileencodings %in% iconvlist()))
fileencodings[notAvailableIndex]
# Try textfile() without declaring any encodings.
# library() stops immediately if quanteda is not installed; require() would
# only return FALSE and let the script fail later at the first textfile() call.
library(quanteda)
tfile <- textfile(file.path(FILEDIR, "*.txt"))
substring(texts(tfile)[1], 1, 80) # gibberish
substring(texts(tfile)[4], 1, 80) # hex
substring(texts(tfile)[40], 1, 80) # hex

# Read them in again, this time passing the encodings parsed from the file
# names (one entry of fileencodings per file).
tfile <- textfile(file.path(FILEDIR, "*.txt"), encoding = fileencodings)
substring(texts(tfile)[1], 1, 80) # English
substring(texts(tfile)[4], 1, 80) # Arabic, looking good
substring(texts(tfile)[40], 1, 80) # Cyrillic, looking good
substring(texts(tfile)[7], 1, 80) # Chinese, looking good
substring(texts(tfile)[26], 1, 80) # Hindi, looking good

# Read once more, also requesting document variables from the file names
# under the three names given in docvarnames.
tfile <- textfile(
  file.path(FILEDIR, "*.txt"),
  encoding = fileencodings,
  docvarsfrom = "filenames",
  docvarnames = c("document", "language", "inputEncoding")
)

# Build a corpus from the re-encoded texts and summarise it.
encodingCorpus <- corpus(tfile, source = "Created by encoding-tests.R")
summary(encodingCorpus)
# Run the code above in your browser using DataLab