# In this example, we follow a "pure R" approach.
library(dplyr)
# Locate the sample token stream. `mustWork = TRUE` makes a missing file (or
# a missing RcppCWB installation) fail right here with a clear error instead
# of handing an empty path to readLines().
reu <- system.file(
  package = "RcppCWB", "extdata", "examples", "reuters.txt",
  mustWork = TRUE
)
tokens <- readLines(reu)

# Create new (and empty) directory structure below the session's tempdir()
registry_tmp <- fs::path(tempdir(), "registry")
data_dir_tmp <- fs::path(tempdir(), "data_dir", "reuters")

# Remove leftovers from a previous run so both directories start out empty.
if (dir.exists(registry_tmp)) {
  unlink(registry_tmp, recursive = TRUE)
}
if (dir.exists(data_dir_tmp)) {
  unlink(data_dir_tmp, recursive = TRUE)
}

dir.create(registry_tmp)
# The data directory is nested two levels deep, hence recursive = TRUE.
dir.create(data_dir_tmp, recursive = TRUE)
# Encode the token stream as the p-attribute "word" of corpus "reuters".
# The pure R encoder is requested (method = "R") and compression is skipped.
p_attribute_encode(
  token_stream = tokens,
  corpus = "reuters",
  p_attribute = "word",
  encoding = "utf8",
  registry_dir = registry_tmp,
  data_dir = data_dir_tmp,
  method = "R",
  compress = FALSE,
  quietly = TRUE
)
# Augment registry file: parse it, set a descriptive name and a few corpus
# properties (charset, language, today's date as build date), write it back.
registry_file_parse(corpus = "REUTERS", registry_dir = registry_tmp) %>%
  registry_set_name("Reuters Sample Corpus") %>%
  registry_set_property("charset", "utf8") %>%
  registry_set_property("language", "en") %>%
  registry_set_property("build_date", format(Sys.Date())) %>%
  registry_file_write()
# Run query as a test
library(RcppCWB)

# Match "oil" together with three tokens of context on either side.
cqp_query(corpus = "REUTERS", query = '[]{3} "oil" []{3};')
regions <- cqp_dump_subcorpus(corpus = "REUTERS")

# Turn every matched region into one space-separated line of text.
# `regions` is used as a matrix with one row per match whose first two
# columns are the start and end corpus positions.
# vapply() over row indices guarantees a character vector and yields
# character(0) for a zero-row matrix, where apply() would misbehave.
kwic <- vapply(
  seq_len(nrow(regions)),
  function(i) {
    # Corpus positions -> token ids -> token strings
    ids <- cl_cpos2id(
      corpus = "REUTERS",
      p_attribute = "word",
      registry = registry_tmp,
      cpos = regions[i, 1]:regions[i, 2]
    )
    words <- cl_id2str(
      corpus = "REUTERS",
      p_attribute = "word",
      registry = registry_tmp,
      id = ids
    )
    paste0(words, collapse = " ")
  },
  character(1)
)

# Show at most ten lines; unlike kwic[1:10], head() does not pad the output
# with NA when the query has fewer than ten hits.
head(kwic, 10)
# Run the code above in your browser using DataLab