# Example: add a new structural attribute ("article_id") to a temporary copy
# of the REUTERS sample corpus, read it back, then remove the temporary files.

# library() stops with an error if RcppCWB is not installed; require() would
# only return FALSE and let the script fail later with a less clear message.
library(RcppCWB)

# Temporary target locations for the copied registry and corpus data.
cwb_tmp <- fs::path(tempdir(), "cwb")
registry_tmp <- fs::path(cwb_tmp, "registry")
data_dir_tmp <- fs::path(cwb_tmp, "indexed_corpora", "reuters")

# Source locations of the sample corpus inside the installed RcppCWB package.
cwb_dir_rcppcwb <- system.file("extdata", "cwb", package = "RcppCWB")
registry_dir_rcppcwb <- fs::path(cwb_dir_rcppcwb, "registry")
data_dir_rcppcwb <- fs::path(cwb_dir_rcppcwb, "indexed_corpora", "reuters")

# Work on a copy so that the corpus inside the package is left untouched.
corpus_copy(
  corpus = "REUTERS",
  registry_dir = registry_dir_rcppcwb,
  data_dir = data_dir_rcppcwb,
  registry_dir_new = registry_tmp,
  data_dir_new = data_dir_tmp
)

# Size of the existing s-attribute "id" in the copied corpus.
no_strucs <- cl_attribute_size(
  corpus = "REUTERS",
  attribute = "id",
  attribute_type = "s",
  registry = registry_tmp
)

# Strucs are addressed zero-based; seq_len() yields an empty vector (rather
# than c(0, -1)) should the attribute have no regions.
cpos_matrix <- get_region_matrix(
  corpus = "REUTERS",
  struc = seq_len(no_strucs) - 1L,
  s_attribute = "id",
  registry = registry_tmp
)

# Encode one value per row of the region matrix as s-attribute "article_id".
# The values are passed as character strings, as documented for `values`.
s_attribute_encode(
  values = as.character(seq_len(nrow(cpos_matrix))),
  data_dir = data_dir_tmp,
  s_attribute = "article_id",
  corpus = "REUTERS",
  region_matrix = cpos_matrix,
  method = "R",
  registry_dir = registry_tmp,
  encoding = "latin1",
  verbose = TRUE,
  delete = TRUE
)

# Read the values of the newly encoded attribute back (zero-based strucs).
cl_struc2str(
  "REUTERS",
  struc = seq_len(nrow(cpos_matrix)) - 1L,
  s_attribute = "article_id",
  registry = registry_tmp
)

# Clean up the temporary registry and data directory.
unlink(registry_tmp, recursive = TRUE)
unlink(data_dir_tmp, recursive = TRUE)
# Example: read the values and regions of s-attribute "id" directly from the
# data directory of the REUTERS sample corpus shipped with RcppCWB.
data_dir <- system.file(
  "extdata", "cwb", "indexed_corpora", "reuters",
  package = "RcppCWB"
)

# Both calls read from the same directory, so the path is looked up once and
# reused instead of repeating the system.file() call.
avs <- s_attribute_get_values(s_attribute = "id", data_dir = data_dir)
rng <- s_attribute_get_regions(s_attribute = "id", data_dir = data_dir)
# Example: merge two tables of annotated regions. Each table has the columns
# cpos_left, cpos_right and value.
x <- data.frame(
  cpos_left = c(1L, 5L, 10L, 20L, 25L),
  cpos_right = c(2L, 5L, 12L, 21L, 27L),
  value = c("ORG", "LOC", "ORG", "PERS", "ORG"),
  stringsAsFactors = FALSE
)

# Corpus positions are written as integer literals throughout: mixing plain
# numbers with `L`-suffixed ones would silently coerce the whole column to
# double and make it inconsistent with the integer columns of `x`.
y <- data.frame(
  cpos_left = c(5L, 11L, 20L, 25L, 30L),
  cpos_right = c(5L, 12L, 22L, 27L, 33L),
  value = c("LOC", "ORG", "ORG", "ORG", "ORG"),
  stringsAsFactors = FALSE
)

s_attribute_merge(x, y)
# Run the code above in your browser using DataLab