Learn R Programming

cwbtools (version 0.1.0)

CorpusData: Manage Corpus Data and Encode CWB Corpus.

Description

Manage Corpus Data and Encode CWB Corpus.

Usage

CorpusData

Format

An object of class R6ClassGenerator of length 24.

Arguments

x

A single filename, a character vector of filenames, or a directory with XML files.

body

An xpath expression defining the body of the xml document.

meta

A named character vector with xpath expressions.

mc

A numeric/integer value, number of cores to use.

compress

Logical, whether to compress corpus.

encoding

Encoding/charset of the CWB corpus.

registry_dir

Corpus registry, the directory where registry files are stored.

corpus

The name of the CWB corpus.

p_attributes

Positional attributes.

s_attributes

Columns that will be encoded as structural attributes.

data_dir

Directory where to create directory for indexed corpus files.

method

Either "R" or "CWB".

...

Arguments that are passed into tokenizers::tokenize_words().

verbose

Logical, whether to be verbose.

progress

Logical, whether to show progress bar.

Fields

chunktable

A data.table with column "id" (unique values), columns with metadata, and a column with text chunks.

tokenstream

A data.table with a column "cpos" (corpus position), and columns with positional attributes, such as "word", "lemma", "pos", "stem".

metadata

A data.table with a column "id", to link data with chunks/tokenstream, columns with document-level metadata, and a column "cpos_left" and "cpos_right", which can be generated using method $add_corpus_positions().

sentences

A data.table.

named_entities

A data.table.

Methods

$new()

Initialize a new instance of class CorpusData.

$print()

Print summary of CorpusData object.

$purge(replacements = list(c("^\\s*<.*?>\\s*$", ""), c("\u2019", "'")))

Remove patterns from chunkdata that are known to cause problems. This is done most efficiently at the chunkdata level of data preparation as the length of the character vector to handle is much smaller than when tokenization/annotation has been performed.

$tokenize(verbose = TRUE)

Simple tokenization of text in chunktable.

$add_corpus_positions(verbose = TRUE)

Add column cpos to tokenstream and columns cpos_left and cpos_right to metadata.

$encode(corpus, p_attributes = "word", s_attributes = NULL, encoding, registry_dir = Sys.getenv("CORPUS_REGISTRY"), data_dir = NULL, method = c("R", "CWB"), verbose = TRUE, compress = FALSE)

Encode corpus. If the corpus already exists, it will be removed.

$import_xml(filenames, body = "//body", meta = NULL, mc = NULL, progress = TRUE)

Import XML files: extract the document body using the xpath expression body and document-level metadata using the xpath expressions in meta.
Examples

Run this code
# NOT RUN {
library(RcppCWB)
library(data.table)

# This example relies on the R method to write data to disk; there is also a
# method "CWB" that relies on CWB tools to generate the indexed corpus. The
# CWB can be downloaded and installed within the package by calling
# cwb_install().

# Create a temporary registry file so that the data shipped with the RcppCWB
# package can be used.

registry_rcppcwb <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
registry_tmp <- file.path(normalizePath(tempdir(), winslash = "/"), "registry")
if (!dir.exists(registry_tmp)) dir.create(registry_tmp)
r <- registry_file_parse("REUTERS", registry_dir = registry_rcppcwb)
r[["home"]] <- system.file(
  package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "reuters"
)
registry_file_write(r, corpus = "REUTERS", registry_dir = registry_tmp)

# Decode structural attribute 'places'.

s_attrs_places <- RcppCWB::s_attribute_decode(
  corpus = "REUTERS",
  data_dir = system.file(
    package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "reuters"
  ),
  s_attribute = "places", method = "R"
)
s_attrs_places[["id"]] <- seq_len(nrow(s_attrs_places))
setnames(s_attrs_places, old = "value", new = "places")

# Decode positional attribute 'word'. Iterate over rows with lapply() rather
# than apply(): apply() would coerce the data.frame to a character matrix, so
# the corpus-position columns would only work via implicit character->numeric
# coercion in `:`.

tokens <- lapply(
  seq_len(nrow(s_attrs_places)),
  function(i) {
    # First two columns are the left and right corpus positions of the region.
    ids <- cl_cpos2id(
      corpus = "REUTERS",
      cpos = s_attrs_places[[1L]][i]:s_attrs_places[[2L]][i],
      p_attribute = "word", registry = registry_tmp
    )
    cl_id2str(corpus = "REUTERS", id = ids, p_attribute = "word", registry = registry_tmp)
  }
)
tokenstream <- rbindlist(
  lapply(
    seq_along(tokens),
    function(i) data.table(id = i, word = tokens[[i]])
  )
)
tokenstream[["cpos"]] <- 0L:(nrow(tokenstream) - 1L)

# Create a CorpusData object (see the vignette for further explanation).

CD <- CorpusData$new()
CD$tokenstream <- as.data.table(tokenstream)
CD$metadata <- as.data.table(s_attrs_places)

# Remove the temporary registry while its home dir still points to the RcppCWB
# data dir, to prevent package data from being deleted. Note: file.remove()
# cannot remove a directory; unlink(..., recursive = TRUE) is required.

file.remove(file.path(registry_tmp, "reuters"))
unlink(registry_tmp, recursive = TRUE)

# Create temporary directories (a registry directory and one for the indexed
# corpus). normalizePath() with winslash = "/" already handles Windows paths,
# so a second platform-specific normalization pass is unnecessary.

tmpdir <- normalizePath(tempdir(), winslash = "/")
registry_tmp <- file.path(tmpdir, "registry", fsep = "/")
data_dir_tmp <- file.path(tmpdir, "data_dir", fsep = "/")
if (!dir.exists(registry_tmp)) dir.create(registry_tmp)
if (!dir.exists(data_dir_tmp)) dir.create(data_dir_tmp)

CD$encode(
  corpus = "REUTERS", encoding = "utf8",
  p_attributes = "word", s_attributes = "places",
  registry_dir = registry_tmp, data_dir = data_dir_tmp,
  method = "R"
)
reg <- registry_data(name = "REUTERS", id = "REUTERS", home = data_dir_tmp, p_attributes = "word")
registry_file_write(data = reg, corpus = "REUTERS", registry_dir = registry_tmp)

# See whether it works.

cl_cpos2id(corpus = "REUTERS", p_attribute = "word", cpos = 0L:4049L, registry = registry_tmp)
# }

Run the code above in your browser using DataLab