# NOT RUN {
# use readCorpus() to create an object of class kRp.corpus
# code is only run when the english language package can be loaded
# All calls below are guarded: they are only evaluated when the English
# language package can be attached.
if (require("koRpus.lang.en", quietly = TRUE)) {
  # Flat corpus: parse every text found in one given directory.
  myCorpus <- readCorpus(
    dir = file.path(
      path.package("tm.plugin.koRpus"),
      "examples", "corpus", "Winner", "Wikipedia_prev"
    ),
    # tokenize() is used so the examples run without a TreeTagger installation
    tagger = "tokenize",
    lang = "en"
  )

  # Corpus with a single category named "Source"; the names of the vector
  # elements match the sub-directory names, the values are their labels.
  myCorpus <- readCorpus(
    dir = file.path(
      path.package("tm.plugin.koRpus"),
      "examples", "corpus", "Winner"
    ),
    hierarchy = list(
      Source = c(
        Wikipedia_prev = "Wikipedia (old)",
        Wikipedia_new = "Wikipedia (new)"
      )
    ),
    tagger = "tokenize",
    lang = "en"
  )

  # Two hierarchical levels: "Topic" on top, "Source" below it.
  myCorpus <- readCorpus(
    dir = file.path(
      path.package("tm.plugin.koRpus"),
      "examples", "corpus"
    ),
    hierarchy = list(
      Topic = c(
        Winner = "Reality Winner",
        Edwards = "Natalie Edwards"
      ),
      Source = c(
        Wikipedia_prev = "Wikipedia (old)",
        Wikipedia_new = "Wikipedia (new)"
      )
    ),
    tagger = "tokenize",
    lang = "en"
  )

  # Let the hierarchy be derived from the directory tree instead of
  # spelling it out by hand.
  myCorpus <- readCorpus(
    dir = file.path(
      path.package("tm.plugin.koRpus"),
      "examples", "corpus"
    ),
    hierarchy = TRUE,
    tagger = "tokenize",
    lang = "en"
  )
  # }
  # NOT RUN {
  # The same corpus, but supplied as a TIF compliant data frame rather than
  # a directory (hence format = "obj").
  # NOTE: myCorpus_df is not defined anywhere in this example; it has to
  # exist already for this call to work.
  myCorpus <- readCorpus(
    dir = myCorpus_df,
    hierarchy = list(
      Topic = c(
        Winner = "Reality Winner",
        Edwards = "Natalie Edwards"
      ),
      Source = c(
        Wikipedia_prev = "Wikipedia (old)",
        Wikipedia_new = "Wikipedia (new)"
      )
    ),
    lang = "en",
    format = "obj"
  )
  # }
# NOT RUN {
} else {}
# }
Run the code above in your browser using DataLab