## ...
## Create a tCorpus directly from a character vector of texts and
## inspect its tokens.
tc <- create_tcorpus(
  c("Text one first sentence. Text one second sentence", "Text two")
)
tc$tokens

## Same texts, this time passing split_sentences = TRUE.
tc <- create_tcorpus(
  c("Text one first sentence. Text one second sentence", "Text two"),
  split_sentences = TRUE
)
tc$tokens
## Character vector input combined with document meta data.
## (Supplying meta is easier with the S3 method for data.frame.)
meta <- data.frame(doc_id = c(1, 2), source = c("a", "b"))
tc <- create_tcorpus(
  c("Text one first sentence. Text one second sentence", "Text two"),
  split_sentences = TRUE,
  doc_id = c(1, 2),
  meta = meta
)
tc
## data.frame input: a `text` column plus `date` and `source` columns.
d <- data.frame(
  text = c(
    "Text one first sentence. Text one second sentence.",
    "Text two",
    "Text three"
  ),
  date = c("2010-01-01", "2010-01-01", "2012-01-01"),
  source = c("A", "B", "B")
)
tc <- create_tcorpus(d, split_sentences = TRUE)
tc
tc$tokens
## Extend `d` with a second text column (headline) and a custom
## document id column.
d[["headline"]] <- c("Head one", "Head two", "Head three")
d[["doc_id"]] <- c("#1", "#2", "#3")
## Use both text columns and name the id column explicitly.
tc <- create_tcorpus(
  d,
  text_columns = c("headline", "text"),
  doc_column = "doc_id",
  split_sentences = TRUE
)
tc
tc$tokens
## Storing full texts as factors makes little sense, but it tends to
## happen. The create_tcorpus S3 method for factors is essentially
## identical to the method for a character vector.
text <- factor(c("Text one first sentence", "Text one second sentence"))
tc <- create_tcorpus(text)
tc$tokens
## Create a tCorpus from a quanteda corpus object.
## quanteda is an optional dependency: only run this example when the
## package is installed, rather than aborting the script at library().
if (requireNamespace("quanteda", quietly = TRUE)) {
  library(quanteda)
  ## data_corpus_inaugural is an example corpus shipped with quanteda.
  create_tcorpus(data_corpus_inaugural)
} else {
  message(
    "Package 'quanteda' is not installed; ",
    "skipping the quanteda corpus example."
  )
}
## Run the code above in your browser using DataLab