tc = create_tcorpus(sotu_texts[1:5,], doc_column = 'id')
tc$n ## original number of tokens
## select only first 20 tokens per document
tc2 = subset(tc, token_id < 20)
tc2$n
## Note that the original is untouched
tc$n
## Now we subset by reference. This doesn't make a copy, but changes tc itself
tc$subset(token_id < 20)
tc$n
## you can filter on term frequency and document frequency with the freq_filter() and
## docfreq_filter() functions
tc = create_tcorpus(sotu_texts[c(1:5,800:805),], doc_column = 'id')
tc$subset( freq_filter(token, min = 2, max = 4) )
tc$tokens
###### subset can be used for meta data by using the subset_meta argument, or the subset_meta method
tc$n_meta
tc$meta
tc$subset(subset_meta = president == 'Barack Obama')
tc$n_meta
Run the code above in your browser using DataLab