# with the default unit, segment() gives the same result as tokenize()
identical(tokenize(ukimmigTexts), segment(ukimmigTexts))
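# quick look at the default (token) segmentation; this assumes segment()
# returns a list of token vectors, one per text, just as tokenize() does
str(segment(ukimmigTexts), list.len = 3)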
# segment texts 3 and 4 into paragraphs
segment(ukimmigTexts[3:4], "paragraphs")
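# count how many paragraphs each speech is split into
# (assumes segment() returns one character vector of paragraphs per text)
lengths(segment(ukimmigTexts, "paragraphs"))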
# segment each text into sentences
segmentedChar <- segment(ukimmigTexts, "sentences")
segmentedChar[2]  # sentences from the second text
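# number of sentences found in each text, and the first few sentences of
# the second one (assuming a list of character vectors is returned)
lengths(segmentedChar)
head(segmentedChar[[2]], 3)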
# build a small corpus in which ## tags mark document boundaries
testCorpus <- corpus("##INTRO This is the introduction.
##DOC1 This is the first document.
Second sentence in Doc 1.
##DOC3 Third document starts here.
End of third document.")
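# before segmenting, the corpus contains a single document
summary(testCorpus)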
# split the corpus into one document per ## tag
testCorpusSeg <- segment(testCorpus, "tags")
summary(testCorpusSeg)
texts(testCorpusSeg)
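# the text after each tag becomes its own document; the tag itself should be
# recorded as a document variable (an assumption about segment(x, "tags"))
docvars(testCorpusSeg)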
# segment a corpus into sentences
segmentedCorpus <- segment(corpus(ukimmigTexts), "sentences")
identical(ndoc(segmentedCorpus), length(unlist(segmentedChar)))  # both should equal the total sentence count
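# peek at the first few sentence-level documents; n is assumed to limit the
# number of documents displayed, as in quanteda's summary method for corpora
summary(segmentedCorpus, n = 5)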