Count syntactic and lexical features of documents such as tokens, types, sentences, and character categories.
textstat_summary(x, ...)
x: corpus to be summarized
...: additional arguments passed through to dfm()
Count the total number of characters, tokens and sentences as well as special tokens such as numbers, punctuation marks, symbols, tags and emojis.
chars = number of characters; equal to nchar()
sents = number of sentences; equal to ntoken(tokens(x), what = "sentence")
tokens = number of tokens; equal to ntoken()
types = number of unique tokens; equal to ntype()
puncts = number of punctuation marks (matching the regular expression ^\p{P}+$)
numbers = number of numeric tokens (matching the regular expression ^\p{Sc}{0,1}\p{N}+([.,]*\p{N})*\p{Sc}{0,1}$)
symbols = number of symbols (matching the regular expression ^\p{S}$)
tags = number of tags; the sum of tokens matching pattern_username and pattern_hashtag as defined in quanteda::quanteda_options()
emojis = number of emojis (matching the regular expression ^\p{Emoji_Presentation}+$)
# Example: summarize syntactic/lexical features at three stages of the
# quanteda workflow (corpus -> tokens -> dfm).
# Skipped on Solaris, where quanteda is not supported.
if (Sys.info()["sysname"] != "SunOS") {
  library("quanteda")

  # A small demo corpus: the first five US inaugural addresses
  corp_demo <- data_corpus_inaugural[1:5]
  textstat_summary(corp_demo)

  # The same summary computed from a tokens object
  toks_demo <- tokens(corp_demo)
  textstat_summary(toks_demo)

  # And from a document-feature matrix built on those tokens
  dfmat_demo <- dfm(toks_demo)
  textstat_summary(dfmat_demo)
}
Run the code above in your browser using DataLab