# NOT RUN {
# Simple example with interesting warning and error reporting.
# The string might look like "AABB", but it isn't: the second and fourth
# characters are Cyrillic (U+0410 and U+0412), not Latin "A" and "B".
(string <- "\u0041\u0410\u0042\u0412")
tokenize(string, c("A", "B"))

# Make an ad-hoc orthography profile.
# Non-ASCII graphemes are written as \u escapes (as in the string above) so
# that they survive being printed or saved in a non-UTF-8 locale; "\u00E4" is
# a-umlaut.
profile <- cbind(
  Grapheme = c("a", "\u00E4", "n", "ng", "ch", "sch"),
  Trans = c("a", "e", "n", "N", "x", "sh")
)

# Tokenization
tokenize(c("nana", "\u00E4nngsch\u00E4", "ach"), profile)

# With replacements and a warning ("\u00E1", a-acute, is used in the first
# string but is not listed in the profile)
tokenize(
  c("Nan\u00E1", "\u00E4nngsch\u00E4", "ach"),
  profile,
  transliterate = "Trans"
)

# Different results of ordering
tokenize("aaa", c("a", "aa"), order = NULL)
tokenize("aaa", c("a", "aa"), order = "size")

# Regex matching does not catch overlap, which can lead to wrong results.
# The second example results in a warning instead of just parsing "ab bb".
# This should occur only rarely in natural language.
tokenize("abbb", profile = c("ab", "bb"), order = NULL)
tokenize("abbb", profile = c("ab", "bb"), order = NULL, regex = TRUE)

# Different parsing methods can lead to different results.
# Note that in natural language this is VERY unlikely to happen.
tokenize("abc", c("bc", "ab", "a", "c"), order = NULL, method = "global")$strings
tokenize("abc", c("bc", "ab", "a", "c"), order = NULL, method = "linear")$strings
# }
# Run the code above in your browser using DataLab