## Small example corpus: single letters stand in for words so that the
## queries below stay easy to read.
text <- c('A B C', 'D E F. G H I', 'A D', 'GGG')
tc <- create_tcorpus(
  text,
  doc_id = c('a', 'b', 'c', 'd'),
  split_sentences = TRUE
)
tc$tokens

## Two labelled queries, reused for both searches below.
queries <- c(
  'query label# A AND B',
  'second query# (A AND Q) OR ("D E") OR I'
)

## search without specifying a context level
hits <- search_features(tc, queries)
hits           ## printing shows the number of hits
hits$hits      ## hits is a list; hits$hits is a data.frame with the matched features
summary(hits)  ## summary gives the hits per query

## sentence level
hits <- search_features(tc, queries, context_level = 'sentence')
hits$hits      ## hits is a list; hits$hits is a data.frame with the matched features
# \donttest{
## ---- query language examples ----

## single term
search_features(tc, 'A')$hits

## wildcard *
search_features(tc, 'G*')$hits   ## * at the end of the term
search_features(tc, '*G')$hits   ## * at the start of the term
search_features(tc, 'G*G')$hits  ## * in the middle of the term

## wildcard ?
search_features(tc, 'G?G')$hits  ## ? in the middle of the term
search_features(tc, 'G?')$hits   ## ? at the end of the term (no hits)

## boolean operators
search_features(tc, 'A AND B')$hits         ## AND
search_features(tc, 'A AND D')$hits
search_features(tc, 'A AND (B OR D)')$hits  ## AND with a nested OR
search_features(tc, 'A NOT B')$hits         ## NOT
search_features(tc, 'A NOT (B OR D)')$hits  ## NOT with a nested OR
## sequence search (adjacent words)
search_features(tc, '"A B"')$hits
search_features(tc, '"A C"')$hits         ## no hit, because not adjacent
search_features(tc, '"A (B OR D)"')$hits  ## can contain nested OR
## cannot contain nested AND or NOT!!
## NOTE: the angle-bracket queries below were lost when this example was
## extracted from HTML (they were stripped as if they were tags); restored here.
search_features(tc, '<A B>')$hits         ## can also use <> instead of "".

## proximity search (using ~ flag)
search_features(tc, '"A C"~5')$hits         ## A AND C within a 5 word window
search_features(tc, '"A C"~1')$hits         ## no hit, because A and C more than 1 word apart
search_features(tc, '"A (B OR D)"~5')$hits  ## can contain nested OR
search_features(tc, '"A <B C>"~5')$hits     ## can contain nested sequence (must use <>)
search_features(tc, '<A <B C>>~5')$hits     ## <> is always OK, but cannot nest "" in ""
## cannot contain nested AND or NOT!!
## case sensitive search (~s flag)
search_features(tc, 'g')$hits            ## without a flag the search is case insensitive
search_features(tc, 'g~s')$hits          ## ~s makes the term case sensitive
search_features(tc, '(a OR g)~s')$hits   ## ~s applies to everything between the parentheses
search_features(tc, '(a OR G)~s')$hits
search_features(tc, '"a b"~s')$hits      ## ~s applies to everything between the quotes
search_features(tc, '"A B"~s')$hits      ## ~s applies to everything between the quotes

## ghost terms (~g flag)
## (the flag can also be put on parentheses/quotes/angle brackets, in which
##  case it applies to all nested terms)
search_features(tc, 'A AND B~g')$hits    ## a ghost term (~g) has to occur, but is not returned
search_features(tc, 'A AND Q~g')$hits    ## no hit
## "unique_hits" versus "features" mode
tc <- create_tcorpus('A A B')

## "unique_hits" (default): only full queries are matched
## (B is not repeated to find a second match of A AND B)
search_features(tc, 'A AND B')$hits

## "features": any match is returned
## (note that hit_id is irrelevant in features mode)
search_features(tc, 'A AND B', mode = 'features')$hits

## ghost terms (used for conditions) can be repeated
search_features(tc, 'A AND B~g')$hits
# }
# Run the code above in your browser using DataLab