library(quanteda)
# tokenize corpus
toks <- tokens(cr_sample_corpus)
# build a tokenized corpus of contexts sorrounding a target term
immig_toks <- tokens_context(x = toks, pattern = "immigration", window = 6L)
# sample 100 instances of the target term, stratifying by party (only for example purposes)
set.seed(2022L)
immig_toks <- tokens_sample(immig_toks, size = 100, by = docvars(immig_toks, 'party'))
# compute the cosine similarity between each group's embedding
# and a specific set of features
set.seed(2021L)
get_cos_sim(x = immig_toks,
groups = docvars(immig_toks, 'party'),
features = c("reform", "enforce"),
pre_trained = cr_glove_subset,
transform = TRUE,
transform_matrix = cr_transform,
bootstrap = TRUE,
num_bootstraps = 100,
confidence_level = 0.95,
stem = TRUE,
as_list = FALSE)
Run the code above in your browser using DataLab