# \donttest{
# Build a small toy corpus: one row per line of text.
my_corpus <- data.frame(
  text = c(
    "I hear babies crying I watch them grow",
    "They'll learn much more than I'll ever know",
    "And I think to myself",
    "What a wonderful world",
    "Yes I think to myself",
    "What a wonderful world"
  )
)
# Label each row "line1", "line2", ... to serve as its document id.
my_corpus$line_id <- paste0("line", seq_len(nrow(my_corpus)))
# Light preprocessing: drop apostrophes, then lowercase everything.
my_corpus$clean_text <- my_corpus$text |>
  gsub(pattern = "'", replacement = "") |>
  tolower()
# Build the document-term matrix from the cleaned text, using line_id
# as the document identifier.
dtm <- dtm_builder(
  data = my_corpus,
  text = clean_text,
  doc_id = line_id
)
# Column totals of the DTM give the frequency of each term.
df <- data.frame(vocab = colnames(dtm), freqs = colSums(dtm))
# Scale the frequencies so they sum to one (term probabilities).
df$probs <- with(df, freqs / sum(freqs))
# Generate random corpora from the vocabulary and its term probabilities.
# A fixed seed is supplied, presumably so the output is reproducible.
ls_dtms <- rancors_builder(
  df,
  vocab,
  probs,
  n_cors = 20,
  n_docs = 100,
  len_mean = c(50, 200),
  len_var = 5,
  len_min = 20,
  len_max = 1000,
  seed = 59801
)
# Show how many elements the result holds.
length(ls_dtms)
# }
# Run the code above in your browser using DataLab