library(dplyr)
# Toy corpus: six lyric lines, one document per row, with ids line1..line6.
my_corpus <- data.frame(
  text = c(
    "I hear babies crying I watch them grow",
    "They'll learn much more than I'll ever know",
    "And I think to myself",
    "What a wonderful world",
    "Yes I think to myself",
    "What a wonderful world"
  ),
  line_id = sprintf("line%d", 1:6)
)
# Preprocessing: lowercase the text and strip apostrophes into a new column.
my_corpus[["clean_text"]] <- gsub(
  "'", "", tolower(my_corpus[["text"]]),
  fixed = TRUE
)
# NOTE(review): dtm_builder() is not defined or attached in this script --
# presumably it comes from the text2map package; confirm it is loaded first.
# example 1: native pipe (R >= 4.1); the corpus is piped in as the first
# argument, then the text and document-id columns are named explicitly
# \donttest{
dtm <- my_corpus |>
  dtm_builder(text = clean_text, doc_id = line_id)
# }
# example 2: the same call written without a pipe
dtm <- dtm_builder(my_corpus, text = clean_text, doc_id = line_id)
# example 3: dplyr pipe; the cleaning (strip apostrophes, then lowercase)
# happens inside mutate() before the result is handed to dtm_builder()
# \donttest{
dtm <- my_corpus %>%
  mutate(clean_text = tolower(gsub("'", "", text))) %>%
  dtm_builder(text = clean_text, doc_id = line_id)
# example 4: dplyr pipe with a chunk of 3 terms
dtm <- my_corpus %>%
  dtm_builder(text = clean_text, doc_id = line_id, chunk = 3L)
# }
# example 5: supply a user-defined vocabulary through the `vocab` argument
my.vocab <- c("wonderful", "world", "haiku", "think")
dtm <- dtm_builder(my_corpus, clean_text, line_id, vocab = my.vocab)
# Run the code above in your browser using DataLab