# NOT RUN {
{
  # Example: fit a TextSummary model on the 'opinosis' data and produce
  # one-sentence summaries per topic.
  library(RtextSummary)
  library(stringr)
  library(tidyr)
  library(dplyr)

  data("opinosis")

  # The data is cut down to two rows of ten characters each so the example
  # finishes within the CRAN check limit (< 5 sec run-time).
  # Remove the statement below to build the model on the entire dataset.
  opinosis <- mutate(
    opinosis[1:2, ],
    text = substr(text, start = 1, stop = 10)
  )

  # 'stopwords_longlist' is a very long list of stopwords. It is loaded here
  # but not used in this example; it can be useful for other datasets.
  data("stopwords_longlist")

  # Lower-case the text, then drop every character that is not a lower-case
  # letter, a period, or a space.
  opinosis$text <- opinosis$text %>%
    stringr::str_to_lower() %>%
    stringr::str_replace_all("[^a-z. ]", "")

  # Fit at the sentence level (one row per period-delimited piece), which
  # works well for this dataset. For other datasets, also try fitting at the
  # document level, i.e. fit on the unsplit text instead of `tempdf$text`.
  tempdf <- tidyr::separate_rows(opinosis, text, sep = "\\.")

  # ----------------------------------------
  # Build the model with an empty stopword list and fit it on the sentences.
  summary.model <- TextSummary$new(stopword_list = c())
  summary.model$fit(tempdf$text)

  # First pass: return the individual sentences so their weights can be
  # inspected. The parameters below work well for this dataset; for other
  # datasets, try changing weight_method and avg_weight_by_word_count.
  df_sentence_level <- summary.model$transform(
    opinosis,
    doc_id = "topics",
    txt_col = "text",
    summary_col = "summary",
    weight_method = "Magnitude",
    return_sentences = TRUE,
    avg_weight_by_word_count = TRUE
  )

  # Explore weight thresholds: deciles of the sentence weights.
  quantile(df_sentence_level$wt, seq(0, 1, 0.1))

  # Second pass: build the summaries, using the 30th percentile of the
  # sentence weights as the weight threshold and topN = 1.
  df_summary <- summary.model$transform(
    opinosis,
    doc_id = "topics",
    txt_col = "text",
    summary_col = "summary",
    weight_method = "Magnitude",
    topN = 1,
    weight_threshold = quantile(df_sentence_level$wt, 0.3),
    return_sentences = FALSE,
    replace_char = "",
    avg_weight_by_word_count = TRUE
  )
}
# }
# Run the code above in your browser using DataLab