## Not run:
# Decode the first words of the GERMAPARLMINI corpus (corpus positions 0 to 9, the first sentence)
get_token_stream(0:9, corpus = "GERMAPARLMINI", p_attribute = "word")
# Decode the same positions and collapse the tokens into a single string
get_token_stream(0:9, corpus = "GERMAPARLMINI", p_attribute = "word", collapse = " ")
# Decode regions defined by a two-column matrix of corpus positions
# (each row is a start/end pair, here cpos 0-9 and 10-25)
region_matrix <- matrix(c(0, 9, 10, 25), ncol = 2, byrow = TRUE)
get_token_stream(region_matrix, corpus = "GERMAPARLMINI", p_attribute = "word", encoding = "latin1")
# Use argument 'beautify' to remove surplus whitespace
get_token_stream(
  region_matrix,
  corpus = "GERMAPARLMINI",
  p_attribute = "word",
  encoding = "latin1",
  collapse = " ",
  beautify = TRUE
)
# Decode an entire corpus, either by corpus ID or from a corpus object
fulltext <- get_token_stream("GERMAPARLMINI", p_attribute = "word")
corpus("GERMAPARLMINI") %>%
get_token_stream(p_attribute = "word") %>%
head()
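# A minimal sketch of downstream use, assuming the decoded token stream is a
# plain character vector of word forms (as assigned to 'fulltext' above):
# count the tokens and tabulate the most frequent word forms with base R only.
length(fulltext)
head(sort(table(fulltext), decreasing = TRUE))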
# Decode subcorpus
corpus("REUTERS") %>%
subset(id == "127") %>%
get_token_stream(p_attribute = "word") %>%
head()
# Decode partition_bundle
pb_tokstr <- corpus("REUTERS") %>%
  split(s_attribute = "id") %>%
  get_token_stream(p_attribute = "word")
# Get the token stream for a partition_bundle object created directly
pb <- partition_bundle("REUTERS", s_attribute = "id")
ts_list <- get_token_stream(pb, progress = FALSE)
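# A hedged follow-up, assuming ts_list is a named list of character vectors
# (one element per partition): collapse each token stream into a single
# string, e.g. as input for other text mining tools.
docs <- sapply(ts_list, paste, collapse = " ")
head(names(docs))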
## End(Not run)