# Example, intentionally not run: the `if (FALSE)` guard keeps this code from
# executing when the file is sourced.
if (FALSE) {
  # One-document input; the text is "\u5c0f\u7530\u6025\u7dda" (the Odakyu
  # line, written in four Han characters).
  corpus <- data.frame(
    doc_id = "odakyu-sen",
    text = "\u5c0f\u7530\u6025\u7dda"
  )

  # Tokenize the text and keep only the "POS1" feature column.
  df <- corpus |>
    tokenize() |>
    prettify(col_select = "POS1")

  # Condition handed to collapse_tokens(): POS1 equals "\u540d\u8a5e" (noun)
  # and the token consists solely of Han-script characters.
  # NOTE(review): presumably runs of matching tokens are merged -- confirm
  # against the collapse_tokens() documentation.
  df |>
    collapse_tokens(
      POS1 == "\u540d\u8a5e" & stringr::str_detect(token, "^[\\p{Han}]+$")
    ) |>
    head()
}
# Run the code above in your browser using DataLab