## Download modeldata (conll 2002 shared task in Dutch)
# \donttest{
x <- ner_download_modeldata("conll2002-nl")
# }
# for CRAN only - work on a subset of the data
x <- ner_download_modeldata("conll2002-nl", docs = 10)
# Everything below only runs when the download yielded a data.frame.
if (is.data.frame(x)) {
  ##
  ## Build Named Entity Recognition model on conll2002-nl
  ##
  # Prefix every pos / token value with the name of the attribute it holds.
  x$pos <- txt_sprintf("Parts of Speech: %s", x$pos)
  x$token <- txt_sprintf("Token: %s", x$token)
  # Split on the `data` column: "ned.train" rows train, "testa" rows test.
  crf_train <- subset(x, data == "ned.train")
  crf_test <- subset(x, data == "testa")
  model <- crf(
    y = crf_train$label,
    x = crf_train[, c("token", "pos")],
    group = crf_train$doc_id,
    method = "lbfgs",
    options = list(
      max_iterations = 3, feature.minfreq = 5,
      c1 = 0, c2 = 1
    )
  )
  model
  weights <- coefficients(model)
  head(weights$states, n = 20)
  head(weights$transitions, n = 20)
  # Write the model details to a temporary file rather than into the
  # working directory, so the example leaves no stray files behind.
  details_file <- tempfile(pattern = "modeldetails_", fileext = ".txt")
  stats <- summary(model, details_file)
  stats
  plot(stats$iterations$loss)
  ## Use the CRF model to label a sequence
  scores <- predict(
    model,
    newdata = crf_test[, c("token", "pos")],
    group = crf_test$doc_id
  )
  head(scores)
  # NOTE: this overwrites the existing `label` column of crf_test with the
  # predicted labels.
  crf_test$label <- scores$label
  ## cleanup for CRAN
  # unlink() silently skips files which do not exist, so no guard is needed.
  unlink(c(model$file_model, details_file))
}
# \donttest{
##
## More detailed example where text data was annotated with the webapp in the package
## This data is joined with a tokenised dataset to construct the training data which
## is further enriched with attributes of upos/lemma in the neighbourhood
##
# udpipe is an optional dependency: check that it is installed before
# attaching it, instead of relying on the return value of require().
if (requireNamespace("udpipe", quietly = TRUE)) {
  library(udpipe)
  data(airbnb_chunks, package = "crfsuite")
  udmodel <- udpipe_download_model("dutch-lassysmall")
  # Skip the rest of the example when the udpipe model download failed.
  if (!udmodel$download_failed) {
    udmodel <- udpipe_load_model(udmodel$file_model)
    # Tokenise / annotate each distinct (doc_id, text) pair once.
    airbnb_tokens <- udpipe(
      x = unique(airbnb_chunks[, c("doc_id", "text")]),
      object = udmodel
    )
    x <- merge(airbnb_chunks, airbnb_tokens)
    x <- crf_cbind_attributes(x, terms = c("upos", "lemma"), by = "doc_id")
    # Select the feature columns once, so that training and prediction are
    # guaranteed to use exactly the same set of upos/lemma columns.
    feature_cols <- grep("upos|lemma", colnames(x), value = TRUE)
    model <- crf(
      y = x$chunk_entity,
      x = x[, feature_cols],
      group = x$doc_id,
      method = "lbfgs", options = list(max_iterations = 5)
    )
    stats <- summary(model)
    stats
    plot(stats$iterations$loss, type = "b", xlab = "Iteration", ylab = "Loss")
    scores <- predict(
      model,
      newdata = x[, feature_cols],
      group = x$doc_id
    )
    head(scores)
  }
} # End of main if statement running only if the required packages are installed
# }
# Run the code above in your browser using DataLab