data(patient_records)
# Weighted (probabilistic) comparison of forename, middlename and surname
criteria_1 <- as.list(patient_records[c("forename", "middlename", "surname")])
# Possible scores when m-probability is 0.95
prob_scores <- prob_score_range(attribute = criteria_1,
m_probability = 0.95,
u_probability = NULL)
if (FALSE) {
# Probabilistic record linkage with 'links_af_probabilistic()'
pids_1a <- links_af_probabilistic(attribute = criteria_1,
cmp_func = exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
score_threshold = prob_scores$mid_scorce,
display = "stats")
# Equivalent with 'links_wf_probabilistic()'
pids_1b <- links_wf_probabilistic(attribute = criteria_1,
cmp_func = exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
score_threshold = prob_scores$mid_scorce,
display = "progress",
recursive = TRUE,
check_duplicates = TRUE)
# Less thorough but faster equivalent with `links_wf_probabilistic()`
pids_1c <- links_wf_probabilistic(attribute = criteria_1,
cmp_func = exact_match,
attr_threshold = 1,
probabilistic = TRUE,
m_probability = 0.95,
score_threshold = prob_scores$mid_scorce,
display = "progress",
recursive = FALSE,
check_duplicates = FALSE)
# Each implementation can lead to different results
summary(pids_1a$pid)
summary(pids_1b$pid)
summary(pids_1c$pid)
}
# Weighted (non-probabilistic) comparison of forename, middlename and age difference
criteria_2 <- as.list(patient_records[c("forename", "middlename", "dateofbirth")])
age_diff <- function(x, y){
diff <- abs(as.numeric(x) - as.numeric(y))
wgt <- diff %in% 0:(365 * 10) & !is.na(diff)
wgt
}
pids_2a <- links_af_probabilistic(attribute = criteria_2,
blocking_attribute = patient_records$surname,
cmp_func = c(exact_match, exact_match, age_diff),
score_threshold = number_line(3, 5),
probabilistic = FALSE,
display = "stats")
# Larger weights can be assigned to particular attributes through `cmp_func`
# For example, a smaller age difference can contribute a higher score (e.g 0 to 3)
age_diff_2 <- function(x, y){
diff <- as.numeric(abs(x - y))
wgt <- diff %in% 0:(365 * 10) & !is.na(diff)
wgt[wgt] <- match(as.numeric(cut(diff[wgt], 3)), 3:1)
wgt
}
pids_2b <- links_af_probabilistic(attribute = criteria_2,
blocking_attribute = patient_records$surname,
cmp_func = c(exact_match, exact_match, age_diff_2),
score_threshold = number_line(3, 5),
probabilistic = FALSE,
display = "stats")
head(pids_2a$pid_weights, 10)
head(pids_2b$pid_weights, 10)
Run the code above in your browser using DataLab