multilink: Multifile Record Linkage and Duplicate Detection

Description

The multilink package implements the methodology of Aleshin-Guendel & Sadinle (2022). It handles the general problem of multifile record linkage and duplicate detection, where any number of files are to be linked, and any of the files may have duplicates.
Arguments

References

Serge Aleshin-Guendel & Mauricio Sadinle (2022). Multifile Partitioning for Record Linkage and Duplicate Detection. Journal of the American Statistical Association. https://doi.org/10.1080/01621459.2021.2013242 [arXiv]
Examples

Run this code
# Here we demonstrate an example workflow with the small no duplicate dataset
data(no_dup_data_small)

# Create the comparison data
comparison_list <- create_comparison_data(no_dup_data_small$records,
  types = c("bi", "lv", "lv", "lv", "lv", "bi", "bi"),
  breaks = list(NA, c(0, 0.25, 0.5), c(0, 0.25, 0.5),
                c(0, 0.25, 0.5), c(0, 0.25, 0.5), NA, NA),
  file_sizes = no_dup_data_small$file_sizes,
  duplicates = c(0, 0, 0))

# Specify the prior
prior_list <- specify_prior(comparison_list, mus = NA, nus = NA, flat = 0,
  alphas = rep(1, 7), dup_upper_bound = c(1, 1, 1),
  dup_count_prior_family = NA, dup_count_prior_pars = NA,
  n_prior_family = "uniform", n_prior_pars = NA)

# Find initialization for the matching (this step is optional)
# The following line corresponds to only keeping pairs of records as
# potential matches in the initialization for which neither gname nor fname
# disagree at the highest level
pairs_to_keep <- (comparison_list$comparisons[, "gname_DL_3"] != TRUE) &
  (comparison_list$comparisons[, "fname_DL_3"] != TRUE)
Z_init <- initialize_partition(comparison_list, pairs_to_keep, seed = 42)

# Run the Gibbs sampler
results <- gibbs_sampler(comparison_list, prior_list, n_iter = 1000,
  Z_init = Z_init, seed = 42)

# Find the full Bayes estimate
# \donttest{
full_estimate <- find_bayes_estimate(results$partitions, burn_in = 100,
  L_FNM = 1, L_FM1 = 1, L_FM2 = 2, L_A = Inf, max_cc_size = 50)

# The number of clusters in the full estimate
length(unique(full_estimate))
# The number of entities represented in the records
length(unique(no_dup_data_small$IDs))

# Find which record pairs are truly coreferent based on IDs
true_links <- no_dup_data_small$IDs[comparison_list$record_pairs[, 1]] ==
  no_dup_data_small$IDs[comparison_list$record_pairs[, 2]]

# Find which record pairs are in the same clusters in the full estimate
full_estimate_links <- full_estimate[comparison_list$record_pairs[, 1]] ==
  full_estimate[comparison_list$record_pairs[, 2]]

# Find the number of true matches in the full estimate
true_matches <- sum(full_estimate_links & true_links)

# Precision of the full estimate
true_matches / sum(full_estimate_links)

# Recall of the full estimate
true_matches / sum(true_links)

# Find the partial Bayes estimate
partial_estimate <- find_bayes_estimate(results$partitions, burn_in = 100,
  L_FNM = 1, L_FM1 = 1, L_FM2 = 2, L_A = 0.1, max_cc_size = 12)

# The partial estimate abstains from making decisions for how many records?
# (abstained records carry the label -1)
sum(partial_estimate == -1)

# For the records which decisions were made for in the partial estimate,
# there are how many clusters? The abstain label -1 is dropped first so that
# it is not counted as a cluster of its own.
length(unique(partial_estimate[partial_estimate != -1]))

# Abstain rate of partial_estimate
sum(partial_estimate == -1) / length(partial_estimate)

# Relabel records where we abstained: each abstained record gets its own
# distinct label (its index offset by length(partial_estimate)), so two
# abstained records no longer share the label -1 and are not treated as linked
# below. NOTE(review): assumes existing cluster labels never exceed
# length(partial_estimate) -- confirm against find_bayes_estimate.
abstained <- which(partial_estimate == -1)
partial_estimate[abstained] <- length(partial_estimate) + abstained

# Find which record pairs are in the same clusters in the partial estimate
partial_estimate_links <-
  partial_estimate[comparison_list$record_pairs[, 1]] ==
  partial_estimate[comparison_list$record_pairs[, 2]]

# Find the number of true matches in the partial estimate
true_matches_A <- sum(partial_estimate_links & true_links)

# Precision of the partial estimate
true_matches_A / sum(partial_estimate_links)
# }

# Here we demonstrate an example workflow with the small duplicate dataset
data(dup_data_small)

# Create the comparison data
comparison_list <- create_comparison_data(dup_data_small$records,
  types = c("bi", "lv", "lv", "lv", "lv", "bi", "bi"),
  breaks = list(NA, c(0, 0.25, 0.5), c(0, 0.25, 0.5),
                c(0, 0.25, 0.5), c(0, 0.25, 0.5), NA, NA),
  file_sizes = dup_data_small$file_sizes,
  duplicates = c(1, 1, 1))

# Reduce the comparison data
# The following line corresponds to only keeping pairs of records for which
# neither gname nor fname disagree at the highest level
pairs_to_keep <- (comparison_list$comparisons[, "gname_DL_3"] != TRUE) &
  (comparison_list$comparisons[, "fname_DL_3"] != TRUE)
reduced_comparison_list <- reduce_comparison_data(comparison_list,
  pairs_to_keep, cc = 1)

# Specify the prior
prior_list <- specify_prior(reduced_comparison_list, mus = NA, nus = NA,
  flat = 0, alphas = rep(1, 7), dup_upper_bound = c(10, 10, 10),
  dup_count_prior_family = c("Poisson", "Poisson", "Poisson"),
  dup_count_prior_pars = list(c(1), c(1), c(1)), n_prior_family = "uniform",
  n_prior_pars = NA)

# Run the Gibbs sampler
results <- gibbs_sampler(reduced_comparison_list, prior_list, n_iter = 1000,
  seed = 42)

# Find the full Bayes estimate
# \donttest{
full_estimate <- find_bayes_estimate(results$partitions, burn_in = 100,
  L_FNM = 1, L_FM1 = 1, L_FM2 = 2, L_A = Inf, max_cc_size = 50)

# The number of clusters in the full estimate (including records
# determined not to be candidate matches to any other records using
# reduce_comparison_data)
length(unique(full_estimate)) +
  sum(reduced_comparison_list$file_sizes_not_included)
# The number of entities represented in the records
length(unique(dup_data_small$IDs))

# Find which record pairs are truly coreferent based on IDs
true_links <- dup_data_small$IDs[comparison_list$record_pairs[, 1]] ==
  dup_data_small$IDs[comparison_list$record_pairs[, 2]]

# Focus on the record pairs that were candidate matches
true_links_reduced <- true_links[reduced_comparison_list$pairs_to_keep]

# Calculate the number of prior false non-matches based on the indexing
# scheme used, i.e. truly coreferent pairs that were dropped by
# reduce_comparison_data. Counting the mask directly (rather than taking
# nrow() of a row subset) stays a valid count even when exactly one or zero
# pairs qualify.
prior_fnm <- sum(true_links & !reduced_comparison_list$pairs_to_keep)

# Find which record pairs are in the same clusters in the full estimate
full_estimate_links <-
  full_estimate[reduced_comparison_list$record_pairs[, 1]] ==
  full_estimate[reduced_comparison_list$record_pairs[, 2]]

# Find the number of true matches in the full estimate
true_matches <- sum(full_estimate_links & true_links_reduced)

# Precision of the full estimate
true_matches / sum(full_estimate_links)

# Recall of the full estimate
true_matches / (sum(true_links_reduced) + prior_fnm)

# Find the partial Bayes estimate
partial_estimate <- find_bayes_estimate(results$partitions, burn_in = 100,
  L_FNM = 1, L_FM1 = 1, L_FM2 = 2, L_A = 0.1, max_cc_size = 12)

# The partial estimate abstains from making decisions for how many records?
# (abstained records carry the label -1)
sum(partial_estimate == -1)

# For the records which decisions were made for in the partial estimate,
# there are how many clusters? (including records determined not to be
# candidate matches to any other records using reduce_comparison_data)
# The abstain label -1 is dropped first so that it is not counted as a
# cluster of its own.
length(unique(partial_estimate[partial_estimate != -1])) +
  sum(reduced_comparison_list$file_sizes_not_included)

# Abstain rate of partial_estimate (excluding records determined not
# to be candidate matches to any other records using reduce_comparison_data)
sum(partial_estimate == -1) / length(partial_estimate)

# Relabel records where we abstained: each abstained record gets its own
# distinct label (its index offset by length(partial_estimate)), so two
# abstained records no longer share the label -1 and are not treated as linked
# below. NOTE(review): assumes existing cluster labels never exceed
# length(partial_estimate) -- confirm against find_bayes_estimate.
abstained <- which(partial_estimate == -1)
partial_estimate[abstained] <- length(partial_estimate) + abstained

# Find which record pairs are in the same clusters in the partial estimate
partial_estimate_links <-
  partial_estimate[reduced_comparison_list$record_pairs[, 1]] ==
  partial_estimate[reduced_comparison_list$record_pairs[, 2]]

# Find the number of true matches in the partial estimate
true_matches_A <- sum(partial_estimate_links & true_links_reduced)

# Precision of the partial estimate
true_matches_A / sum(partial_estimate_links)

# Relabel the full and partial Bayes estimates
full_estimate_relabel <- relabel_bayes_estimate(reduced_comparison_list,
  full_estimate)

partial_estimate_relabel <- relabel_bayes_estimate(reduced_comparison_list,
  partial_estimate)

# Add columns to the records corresponding to their full and partial
# Bayes estimates
dup_data_small$records <- cbind(dup_data_small$records,
  full_estimate_id = full_estimate_relabel$link_id,
  partial_estimate_id = partial_estimate_relabel$link_id)
# }
Run the code above in your browser using DataLab