# Example with small duplicate dataset
data(dup_data_small)
# Create the comparison data
comparison_list <- create_comparison_data(dup_data_small$records,
types = c("bi", "lv", "lv", "lv", "lv", "bi", "bi"),
breaks = list(NA, c(0, 0.25, 0.5), c(0, 0.25, 0.5),
c(0, 0.25, 0.5), c(0, 0.25, 0.5), NA, NA),
file_sizes = dup_data_small$file_sizes,
duplicates = c(1, 1, 1))
# Reduce the comparison data
# The following line corresponds to only keeping pairs of records for which
# neither gname nor fname disagree at the highest level
pairs_to_keep <- (comparison_list$comparisons[, "gname_DL_3"] != TRUE) &
(comparison_list$comparisons[, "fname_DL_3"] != TRUE)
reduced_comparison_list <- reduce_comparison_data(comparison_list,
pairs_to_keep, cc = 1)
# Specify the prior
prior_list <- specify_prior(reduced_comparison_list, mus = NA, nus = NA,
flat = 0, alphas = rep(1, 7), dup_upper_bound = c(10, 10, 10),
dup_count_prior_family = c("Poisson", "Poisson", "Poisson"),
dup_count_prior_pars = list(c(1), c(1), c(1)), n_prior_family = "uniform",
n_prior_pars = NA)
# Run the Gibbs sampler
{
results <- gibbs_sampler(reduced_comparison_list, prior_list, n_iter = 1000,
seed = 42)
# Find the full Bayes estimate
full_estimate <- find_bayes_estimate(results$partitions, burn_in = 100,
L_FNM = 1, L_FM1 = 1, L_FM2 = 2, L_A = Inf, max_cc_size = 50)
# Find the partial Bayes estimate
partial_estimate <- find_bayes_estimate(results$partitions, burn_in = 100,
L_FNM = 1, L_FM1 = 1, L_FM2 = 2, L_A = 0.1, max_cc_size = 12)
# Relabel the full and partial Bayes estimates
full_estimate_relabel <- relabel_bayes_estimate(reduced_comparison_list,
full_estimate)
partial_estimate_relabel <- relabel_bayes_estimate(reduced_comparison_list,
partial_estimate)
# Add columns to the records corresponding to their full and partial
# Bayes estimates
dup_data_small$records <- cbind(dup_data_small$records,
full_estimate_id = full_estimate_relabel$link_id,
partial_estimate_id = partial_estimate_relabel$link_id)
}
Run the code above in your browser using DataLab