## Not run:
#
# # Method "a"
# #===========
#
# # Load PGR passport database
# GN <- GN1000
#
# # Specify as a vector the database fields to be used
# GNfields <- c("NationalID", "CollNo", "DonorID", "OtherID1", "OtherID2")
#
# # Clean the data
# GN[GNfields] <- lapply(GN[GNfields], function(x) DataClean(x))
# y1 <- list(c("Gujarat", "Dwarf"), c("Castle", "Cary"), c("Small", "Japan"),
# c("Big", "Japan"), c("Mani", "Blanco"), c("Uganda", "Erect"),
# c("Mota", "Company"))
# y2 <- c("Dark", "Light", "Small", "Improved", "Punjab", "SAM")
# y3 <- c("Local", "Bold", "Cary", "Mutant", "Runner", "Giant", "No.",
# "Bunch", "Peanut")
# GN[GNfields] <- lapply(GN[GNfields], function(x) MergeKW(x, y1, delim = c("space", "dash")))
# GN[GNfields] <- lapply(GN[GNfields], function(x) MergePrefix(x, y2, delim = c("space", "dash")))
# GN[GNfields] <- lapply(GN[GNfields], function(x) MergeSuffix(x, y3, delim = c("space", "dash")))
#
# # Generate KWIC index
# GNKWIC <- KWIC(GN, GNfields)
#
# # Specify the exceptions as a vector
# exep <- c("A", "B", "BIG", "BOLD", "BUNCH", "C", "COMPANY", "CULTURE",
# "DARK", "E", "EARLY", "EC", "ERECT", "EXOTIC", "FLESH", "GROUNDNUT",
# "GUTHUKAI", "IMPROVED", "K", "KUTHUKADAL", "KUTHUKAI", "LARGE",
# "LIGHT", "LOCAL", "OF", "OVERO", "P", "PEANUT", "PURPLE", "R",
# "RED", "RUNNER", "S1", "SAM", "SMALL", "SPANISH", "TAN", "TYPE",
# "U", "VALENCIA", "VIRGINIA", "WHITE")
#
# # Specify the synsets as a list
# syn <- list(c("CHANDRA", "AH114"), c("TG1", "VIKRAM"))
#
# # Fetch probable duplicate sets
# GNdup <- ProbDup(kwic1 = GNKWIC, method = "a", excep = exep, fuzzy = TRUE,
# phonetic = TRUE, encoding = "primary",
# semantic = TRUE, syn = syn)
# GNdup
#
# # Methods "b" and "c"
# #====================
#
# # Load PGR passport databases
# GN1 <- GN1000[!grepl("^ICG", GN1000$DonorID), ]
# GN1$DonorID <- NULL
# GN2 <- GN1000[grepl("^ICG", GN1000$DonorID), ]
# GN2 <- GN2[!grepl("S", GN2$DonorID), ]
# GN2$NationalID <- NULL
#
# # Specify as a vector the database fields to be used
# GN1fields <- c("NationalID", "CollNo", "OtherID1", "OtherID2")
# GN2fields <- c("DonorID", "CollNo", "OtherID1", "OtherID2")
#
# # Clean the data
# GN1[GN1fields] <- lapply(GN1[GN1fields], function(x) DataClean(x))
# GN2[GN2fields] <- lapply(GN2[GN2fields], function(x) DataClean(x))
# y1 <- list(c("Gujarat", "Dwarf"), c("Castle", "Cary"), c("Small", "Japan"),
# c("Big", "Japan"), c("Mani", "Blanco"), c("Uganda", "Erect"),
# c("Mota", "Company"))
# y2 <- c("Dark", "Light", "Small", "Improved", "Punjab", "SAM")
# y3 <- c("Local", "Bold", "Cary", "Mutant", "Runner", "Giant", "No.",
# "Bunch", "Peanut")
# GN1[GN1fields] <- lapply(GN1[GN1fields], function(x) MergeKW(x, y1, delim = c("space", "dash")))
# GN1[GN1fields] <- lapply(GN1[GN1fields], function(x) MergePrefix(x, y2, delim = c("space", "dash")))
# GN1[GN1fields] <- lapply(GN1[GN1fields], function(x) MergeSuffix(x, y3, delim = c("space", "dash")))
# GN2[GN2fields] <- lapply(GN2[GN2fields], function(x) MergeKW(x, y1, delim = c("space", "dash")))
# GN2[GN2fields] <- lapply(GN2[GN2fields], function(x) MergePrefix(x, y2, delim = c("space", "dash")))
# GN2[GN2fields] <- lapply(GN2[GN2fields], function(x) MergeSuffix(x, y3, delim = c("space", "dash")))
#
# # Remove duplicated DonorID records in GN2
# GN2 <- GN2[!duplicated(GN2$DonorID), ]
#
# # Generate KWIC index
# GN1KWIC <- KWIC(GN1, GN1fields)
# GN2KWIC <- KWIC(GN2, GN2fields)
#
# # Specify the exceptions as a vector
# exep <- c("A", "B", "BIG", "BOLD", "BUNCH", "C", "COMPANY", "CULTURE",
# "DARK", "E", "EARLY", "EC", "ERECT", "EXOTIC", "FLESH", "GROUNDNUT",
# "GUTHUKAI", "IMPROVED", "K", "KUTHUKADAL", "KUTHUKAI", "LARGE",
# "LIGHT", "LOCAL", "OF", "OVERO", "P", "PEANUT", "PURPLE", "R",
# "RED", "RUNNER", "S1", "SAM", "SMALL", "SPANISH", "TAN", "TYPE",
# "U", "VALENCIA", "VIRGINIA", "WHITE")
#
# # Specify the synsets as a list
# syn <- list(c("CHANDRA", "AH114"), c("TG1", "VIKRAM"))
#
# # Fetch probable duplicate sets
# GNdupb <- ProbDup(kwic1 = GN1KWIC, kwic2 = GN2KWIC, method = "b",
# excep = exep, fuzzy = TRUE, phonetic = TRUE,
# encoding = "primary", semantic = TRUE, syn = syn)
# GNdupb
#
# GNdupc <- ProbDup(kwic1 = GN1KWIC, kwic2 = GN2KWIC, method = "c",
# excep = exep, fuzzy = TRUE, phonetic = TRUE,
# encoding = "primary", semantic = TRUE, syn = syn)
# GNdupc
#
# ## End(Not run)
# Run the code above in your browser using DataLab