# Flag and summarise duplicate occurrence records in the example
# BeeBDC::beesFlagged dataset, matching on collection information.
# NOTE(review): in a real pipeline, path should point at
# paste0(DataPath, "/Output/Report/") rather than a temporary directory.
beesFlagged_out <- dupeSummary(
  data = BeeBDC::beesFlagged,
  path = paste0(tempdir(), "/"),
  # How duplicates are matched: "ID", "collectionInfo", or "both".
  duplicatedBy = "collectionInfo",
  # Columns from which per-record completeness information is generated.
  completeness_cols = c("decimalLatitude", "decimalLongitude",
                        "scientificName", "eventDate"),
  # idColumns = c("gbifID", "occurrenceID", "recordId", "id"),
  # Core columns compared when finding duplicates by collection info.
  collectionCols = c("decimalLatitude", "decimalLongitude",
                     "scientificName", "eventDate", "recordedBy"),
  # Each of these is ADDITIONALLY combined, one-by-one, with collectionCols.
  collectInfoColumns = c("catalogNumber", "otherCatalogNumbers"),
  # Custom comparisons, each given as a vector of columns to compare.
  # RAW custom comparisons skip the character/number thresholds below.
  CustomComparisonsRAW = dplyr::lst(
    c("catalogNumber", "institutionCode", "scientificName")
  ),
  # Non-RAW custom comparisons do apply the character/number thresholds.
  CustomComparisons = dplyr::lst(
    c("gbifID", "scientificName"),
    c("occurrenceID", "scientificName"),
    c("recordId", "scientificName"),
    c("id", "scientificName")
  ),
  # Preference order of data sources when choosing which duplicate to KEEP;
  # try unique(check_time$dataSource) to see the available levels.
  sourceOrder = c(
    "CAES", "Gai", "Ecd", "BMont", "BMin", "EPEL", "ASP", "KP", "EcoS",
    "EaCO", "FSCA", "Bal", "SMC", "Lic", "Arm",
    "USGS", "ALA", "GBIF", "SCAN", "iDigBio"
  ),
  # !!!!!! BELS > GeoLocate
  # ID-complexity thresholds:
  # minimum letter count when paired WITH numberThreshold digits
  characterThreshold = 2,
  # minimum digit count when paired WITH characterThreshold letters
  numberThreshold = 3,
  # minimum digit count for IDs WITHOUT any letters
  numberOnlyThreshold = 5
)
# Run the code above in your browser using DataLab