# Example: clean the test data set shipped in cleanepi's "extdata" folder.
# Each object below holds the settings for one cleaning operation and is passed
# to the clean_data() argument of the same name. Operations whose object is
# NULL are, per the notes below, not meant to be performed.

# Parameters for column names standardization
standardize_column_names <- list(keep = NULL, rename = NULL)

# Parameters to remove constant columns, empty rows and columns
remove_constants <- list(cutoff = 1)

# Parameters for substituting missing values with NA:
# here the string "-99" is the value treated as missing.
replace_missing_values <- list(target_columns = NULL, na_strings = "-99")

# Parameters for duplicates removal across all columns
remove_duplicates <- list(target_columns = NULL)

# Parameters for dates standardization
standardize_dates <- list(
  target_columns = NULL,
  error_tolerance = 0.4,
  format = NULL,
  timeframe = as.Date(c("1973-05-29", "2023-05-29")),
  # Candidate date orders, grouped by convention.
  # NOTE(review): these look like lubridate-style order strings -- confirm
  # against the clean_data() documentation before editing them.
  orders = list(
    world_named_months = c("Ybd", "dby"),
    world_digit_months = c("dmy", "Ymd"),
    US_formats = c("Omdy", "YOmd")
  )
)

# Parameters for subject IDs standardization
standardize_subject_ids <- list(
  target_columns = "study_id",
  prefix = "PS",
  suffix = "P2",
  range = c(1, 100),
  nchar = 7
)

# Convert the 'sex' column into numeric
to_numeric <- list(target_columns = "sex", lang = "en")

# The dictionary-based cleaning will not be performed here
dictionary <- NULL

# No need to check for the sequence of date events
check_date_sequence <- NULL

# Run the cleaning pipeline. Every argument is given the object of the same
# name defined above, so editing an object is enough to change the call.
cleaned_data <- clean_data(
  data = readRDS(
    system.file("extdata", "test_df.RDS", package = "cleanepi")
  ),
  standardize_column_names = standardize_column_names,
  remove_constants = remove_constants,
  replace_missing_values = replace_missing_values,
  remove_duplicates = remove_duplicates,
  standardize_dates = standardize_dates,
  standardize_subject_ids = standardize_subject_ids,
  to_numeric = to_numeric,
  dictionary = dictionary,
  check_date_sequence = check_date_sequence
)
# Run the code above in your browser using DataLab