function.read_metadata: Change the Data Type and Reference Category

Description

When an user uploads a file in 'EQUAL-STATS' program, the program classify the variable types based on the nature of the data uploaded using the function function.read_data. However, the levels in the categorical data are determined by alphabetical order, which may not be appropriate in many situations, particularly when needs to compare a new treatment to the standard treatment (reference category) and when the event and no event have to be defined correctly. This can be addressed by uploading metadata.

Usage

function.read_metadata(rv, metadata_file_path)

Value

outcome: Whether the data types and the order of the categories (levels/factors) were modified successfully.
message: The message displayed to the user after the processing. This message also contains the reason for failure if the modification was unsuccessful.
data: Imported data (with the structure modified as per the metadata)
any_type: All variables in the data
quantitative: Quantitative variables in the data
counts: Count variables in the data
categorical: Categorical variables in the data
nominal: Categorical variables without any order in the data
binary: Categorical variables with only two possible values (factors/levels) in the data
ordinal: Ordered categorical variables
date: Any variables that were declared as date and could be convered to date
date: Any variables that were declared as time and could be convered to time

Arguments

rv: A list supplied by 'EQUAL-STATS' application based on user input.
metadata_file_path: The path to the metadata file.

Author

Kurinchi Gurusamy

References

https://sites.google.com/view/equal-group/home

Examples

Run this code

# Create simulated data ####
data <- cbind.data.frame(
  `Subject ID` = c(
    "S0001", "S0002", "S0003", "S0004", "S0005",
    "S0006", "S0007", "S0008", "S0009", "S0010",
    "S0011", "S0012", "S0013", "S0014", "S0015",
    "S0016", "S0017", "S0018", "S0019", "S0020",
    "S0021", "S0022", "S0023", "S0024", "S0025",
    "S0026", "S0027", "S0028", "S0029", "S0030"),
  `Centre` = c(
    "C_0001", "C_0002", "C_0002", "C_0002", "C_0002",
    "C_0001", "C_0001", "C_0003", "C_0001", "C_0003",
    "C_0001", "C_0002", "C_0002", "C_0001", "C_0003",
    "C_0002", "C_0002", "C_0003", "C_0001", "C_0002",
    "C_0002", "C_0002", "C_0002", "C_0003", "C_0002",
    "C_0001", "C_0003", "C_0001", "C_0001", "C_0001"),
  `Treatment` = c(
    "Intensive rehabilitation", "Intensive rehabilitation", "Standard rehabilitation",
    "Intensive rehabilitation", "Intensive rehabilitation", "Intensive rehabilitation",
    "Intensive rehabilitation", "Intensive rehabilitation", "Intensive rehabilitation",
    "Standard rehabilitation", "Intensive rehabilitation", "Standard rehabilitation",
    "Standard rehabilitation", "Intensive rehabilitation", "Intensive rehabilitation",
    "Intensive rehabilitation", "Standard rehabilitation", "Standard rehabilitation",
    "Intensive rehabilitation", "Standard rehabilitation", "Intensive rehabilitation",
    "Intensive rehabilitation", "Standard rehabilitation", "Intensive rehabilitation",
    "Intensive rehabilitation", "Standard rehabilitation", "Standard rehabilitation",
    "Intensive rehabilitation", "Standard rehabilitation", "Intensive rehabilitation"),
  `Obesity status` = c(
    "Obese", "Non-obese", "Obese", "Non-obese", "Non-obese",
    "Obese", "Obese", "Obese", "Non-obese", "Obese",
    "Non-obese", "Non-obese", "Obese", "Non-obese", "Obese",
    "Obese", "Non-obese", "Obese", "Obese", "Obese",
    "Non-obese", "Non-obese", "Non-obese", "Obese", "Obese",
    "Non-obese", "Obese", "Obese", "Obese", "Obese"),
  `Unable to walk independently at 6 weeks` = c(
    "unable", "able", "able", "unable", "able",
    "able", "unable", "unable", "unable", "unable",
    "able", "unable", "able", "unable", "unable",
    "able", "unable", "unable", "unable", "unable",
    "able", "able", "able", "able", "unable",
    "able", "able", "unable", "able", "unable"),
  `Mobility score at 6 months` = c(
    86, 65.1, 48, 99.8, 73.4, 70, 74.7, 36.5, 64.6, 85.4,
    41.7, 60.1, 73.3, 42.4, 55.3, 47.3, 85.9, 63, 64.6, 101.8,
    108.1, 72.3, 96.4, 87.5, 66.2, 92.9, 47.7, 55.8, 56.4, 133.8),
  `Pain at 6 weeks` = c(
    "3_severe", "1_mild", "1_mild", "2_moderate", "1_mild",
    "1_mild", "2_moderate", "2_moderate", "1_mild", "3_severe",
    "1_mild", "2_moderate", "1_mild", "3_severe", "3_severe",
    "1_mild", "2_moderate", "3_severe", "2_moderate", "2_moderate",
    "1_mild", "1_mild", "1_mild", "1_mild", "2_moderate",
    "1_mild", "1_mild", "2_moderate", "1_mild", "2_moderate"),
  `Number of falls within 6 months` = c(
    3, 2, 3, 2, 2, 1, 4, 2, 2, 5,
    3, 2, 2, 2, 5, 3, 2, 2, 3, 4,
    3, 1, 2, 2, 2, 7, 2, 1, 1, 8),
  `Mobility score at 12 months` = c(
    90, 69.1, 52, 103.8, 77.4, 74, 78.7, 40.5, 68.6, 89.4,
    45.7, 64.1, 77.3, 46.4, 59.3, 51.3, 89.9, 67, 68.6, 105.8,
    112.1, 76.3, 100.4, 91.5, 70.2, 96.9, 51.7, 59.8, 60.4, 137.8)
)
# Create a meta-data file
# The first row indicates data type and the second row indicates
# the reference category (for all types of categorical variables)
metadata_first_two_rows <- cbind.data.frame(
c("nominal","S0001"), c("nominal", "C_0001"), c("binary", "Standard rehabilitation"),
c("binary", "Non-obese"), c("binary","able"), c("quantitative", NA), c("ordinal", "1_mild"),
c("counts", NA), c("quantitative", NA))
colnames(metadata_first_two_rows) <- colnames(data)
# The subsequent rows indicate the levels in the user-defined order
# for all categorical variables
# For this simulation, we will change the reference category
# but retain the remaining default order of variable levels
# First find the maximum number of levels
maximum_number_of_levels <- max(as.numeric(sapply(1:ncol(data), function(y) {
  if ((metadata_first_two_rows[1,y] == "quantitative") |
  (metadata_first_two_rows[1,y] == "counts")) {
    NA
  } else {
    nlevels(factor(data[,y]))
  }
})), na.rm = TRUE)
metadata_subsequent_rows <- lapply(1:ncol(data), function(y) {
  if ((metadata_first_two_rows[1,y] == "quantitative") |
  (metadata_first_two_rows[1,y] == "counts")) {
    output <- rep(NA,(maximum_number_of_levels-1))
  } else {
    categories <- sort(unique(data[,y]))
    categories <- categories[categories != metadata_first_two_rows[2,y]]
    output <- c(categories, rep(NA,(maximum_number_of_levels-1-length(categories))))
  }
}
)
metadata_subsequent_rows <- do.call(cbind.data.frame, metadata_subsequent_rows)
colnames(metadata_subsequent_rows) <- colnames(data)
metadata <- rbind.data.frame(metadata_first_two_rows, metadata_subsequent_rows)
# Simulate lists provided by EQUAL-STATS ####
entry <- list()
entry <- lapply(1:15, function(x) entry[[x]] <- '')
rv <- list(
  StorageFolder = tempdir(),
  first_menu_choice = "Summary_Measures",
  second_menu_choice = NA,
  entry = entry,
  import_data = NULL,
  same_row_different_row = NA,
  submit_button_to_appear = FALSE,
  summary_measures_choices = c("EQUAL-STATS choice", "Total observations",
  "Missing observations", "Available observations"),
  analysis_outcome = list(),
  code = list(),
  plan = list(),
  results = list(),
  plots_list = list(),
  reports = list()
)
# Store the data and metadata in a folder
data_file_path = paste0(tempdir(), "/data.csv")
write.csv(data, file = data_file_path, row.names = FALSE, na = "")
metadata_file_path = paste0(tempdir(), "/metadata.csv")
write.csv(metadata, file = metadata_file_path, row.names = FALSE, na = "")
# Load the necessary packages and functions
library(stringr)
# Read the data
rv$import_data <- function.read_data(data_file_path)
# Final function ####
meta_data_implemented_data_types <- function.read_metadata(rv, metadata_file_path)

Run the code above in your browser using DataLab