suppressMessages(library(dplyr))
# Example codebook file: the DDI (.xml) for a CPS extract, looked up by name
# through `ipums_example()`. It is reused by the chunked reads below.
# NOTE(review): presumably `ipums_example()` returns the path to a file
# shipped with the package -- confirm.
cps_rect_ddi_file <- ipums_example("cps_00157.xml")
# Function to extract Minnesota cases (STATEFIP code 27) from the CPS example.
# (This can also be accomplished by including case selections
# in an extract definition.)
#
# Callback functions must take `x` and `pos`, referring to the current chunk
# of data and its row position, respectively. `pos` is unused here but is
# part of the required signature.
#
# Returns the rows of `x` whose STATEFIP equals 27. `which()` drops rows where
# STATEFIP is missing; a bare logical index would instead return an all-NA
# row for each missing value.
filter_mn <- function(x, pos) {
  x[which(x$STATEFIP == 27), ]
}
# Wrap the filter function in a data frame callback object
filter_mn_callback <- IpumsDataFrameCallback$new(filter_mn)

# Read the extract in chunks of 1000, applying the MN filter to each chunk
read_ipums_micro_chunked(
  cps_rect_ddi_file,
  chunk_size = 1000,
  verbose = FALSE,
  callback = filter_mn_callback
)
# Tabulate INCTOT average by state without storing full dataset in memory.
#
# Each chunk is reduced to per-state income sums and row counts; those
# per-chunk summaries are then pooled into a single average per state.
read_ipums_micro_chunked(
  cps_rect_ddi_file,
  callback = IpumsDataFrameCallback$new(
    function(x, pos) {
      x %>%
        mutate(
          # Recode INCTOT values whose label matches the pattern to NA.
          # The dots are escaped so that "N.I.U." is matched literally
          # instead of as "N", any character, "I", any character, ...
          INCTOT = lbl_na_if(
            INCTOT,
            ~ grepl("Missing|N\\.I\\.U\\.", .lbl)
          )
        ) %>%
        filter(!is.na(INCTOT)) %>%
        group_by(STATEFIP = as_factor(STATEFIP)) %>%
        summarize(INCTOT_SUM = sum(INCTOT), n = n(), .groups = "drop")
    }
  ),
  chunk_size = 1000,
  verbose = FALSE
) %>%
  # Pool the per-chunk sums and counts before dividing
  group_by(STATEFIP) %>%
  summarize(avg_inc = sum(INCTOT_SUM) / sum(n))
# With `read_ipums_micro_list_chunked()`, the chunk `x` passed to the
# callback is a list; here its `PERSON` and `HOUSEHOLD` elements are counted.
read_ipums_micro_list_chunked(
  ipums_example("cps_00159.xml"),
  callback = IpumsSideEffectCallback$new(function(x, pos) {
    n_persons <- nrow(x$PERSON)
    n_households <- nrow(x$HOUSEHOLD)
    print(
      paste0(
        n_persons, " persons and ",
        n_households, " households in this chunk."
      )
    )
  }),
  verbose = FALSE,
  chunk_size = 1000
)
# Using the biglm package, you can even run a regression without storing
# the full dataset in memory.
#
# biglm is an optional dependency, so the example only runs when it is
# installed; `quietly = TRUE` keeps the availability check from printing
# package loading messages.
if (requireNamespace("biglm", quietly = TRUE)) {
  lm_results <- read_ipums_micro_chunked(
    ipums_example("cps_00160.xml"),
    callback = IpumsBiglmCallback$new(
      INCTOT ~ AGE + HEALTH, # Model formula
      # Function applied to each chunk (takes `x` and `pos`)
      function(x, pos) {
        x %>%
          mutate(
            # Recode INCTOT values whose label matches the pattern to NA.
            # The dots are escaped so that "N.I.U." is matched literally.
            INCTOT = lbl_na_if(
              INCTOT,
              ~ grepl("Missing|N\\.I\\.U\\.", .lbl)
            ),
            HEALTH = as_factor(HEALTH)
          )
      }
    ),
    chunk_size = 1000,
    verbose = FALSE
  )
  summary(lm_results)
}
# Run the code above in your browser using DataLab