cmap: Apply the same function to all chunks

Description

Apply the same function to all chunks

`cimap.disk.frame` accepts a two-argument function, where the first argument is a data.frame (the chunk) and the second is the chunk ID

`lazy` is a convenience function to apply `.f` to every chunk

`delayed` is an alias for lazy and is consistent with the naming in Dask and Dagger.jl

Usage

cmap(.x, .f, ...)
# S3 method for disk.frame
cmap(
  .x,
  .f,
  ...,
  outdir = NULL,
  keep = NULL,
  chunks = nchunks(.x),
  compress = 50,
  lazy = TRUE,
  overwrite = FALSE,
  vars_and_pkgs = future::getGlobalsAndPackages(.f, envir = parent.frame()),
  .progress = TRUE
)
cmap_dfr(.x, .f, ..., .id = NULL)
# S3 method for disk.frame
cmap_dfr(.x, .f, ..., .id = NULL, use.names = fill, fill = FALSE, idcol = NULL)
cimap(.x, .f, ...)
# S3 method for disk.frame
cimap(
  .x,
  .f,
  outdir = NULL,
  keep = NULL,
  chunks = nchunks(.x),
  compress = 50,
  lazy = TRUE,
  overwrite = FALSE,
  ...
)
cimap_dfr(.x, .f, ..., .id = NULL)
# S3 method for disk.frame
cimap_dfr(
  .x,
  .f,
  ...,
  .id = NULL,
  use.names = fill,
  fill = FALSE,
  idcol = NULL
)
lazy(.x, .f, ...)
# S3 method for disk.frame
lazy(.x, .f, ...)
delayed(.x, .f, ...)
chunk_lapply(...)
map(.x, .f, ...)
# S3 method for disk.frame
map(...)
# S3 method for default
map(.x, .f, ...)
imap_dfr(.x, .f, ..., .id = NULL)
# S3 method for disk.frame
imap_dfr(...)
# S3 method for default
imap_dfr(.x, .f, ..., .id = NULL)
imap(.x, .f, ...)
# S3 method for default
imap(.x, .f, ...)
# S3 method for disk.frame
map_dfr(...)
# S3 method for default
map_dfr(.x, .f, ..., .id = NULL)

Arguments

.x

a disk.frame

.f

a function to apply to each of the chunks

...

for compatibility with `purrr::map`

outdir

the output directory

keep

the columns to keep from the input

chunks

The number of chunks to output

compress

the fst compression level, from 0 to 100

lazy

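if TRUE, apply `.f` lazily, so that the result is only computed when it is collected (e.g. with `collect`); if FALSE, apply `.f` eagerly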

overwrite

if TRUE removes any existing chunks in the data

vars_and_pkgs

variables and packages to send to a background session. This is typically automatically detected

.progress

A logical, for whether or not to print a progress bar for multiprocess, multisession, and multicore plans. From furrr

.id

not used

use.names

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

fill

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

idcol

for cmap_dfr's call to data.table::rbindlist. See data.table::rbindlist

Examples

Run this code

# NOT RUN {
# Example usage of cmap/cimap and friends on a small disk.frame built from
# the `cars` data set.
cars.df = as.disk.frame(cars)

# return the first row of each chunk lazily
# (note the row index `chunk[1, ]`; `chunk[, 1]` would select the first
# column instead)
cars2 = cmap(cars.df, function(chunk) {
  chunk[1, ]
})

# collect() materialises the lazily mapped result
collect(cars2)

# same as above but using a purrr-style formula
cars2 = cmap(cars.df, ~.x[1,])

collect(cars2)

# return the first row of each chunk eagerly as a list
cmap(cars.df, ~.x[1,], lazy = FALSE)

# return the first row of each chunk eagerly as data.table/data.frame by row-binding
cmap_dfr(cars.df, ~.x[1,])

# lazy and delayed are just aliases for cmap(..., lazy = TRUE)
collect(lazy(cars.df, ~.x[1,]))
collect(delayed(cars.df, ~.x[1,]))

# clean up cars.df
delete(cars.df)
cars.df = as.disk.frame(cars)

# in cimap, .x is the chunk and .y is the chunk ID as an integer

# lazy = TRUE support is not available for cimap at the moment
cimap(cars.df, ~.x[, id := .y], lazy = FALSE)

cimap_dfr(cars.df, ~.x[, id := .y])

# clean up cars.df
delete(cars.df)
# }

Run the code above in your browser using DataLab