Usage

fileply(file, groupby, fun = identity, collect = "none",
temploc = getwd(), nbins = 10, chunk = 50000, spill = 1e+06,
cores = 1, buffer = 1e+09, keepddf = FALSE, ...)
Arguments

fun
function to apply to each subset; when groupby is missing, fun is applied on each chunk.

collect
whether to collect the result back into memory as a "list" or a "dataframe", or not at all ("none"). The default, "none", keeps the resulting ddo on disk.

...
arguments passed as-is to the data.table reading function (for example, sep and header in the examples below).

Details

Columns named in groupby are used to split the data and load only the subset (possibly many if multiple cores are in action) into memory. If groupby is missing, chunkwise processing is performed on each subset of the distributed dataframe. A user-defined fun is applied and the results are written to a distributed object (a list or KV pairs) on disk. The result may be collected into memory as a list or a dataframe depending on the collect argument. The default is set to "none", which does not collect the data back into memory. gc is called periodically to limit memory usage. Using an appropriate number of cores keeps memory utilization in check. Setting a smaller buffer value keeps memory usage low (see localDiskControl), but makes the execution slower.
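As a rough sketch of these memory knobs (the cores and buffer values below are illustrative assumptions, not recommendations from the package), the split-apply-combine call from the examples can be tuned like this:

write.table(mtcars, "mtcars.csv", row.names = FALSE, sep = ",")
temp <- fileply(file = "mtcars.csv"
, groupby = c("carb", "gear")
, fun = identity
, collect = "list"
, cores = 2       # process several subsets in parallel
, buffer = 1e+08  # smaller buffer: lower memory use, slower execution
, sep = ","
, header = TRUE
)
unlink("mtcars.csv")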
Examples

# split-apply-combine
write.table(mtcars, "mtcars.csv", row.names = FALSE, sep = ",")
temp <- fileply(file = "mtcars.csv"
, groupby = c("carb", "gear")
, fun = identity
, collect = "list"
, sep = ","
, header = TRUE
)
temp
unlink("mtcars.csv")
# chunkwise processing
write.table(mtcars, "mtcars.csv", row.names = FALSE, sep = ",")
temp <- fileply(file = "mtcars.csv"
, chunk = 10
, fun = function(x){list(nrow(x))}
, collect = "dataframe"
, sep = ","
, header = TRUE
)
temp
unlink("mtcars.csv")
# example for collect='none'
write.table(mtcars, "mtcars.csv", row.names = FALSE, sep = ",")
# capture fileply's messages to recover the on-disk output directory
outdir <- utils::capture.output(temp <- fileply(file = "mtcars.csv"
, groupby = c("carb", "gear")
, fun = identity
, sep = ","
, header = TRUE
)
, file = NULL
, type = "message"
)
# the fifth message line reports the output directory; strip the prefix
outdir <- gsub("Output Directory: ", "", outdir[5])
diskKV <- datadr::ddo(datadr::localDiskConn(outdir))
diskKV
diskKV[[1]]
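# a small sketch beyond the original example: datadr::getKeys (from the
# datadr package) lists the keys of the on-disk object before cleanup
datadr::getKeys(diskKV)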
unlink(outdir, recursive = TRUE)
unlink("mtcars.csv")