# Toy example with the iris data.
# NOTE(review): write_hdd() and hdd() are called unqualified, so this
# presumably assumes the hdd package is already attached -- confirm.

# Setup ----
# First we create an HDD data set on disk to run the example.
hdd_path <- tempfile()
write_hdd(iris, hdd_path, rowsPerChunk = 40)

# The data set now lives on the hard drive, in HDD format.
data_hdd <- hdd(hdd_path)

# Summaries ----
# Summary information on the whole file:
summary(data_hdd)

# You can use the argument 'file' to subselect slices.
# Let's have some descriptive statistics of the first slice of the HDD.
# This extracts the data from the first HDD slice and returns a
# data.table in memory; summary() is then applied to that data.table.
summary(data_hdd[, file = 1])

# Row selection ----
# You can use the special argument .N, as in data.table.
# The following query shows the first and last lines of
# each slice of the HDD data set:
data_hdd[c(1, .N), file = 1:.N]

# Extraction of observations for which the variable
# Petal.Width is lower than 0.2
data_hdd[Petal.Width < 0.2, ]

# Column operations ----
# You can apply data.table syntax:
data_hdd[, .(pl = Petal.Length)]

# and create variables
data_hdd[, pl2 := Petal.Length^2]

# You can use the by clause, but then
# the by is applied slice by slice, NOT on the full data set:
data_hdd[, .(mean_pl = mean(Petal.Length)), by = Species]

# Writing the result to a new HDD file ----
# If the data you extract does not fit into memory,
# you can create a new HDD file with the argument 'newfile':
hdd_path_new <- tempfile()
data_hdd[, pl2 := Petal.Length^2, newfile = hdd_path_new]

# Check the result:
data_hdd_bis <- hdd(hdd_path_new)
summary(data_hdd_bis)
print(data_hdd_bis)

# Cleanup ----
# Remove the temporary HDD data created by this example.
# recursive = TRUE so the call also works if these paths are directories.
unlink(c(hdd_path, hdd_path_new), recursive = TRUE)
# Run the code above in your browser using DataLab