cheapr
In cheapr, ‘cheap’ means fast and memory-efficient, and that’s exactly the philosophy that cheapr aims to follow.
Installation
You can install cheapr like so:
install.packages("cheapr")
or you can install the development version of cheapr:
remotes::install_github("NicChr/cheapr")
Some common operations that cheapr can do much faster and more efficiently include:
Counting, finding, removing and replacing
NA
and scalar valuesCreating factors
Creating multiple sequences in a vectorised way
Sub-setting vectors and data frames efficiently
Safe, flexible and fast greatest common divisor and lowest common multiple
Lags/leads
Lightweight
integer64
supportIn-memory Math (no copies, vectors updated by reference)
Summary statistics of data frame variables
Binning of continuous data
Let’s first load the required packages
library(cheapr)
library(bench)
num_na()
is a useful function to efficiently return the number of NA
values and can be used in a variety of problems.
Almost all the NA
handling functions in cheapr can make use of
multiple cores on your machine through openMP.
x <- rep(NA, 10^6)
# 1 core by default
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 num_na(x) 986µs 1.04ms 954. 2.41KB 0
#> 2 sum(is.na(x)) 780µs 1.75ms 575. 3.81MB 45.6
# 4 cores
options(cheapr.cores = 4)
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 num_na(x) 267µs 338.4µs 2881. 0B 0
#> 2 sum(is.na(x)) 782µs 1.75ms 577. 3.81MB 48.3
options(cheapr.cores = 4)
Efficient NA counts by row/col
m <- matrix(x, ncol = 10^3)
# Number of NA values by row
mark(row_na_counts(m),
rowSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 row_na_counts(m) 1.34ms 2.66ms 374. 9.11KB 0
#> 2 rowSums(is.na(m)) 3.48ms 3.69ms 267. 3.82MB 23.9
# Number of NA values by col
mark(col_na_counts(m),
colSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 col_na_counts(m) 721.5µs 894.9µs 1123. 9.1KB 0
#> 2 colSums(is.na(m)) 2.63ms 2.83ms 351. 3.82MB 31.9
is_na
is a multi-threaded alternative to is.na
x <- rnorm(10^6)
x[sample.int(10^6, 10^5)] <- NA
mark(is.na(x), is_na(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(x) 739µs 1.79ms 564. 3.81MB 66.5
#> 2 is_na(x) 425µs 848.45µs 1184. 3.82MB 109.
### posixlt method is much faster
hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600),
tz = "UTC")
hours[sample.int(10^6, 10^5)] <- NA
mark(is.na(hours), is_na(hours))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(hours) 1.22s 1.22s 0.821 61MB 0.821
#> 2 is_na(hours) 5.26ms 7.5ms 111. 13.9MB 13.9
It differs in 2 regards:
- List elements are regarded as
NA
when either that element is anNA
value or it is a list containing onlyNA
values. - For data frames,
is_na
returns a logical vector whereTRUE
defines an empty row of onlyNA
values.
# List example
is.na(list(NA, list(NA, NA), 10))
#> [1] TRUE FALSE FALSE
is_na(list(NA, list(NA, NA), 10))
#> [1] TRUE TRUE FALSE
# Data frame example
df <- data.frame(x = c(1, NA, 3),
y = c(NA, NA, NA))
df
#> x y
#> 1 1 NA
#> 2 NA NA
#> 3 3 NA
is_na(df)
#> [1] FALSE TRUE FALSE
is_na(df)
#> [1] FALSE TRUE FALSE
# The below identity should hold
identical(is_na(df), row_na_counts(df) == ncol(df))
#> [1] TRUE
is_na
and all the NA
handling functions fall back on calling
is.na()
if no suitable method is found. This means that custom objects
like vctrs rcrds and more are supported.
Cheap data frame summaries with overview
Inspired by the excellent skimr package, overview()
is a cheaper
alternative designed for larger data.
df <- data.frame(
x = sample.int(100, 10^7, TRUE),
y = factor_(sample(LETTERS, 10^7, TRUE)),
z = rnorm(10^7)
)
overview(df, hist = TRUE)
#> obs: 10000000
#> cols: 3
#>
#> ----- Numeric -----
#> col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100
#> 1 x integer 0 1 100 50.52 1 26 51 76 100
#> 2 z numeric 0 1 10000000 0 -4.95 -0.67 0 0.67 4.97
#> iqr sd hist
#> 1 50 28.87 ▇▇▇▇▇
#> 2 1.35 1 ▁▂▇▂▁
#>
#> ----- Categorical -----
#> col class n_missing p_complete n_unique n_levels min max
#> 1 y factor 0 1 26 26 A Z
mark(overview(df))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 overview(df) 1.26s 1.26s 0.793 2.09KB 0
Cheaper and consistent subsetting with sset
sset(iris, 1:5)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
sset(iris, 1:5, j = "Species")
#> Species
#> 1 setosa
#> 2 setosa
#> 3 setosa
#> 4 setosa
#> 5 setosa
# sset always returns a data frame when input is a data frame
sset(iris, 1, 1) # data frame
#> Sepal.Length
#> 1 5.1
iris[1, 1] # not a data frame
#> [1] 5.1
x <- sample.int(10^6, 10^4, TRUE)
y <- sample.int(10^6, 10^4, TRUE)
mark(sset(x, x %in_% y), sset(x, x %in% y), x[x %in% y])
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(x, x %in_% y) 82.5µs 122µs 8488. 88.1KB 6.66
#> 2 sset(x, x %in% y) 152.8µs 230µs 4388. 285.3KB 8.80
#> 3 x[x %in% y] 130.5µs 210µs 4824. 324.4KB 14.2
sset
uses an internal range-based subset when i
is an ALTREP integer
sequence of the form m:n.
mark(sset(df, 0:10^5), df[0:10^5, , drop = FALSE])
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, 0:10^5) 132.5µs 518.8µs 1944. 1.53MB 26.8
#> 2 df[0:10^5, , drop = FALSE] 6.1ms 7.28ms 136. 4.83MB 6.57
It also accepts negative indexes
mark(sset(df, -10^4:0),
df[-10^4:0, , drop = FALSE],
check = FALSE) # The only difference is the row names
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, -10^4:0) 47.8ms 49.5ms 15.8 152MB 5.94
#> 2 df[-10^4:0, , drop = FALSE] 748.5ms 748.5ms 1.34 776MB 4.01
The biggest difference between sset
and [
is the way logical vectors
are handled. The two main differences when i
is a logical vector are:
NA
values are ignored, only the locations ofTRUE
values are used.i
must be the same length asx
and is not recycled.
# Examples with NAs
x <- c(1, 5, NA, NA, -5)
x[x > 0]
#> [1] 1 5 NA NA
sset(x, x > 0)
#> [1] 1 5
# Example with length(i) < length(x)
sset(x, TRUE)
#> Error in check_length(i, length(x)): i must have length 5
# This is equivalent
x[TRUE]
#> [1] 1 5 NA NA -5
# to..
sset(x)
#> [1] 1 5 NA NA -5
Vector and data frame lags with lag_()
set.seed(37)
lag_(1:10, 3) # Lag(3)
#> [1] NA NA NA 1 2 3 4 5 6 7
lag_(1:10, -3) # Lead(3)
#> [1] 4 5 6 7 8 9 10 NA NA NA
# Using an example from data.table
library(data.table)
#> Warning: package 'data.table' was built under R version 4.4.1
dt <- data.table(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5])
# Similar to data.table::shift()
lag_(dt, 1) # Lag
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
lag_(dt, -1) # Lead
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2011 0.07883715 2 b
#> 2: 2012 0.64879698 3 c
#> 3: 2013 0.49685336 4 d
#> 4: 2014 0.71878731 5 e
#> 5: NA NA NA <NA>
With lag_
we can update variables by reference, including entire data
frames
# At the moment, shift() cannot do this
lag_(dt, set = TRUE)
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
dt # Was updated by reference
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
lag2_
is a more generalised variant that supports vectors of lags,
custom ordering and run lengths.
lag2_(dt, order = 5:1) # Reverse order lag (same as lead)
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2011 0.07883715 2 b
#> 3: 2012 0.64879698 3 c
#> 4: 2013 0.49685336 4 d
#> 5: NA NA NA <NA>
lag2_(dt, -1) # Same as above
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2011 0.07883715 2 b
#> 3: 2012 0.64879698 3 c
#> 4: 2013 0.49685336 4 d
#> 5: NA NA NA <NA>
lag2_(dt, c(1, -1)) # Alternating lead/lag
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2011 0.07883715 2 b
#> 3: 2010 0.54964085 1 a
#> 4: 2013 0.49685336 4 d
#> 5: 2012 0.64879698 3 c
lag2_(dt, c(-1, 0, 0, 0, 0)) # Lead e.g. only first row
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
Greatest common divisor and smallest common multiple
gcd2(5, 25)
#> [1] 5
scm2(5, 6)
#> [1] 30
gcd(seq(5, 25, by = 5))
#> [1] 5
scm(seq(5, 25, by = 5))
#> [1] 300
x <- seq(1L, 1000000L, 1L)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 1.2µs 1.5µs 626912. 0B 0
x <- seq(0, 10^6, 0.5)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 51.5ms 51.8ms 19.2 0B 0
Creating many sequences
As an example, to create 3 sequences with different increments,
the usual approach might be to use lapply to loop through the increment
values together with seq()
# Base R
increments <- c(1, 0.5, 0.1)
start <- 1
end <- 5
unlist(lapply(increments, \(x) seq(start, end, x)))
#> [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4
#> [20] 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
#> [39] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
In cheapr you can use seq_()
which accepts vector arguments.
seq_(start, end, increments)
#> [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4
#> [20] 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
#> [39] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
Use add_id = TRUE
to label the individual sequences.
seq_(start, end, increments, add_id = TRUE)
#> 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3
#> 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4 1.5
#> 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#> 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5
#> 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#> 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
If you know the sizes of your sequences beforehand, use sequence_()
seq_sizes <- c(3, 5, 10)
sequence_(seq_sizes, from = 0, by = 1/3, add_id = TRUE) |>
enframe_()
#> # A tibble: 18 × 2
#> name value
#> <chr> <dbl>
#> 1 1 0
#> 2 1 0.333
#> 3 1 0.667
#> 4 2 0
#> 5 2 0.333
#> 6 2 0.667
#> 7 2 1
#> 8 2 1.33
#> 9 3 0
#> 10 3 0.333
#> 11 3 0.667
#> 12 3 1
#> 13 3 1.33
#> 14 3 1.67
#> 15 3 2
#> 16 3 2.33
#> 17 3 2.67
#> 18 3 3
You can also calculate the sequence sizes using seq_size()
seq_size(start, end, increments)
#> [1] 5 9 41
‘Cheaper’ Base R alternatives
which
x <- rep(TRUE, 10^6)
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 2.03ms 3.41ms 282. 3.81MB 2.09
#> 2 base_which 629.4µs 2.62ms 397. 7.63MB 9.62
x <- rep(FALSE, 10^6)
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 230µs 290µs 3230. 0B 0
#> 2 base_which 456µs 480µs 2055. 3.81MB 20.1
x <- c(rep(TRUE, 5e05), rep(FALSE, 1e06))
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 1.24ms 2ms 488. 1.91MB 2.08
#> 2 base_which 781µs 1.74ms 588. 7.63MB 12.0
x <- c(rep(FALSE, 5e05), rep(TRUE, 1e06))
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 2.98ms 4.25ms 236. 3.81MB 2.09
#> 2 base_which 938.1µs 3.1ms 330. 9.54MB 9.22
x <- sample(c(TRUE, FALSE), 10^6, TRUE)
x[sample.int(10^6, 10^4)] <- NA
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 1.93ms 2.56ms 374. 1.89MB 2.09
#> 2 base_which 3.19ms 4.2ms 240. 5.7MB 2.07
factor
x <- sample(seq(-10^3, 10^3, 0.01))
y <- do.call(paste0, expand.grid(letters, letters, letters, letters))
mark(cheapr_factor = factor_(x),
base_factor = factor(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 9.19ms 9.74ms 102. 4.59MB 0
#> 2 base_factor 490.32ms 494.23ms 2.02 27.84MB 0
mark(cheapr_factor = factor_(x, order = FALSE),
base_factor = factor(x, levels = unique(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 4.37ms 5.08ms 192. 1.53MB 0
#> 2 base_factor 752.8ms 752.8ms 1.33 22.79MB 0
mark(cheapr_factor = factor_(y),
base_factor = factor(y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 208.06ms 211.32ms 4.60 5.23MB 0
#> 2 base_factor 3.23s 3.23s 0.309 54.35MB 0
mark(cheapr_factor = factor_(y, order = FALSE),
base_factor = factor(y, levels = unique(y)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 4.97ms 8.01ms 119. 3.49MB 0
#> 2 base_factor 53.61ms 59.46ms 16.9 39.89MB 0
intersect & setdiff
x <- sample.int(10^6, 10^5, TRUE)
y <- sample.int(10^6, 10^5, TRUE)
mark(cheapr_intersect = intersect_(x, y, dups = FALSE),
base_intersect = intersect(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_intersect 2.71ms 2.89ms 337. 1.18MB 2.30
#> 2 base_intersect 4.91ms 5.28ms 182. 5.16MB 0
mark(cheapr_setdiff = setdiff_(x, y, dups = FALSE),
base_setdiff = setdiff(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_setdiff 2.97ms 3.15ms 315. 1.76MB 0
#> 2 base_setdiff 4.97ms 5.51ms 179. 5.71MB 2.29
%in_%
and %!in_%
mark(cheapr = x %in_% y,
base = x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.75ms 1.83ms 539. 781.34KB 0
#> 2 base 2.52ms 3.08ms 325. 2.53MB 2.24
mark(cheapr = x %!in_% y,
base = !x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.68ms 1.82ms 543. 787.84KB 0
#> 2 base 2.82ms 3.25ms 306. 2.91MB 2.25
as_discrete
as_discrete
is a cheaper alternative to cut
x <- rnorm(10^7)
b <- seq(0, max(x), 0.2)
mark(cheapr_cut = as_discrete(x, b, left = FALSE),
base_cut = cut(x, b))
#> Warning: Some expressions had a GC in every iteration; so filtering is
#> disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_cut 216ms 219ms 4.54 38.2MB 0
#> 2 base_cut 642ms 642ms 1.56 267.1MB 1.56