# ---------------------------------------------------------------------------
# True joins (inner/left/right/full): basic usage
# ---------------------------------------------------------------------------
# data frames
x <- data.table::fread(data.table = FALSE, input = "
country pop_m
Australia 27.2
Brazil 212.0
Chad 3.0
")
y <- data.table::fread(data.table = FALSE, input = "
country forest_pc
Brazil 59.1
Chad 3.2
Denmark 15.8
")
# ---------------------------------------------------------------------------
# `indicate = TRUE` adds a front column ".join" indicating whether a row is
# from `x` only (1L), from `y` only (2L), or joined from both (3L)
fjoin_full(x, y, on = "country", indicate = TRUE)
fjoin_left(x, y, on = "country", indicate = TRUE)
fjoin_right(x, y, on = "country", indicate = TRUE)
fjoin_inner(x, y, on = "country", indicate = TRUE)
# ---------------------------------------------------------------------------
# Core options and arguments (in a 1:1 equality join with fjoin_full())
# ---------------------------------------------------------------------------
# data frames
dfQ <- data.table::fread(data.table = FALSE, quote ="'", input = "
id quantity notes other_cols
2 5 '' ...
1 6 '' ...
3 7 '' ...
NA 8 'oranges (not listed)' ...
")
dfP <- data.table::fread(data.table = FALSE, input = "
id item price other_cols
NA apples 10 ...
3 bananas 20 ...
2 cherries 30 ...
1 dates 40 ...
")
# ---------------------------------------------------------------------------
# (1) basic syntax
# cf. dplyr: full_join(dfQ, dfP, join_by(id), na.matches = "never")
fjoin_full(dfQ, dfP, on = "id")
# (2) join-select in one line
fjoin_full(dfQ, dfP, on = "id", select = c("item", "price", "quantity"))
# equivalent operation in dplyr
# x <- dfQ |> select(id, quantity)
# y <- dfP |> select(id, item, price)
# full_join(x, y, join_by(id), na.matches = "never") |>
# select(id, item, price, quantity)
# ---------------------------------------------------------------------------
# (an aside) equality matches on NA if you insist
fjoin_full(dfQ, dfP, on = "id", select = c("item", "price", "quantity", "notes"), match.na = TRUE)
# (3) indicator column (in Stata since 1984)
fjoin_full(
dfQ,
dfP,
on = "id",
select = c("item", "price", "quantity"),
indicate = TRUE
)
# (4) order rows by y then x
fjoin_full(
dfQ,
dfP,
on = "id",
select = c("item", "price", "quantity"),
indicate = TRUE,
order = "right"
)
# (5) display code instead
fjoin_full(
dfQ,
dfP,
on = "id",
select = c("item", "price", "quantity"),
indicate = TRUE,
order = "right",
do = FALSE
)
# ---------------------------------------------------------------------------
# M:M inequality join reduced to 1:1 using `mult.x` and `mult.y`
# ---------------------------------------------------------------------------
# data.table (`mult`) and dplyr (`multiple`) have options for reducing the
# cardinality on one side of the join from many ("all") to one ("first" or
# "last"). fjoin (`mult.x`, `mult.y`) permits this on either side of the
# join, or on both sides at once.
# This example (using `fjoin_left()`) shows an application to temporally
# ordered data frames of "events" and "reactions".
# data frames
events <- data.table::fread(data.table = FALSE, input = "
event_id event_ts
1 10
2 20
3 40
")
reactions <- data.table::fread(data.table = FALSE, input = "
reaction_id reaction_ts
1 30
2 50
3 60
")
# ---------------------------------------------------------------------------
# (1) for each event, all subsequent reactions (M:M)
fjoin_left(
events,
reactions,
on = c("event_ts < reaction_ts"),
)
# (2) for each event, the next reaction (1:M)
fjoin_left(
events,
reactions,
on = c("event_ts < reaction_ts"),
mult.x = "first"
)
# (3) for each event, the next reaction, provided there was no intervening event (1:1)
fjoin_left(
events,
reactions,
on = c("event_ts < reaction_ts"),
mult.x = "first",
mult.y = "last"
)
# ---------------------------------------------------------------------------
# Natural join
# ---------------------------------------------------------------------------
fjoin_inner(x, y, on = NA) # note `NA` not `NULL`/omitted
try(fjoin_left(x, y)) # to prevent accidental natural joins
# ---------------------------------------------------------------------------
# Mock join (code "ghostwriter" for data.table users)
# ---------------------------------------------------------------------------
fjoin_inner(on = c("id"))
Run the code above in your browser using DataLab