l = list(
data.table(id1=c(1:4, 2:5), v1=1:8),
data.table(id1=2:3, v2=1:2),
data.table(id1=3:5, v3=1:3)
)
mergelist(l, on="id1")
## using keys
l = list(
data.table(id1=c(1:4, 2:5), v1=1:8),
data.table(id1=3:5, id2=1:3, v2=1:3, key="id1"),
data.table(id2=1:4, v3=4:1, key="id2")
)
mergelist(l)
## select columns
l = list(
data.table(id1=c(1:4, 2:5), v1=1:8, v2=8:1),
data.table(id1=3:5, v3=1:3, v4=3:1, v5=1L, key="id1")
)
mergelist(l, cols=list(NULL, c("v3", "v5")))
## different arguments for each merge pair
l = list(
data.table(id1=1:4, id2=4:1),
data.table(id1=c(1:3, 1:2), v2=c(1L, 1L, 1:2, 2L)),
data.table(id2=4:5)
)
mergelist(l,
on = list("id1", "id2"), ## first merge on id1, second on id2
how = list("inner", "anti"), ## first inner join, second anti join
mult = list("last", NULL)) ## use default 'mult' in second join
## detecting duplicates matches
l = list(
data.table(id1=c(1:4, 2:5), v1=1:8), ## dups in LHS are fine
data.table(id1=c(2:3, 2L), v2=1:3), ## dups in RHS
data.table(id1=3:5, v3=1:3)
)
lapply(l[-1L], `[`, j = if (.N>1L) .SD, by = "id1") ## duplicated rows
try(mergelist(l, on="id1"))
# \donttest{
## 'star schema' and 'snowflake schema' examples (realistic data sizes)
### populate fact: US population by state and date
gt = state.x77[, "Population"]
gt = data.table(state_id=seq_along(state.name), p=gt[state.name] / sum(gt), k=1L)
tt = as.IDate(paste0(as.integer(time(uspop)), "-01-01"))
tt = as.data.table(stats::approx(tt, c(uspop), tt[1L]:tt[length(tt)]))
tt = tt[, .(date=as.IDate(x), date_id=seq_along(x), pop=y, k=1L)]
fact = tt[gt, on="k", allow.cartesian=TRUE,
.(state_id=i.state_id, date_id=x.date_id, population=x.pop * i.p)]
setkeyv(fact, c("state_id", "date_id"))
### populate dimensions: time and geography
time = data.table(key="date_id",
date_id= seq_along(tt$date), date=tt$date,
month_id=month(tt$date), month=month.name[month(tt$date)],
year_id=year(tt$date)-1789L, year=as.character(year(tt$date)),
week_id=week(tt$date), week=as.character(week(tt$date)),
weekday_id=wday(tt$date)-1L, weekday=weekdays(tt$date)
)
time[weekday_id == 0L, weekday_id := 7L][]
geog = data.table(key="state_id",
state_id=seq_along(state.name), state_abb=state.abb, state_name=state.name,
division_id=as.integer(state.division),
division_name=as.character(state.division),
region_id=as.integer(state.region),
region_name=as.character(state.region)
)
rm(gt, tt)
### denormalize 'star schema'
l = list(fact, time, geog)
str(l)
mergelist(l)
rm(l)
### turn 'star schema' into 'snowflake schema'
make.lvl = function(x, cols) {
stopifnot(is.data.table(x))
lvl = x[, unique(.SD), .SDcols=cols]
setkeyv(lvl, cols[1L])
setindexv(lvl, as.list(cols))
lvl
}
time = list(
date = make.lvl(
time, c("date_id", "date", "year_id", "month_id", "week_id", "weekday_id")),
weekday = make.lvl(time, c("weekday_id", "weekday")),
week = make.lvl(time, c("week_id", "week")),
month = make.lvl(time, c("month_id", "month")),
year = make.lvl(time, c("year_id", "year"))
)
geog = list(
state = make.lvl(geog, c("state_id", "state_abb", "state_name", "division_id")),
division = make.lvl(geog, c("division_id", "division_name", "region_id")),
region = make.lvl(geog, c("region_id", "region_name"))
)
### denormalize 'snowflake schema'
#### left join all
l = c(list(fact=fact), time, geog)
str(l)
mergelist(l)
rm(l)
#### merge hierarchies alone, reduce sizes in merges of geog dimension
ans = mergelist(list(
fact,
mergelist(time),
mergelist(rev(geog), how="right")
))
rm(ans)
#### same but no unnecessary copies
ans = mergelist(list(
fact,
setmergelist(time),
setmergelist(rev(geog), how="right")
))
# }
Run the code above in your browser using DataLab