DF = data.frame(a=1:5, b=6:10)
DT = data.table(a=1:5, b=6:10)
DT[2] # select * from DT where row number = 2
DT[2:3,sum(b)] # select sum(b) from DT where row number in (2,3)
DT[2:5,plot(a)] # used for j's side effect only i.e. displaying the plot
DT[c(FALSE,TRUE)] # extract all even numbered rows via standard R recycling
flush.console()
tt = subset(DF,a==3)
ss = DT[a==3]
identical(as.data.table(tt), ss)
tt = subset(DF,a==3,b)[[1]]+1
ss = DT[a==3,b+1]
identical(tt, ss)
tt = with(subset(DF,a==3),a+b+1)
ss = DT[a==3,a+b+1]
identical(tt, ss)
Lkp=1:3
tt = DF[with(DF,a %in% Lkp),]
ss = DT[a %in% Lkp]
identical(as.data.table(tt), ss)
# Examples above all use vector scans.
# Examples below all use binary search.
DT = data.table(a=letters[1:5], b=6:10)
setkey(DT,a)
identical(DT["d"],DT[4])
identical(DT[J("d")], DT[4])
identical(DT[c("c","d")], DT[J(c("c","d"))])
DT = data.table(id=rep(c("A","B"),each=3), date=c(20080501L,20080502L,20080506L), v=1:6)
setkey(DT,id,date)
DT
DT["A"] # all 3 rows for A
DT[J("A",20080502L)] # date matches exactly
DT[J("A",20080505L)] # NA since 5 May missing (outer join)
DT[J("A",20080505L),nomatch=0] # inner join
dts = c(20080501L, 20080502L, 20080505L, 20080506L, 20080507L, 20080508L)
DT[J("A",dts)] # 3 dates match exactly
DT[J("A",dts),roll=TRUE] # roll previous data forward
DT[J("A",dts),rolltolast=TRUE] # roll all but last observation forward
DT[J("A",dts),rolltolast=TRUE,nomatch=0] # remove time series after last
DT(DT[J("A",dts),roll=TRUE],dts) # joined to date from dts
dts = rev(seq(as.Date("2008-06-30"), by=-1, length=5000))
dts = as.integer(gsub("-","",dts))
ids = paste(rep(LETTERS,each=26),LETTERS,sep="")
DT = data.table(CJ(id=ids, date=dts), v=rnorm(length(ids)*length(dts)))
setkey(DT,id,date)
system.time(tt <<- DT[id=="FD"]) # vector scan. user 1.16 system 0.11 elapsed 1.27
system.time(ss <<- DT["FD"]) # binary search. user 0.02 system 0.00 elapsed 0.02
identical({setkey(tt,id,date);tt}, ss)
tables(mb=TRUE)
tt = DT[,mean(v),by="id"][c("FD","FE")] # select mean(v) from DT group by id having id in ('FD','FE')
ss = DT[c("FD","FE"),mean(v)] # more efficient way to group for known subgroups
identical(tt, ss)
tt = DT[c("FD","FE")][,mean(v),by="id,month=as.integer(date/100)"]
# Ensure you have at least 2.7GB free disk space comfortably before running this
# See Notes section above regarding ff
require(ff)
n=180000000L
DT = data.table(id=ff(0L,length=n), date=ff(1L,length=n), val=ff(0,length=n))
DT$id[167000001L] = 20
DT$val[167000002L] = 3.14
DT$date[167000003L] = 42
DT[167000000:167000005]
physical(DT$id)
rm(DT)
gc() # return memory to OS
# See over 90 further examples in \link{test.data.table}()
Run the code above in your browser using DataLab