# NOT RUN {
library(dplyr)
if (has_lahman("sqlite")) {
# Left joins ----------------------------------------------------------------
lahman_s <- lahman_sqlite()
batting <- tbl(lahman_s, "Batting")
team_info <- select(tbl(lahman_s, "Teams"), yearID, lgID, teamID, G, R:H)
# Combine player and whole team statistics
first_stint <- select(filter(batting, stint == 1), playerID:H)
both <- left_join(first_stint, team_info, type = "inner", by = c("yearID", "teamID", "lgID"))
head(both)
explain(both)
# Join with a local data frame
grid <- expand.grid(
teamID = c("WAS", "ATL", "PHI", "NYA"),
yearID = 2010:2012)
top4a <- left_join(batting, grid, copy = TRUE)
explain(top4a)
# Indices don't really help here because there's no matching index on
# batting
top4b <- left_join(batting, grid, copy = TRUE, auto_index = TRUE)
explain(top4b)
# Semi-joins ----------------------------------------------------------------
people <- tbl(lahman_s, "Master")
# All people in half of fame
hof <- tbl(lahman_s, "HallOfFame")
semi_join(people, hof)
# All people not in the hall of fame
anti_join(people, hof)
# Find all managers
manager <- tbl(lahman_s, "Managers")
semi_join(people, manager)
# Find all managers in hall of fame
famous_manager <- semi_join(semi_join(people, manager), hof)
famous_manager
explain(famous_manager)
# Anti-joins ----------------------------------------------------------------
# batters without person covariates
anti_join(batting, people)
# Arbitrary predicates ------------------------------------------------------
# Find all pairs of awards given to the same player
# with at least 18 years between the awards:
awards_players <- tbl(lahman_s, "AwardsPlayers")
inner_join(
awards_players, awards_players,
sql_on = paste0(
"(LHS.playerID = RHS.playerID) AND ",
"(LHS.yearID < RHS.yearID - 18)"
)
)
}
# }
Run the code above in your browser using DataCamp Workspace