# NOT RUN {
# Pitching data
require("dplyr")
###################################
# cleanup, and add some other stats
###################################
# Restrict to AL and NL data, 1901+
# All data re SH, SF and GIDP are missing, so remove
# Intentional walks (IBB) not recorded until 1955
pitching <- Pitching %>%
               filter(yearID >= 1901 & lgID %in% c("AL", "NL")) %>%
               select(-(28:30)) %>%  # remove SH, SF, GIDP
               mutate(BAOpp = round(H/(H + IPouts), 3),  # loose def'n
                      WHIP = round((H + BB) * 3/IPouts, 2),
                      KperBB = round(ifelse(yearID >= 1955, 
                                            SO/(BB - IBB), SO/BB), 2))
                                            
#####################
# some simple queries
#####################
# Team pitching statistics, Toronto Blue Jays, 1993
tor93 <- pitching %>%
           filter(yearID == 1993 & teamID == "TOR") %>%
           arrange(ERA)
# Career pitching statistics, Greg Maddux
subset(pitching, playerID == "maddugr01")
# Best ERAs for starting pitchers post WWII
pitching %>% 
    filter(yearID >= 1946 & IPouts >= 600) %>%
    group_by(lgID) %>%
    arrange(ERA) %>%
    do(head(., 5))
# Best K/BB ratios post-1955 among starters (excludes intentional walks)
pitching %>% 
    filter(yearID >= 1955 & IPouts >= 600) %>%
    mutate(KperBB = SO/(BB - IBB)) %>%
    arrange(desc(KperBB)) %>%
    head(., 10)
    
# Best K/BB ratios among relievers post-1950 (min. 20 saves)
pitching %>% 
    filter(yearID >= 1950 & SV >= 20) %>%
    arrange(desc(KperBB)) %>%
    head(., 10)
###############################################
# Winningest pitchers in each league each year:
###############################################
# Add name & throws information:
peopleInfo <- People %>%
                select(playerID, nameLast, nameFirst, throws)
                
# Merge peopleInfo into the pitching data
pitching1 <- right_join(peopleInfo, pitching, by = "playerID")
# Extract the pitcher with the maximum number of wins 
# each year, by league
winp <- pitching1 %>%
         group_by(yearID, lgID) %>%
         filter(W == max(W)) %>% 
         select(nameLast, nameFirst, teamID, W, throws)
# A simple ANCOVA model of wins vs. year, league and hand (L/R)
anova(lm(formula = W ~ yearID + I(yearID^2) + lgID + throws, data = winp))
# Nature of managing pitching staffs has altered importance of
# wins over time
# }
# NOT RUN {
require("ggplot2") 
# compare loess smooth with quadratic fit
ggplot(winp, aes(x = yearID, y = W)) +
    geom_point(aes(colour = throws, shape=lgID), size = 2) +
    geom_smooth(method="loess", size=1.5, color="blue") +
    geom_smooth(method = "lm", se=FALSE, color="black", 
                 formula = y ~ poly(x,2)) +
    ylab("League maximum Wins") + xlab("Year") +
    ggtitle("Maximum pitcher wins by year")
    
## To reinforce this, plot the mean IPouts by year and league,
## which gives some idea of pitcher usage. Restrict pitcher
## pool to those who pitched at least 100 innings in a year.
pitching %>% filter(IPouts >= 300) %>%  # >= 100 IP
ggplot(., aes(x = yearID, y = IPouts, color = lgID)) +
  geom_smooth(method="loess") +
  labs(x = "Year", y = "IPouts")
## Another indicator: total number of complete games pitched
## (Mirrors the trend from the preceding plot.)
pitching %>% 
   group_by(yearID, lgID) %>%
   summarise(totalCG = sum(CG, na.rm = TRUE)) %>%
   ggplot(., aes(x = yearID, y = totalCG, color = lgID)) +
      geom_point() +
      geom_path() +
      labs(x = "Year", y = "Number of complete games")
# }
# NOT RUN {
# }
Run the code above in your browser using DataLab