gg_forest: Build a paired forest plot + companion table (patchwork-ready)

Description

gg_forest() creates two aligned ggplot objects: (1) a forest plot with confidence intervals and weighted points (including summary diamonds), and (2) a “table” rendered as text in a ggplot panel with column headers on the top axis. The returned objects share the same y values so they can be combined with patchwork. This uses lower-level functions in the package, like geom_forestpoint, geom_foreststripe, geom_foresttable and scale_x_foresttable, that could be used to customize the look of the forest plot further.

Usage

gg_forest(
  data,
  y,
  x,
  data_cols,
  xmin_std,
  xmax_std,
  size_prop = 1,
  max_size = 15,
  is_summary = FALSE,
  xmin_ici = NULL,
  xmax_ici = NULL,
  vline = NULL,
  ci_colors = c(ici = "gray65", std = "black"),
  use_log_scale = FALSE,
  stripe_table = TRUE,
  stripe_figure = TRUE,
  start_stripe = 3,
  col_nudge = 0,
  table_format_list = NULL,
  col_align = NULL,
  table_header_size = 16,
  table_text_size = 4,
  fig_xlab = "Estimate",
  ...
)

Value

An object of class "gg_forest": a list with two ggplot objects (forest, table) suitable for composition with patchwork.

Arguments

data: A data frame containing one row per forest row (study and optional summary rows) and all referenced columns.
y: String. Column name used for the forest-row coordinate.
x: String. Column name containing the point estimate.
data_cols: A named character vector of column names to display in the table. The values identify the variable names and the names are the header labels that will be used in the table.
xmin_std, xmax_std: Strings. Column names for the standard CI bounds.
size_prop: Numeric scalar or string column name giving point-size weights. If a numeric scalar, all points will receive the same weight.
max_size: Numeric. Maximum size for points (passed to ggplot2::scale_size_area()). This will control the overall size of the points and may need some tweaking depending on the size of the figure and the number of rows.
is_summary: Logical scalar or string column name indicating summary rows. If a logical scalar, all rows will get square points (if FALSE) or diamond points (if TRUE).
xmin_ici, xmax_ici: Optional strings for the names of the inferential CI bounds.
vline: Numeric. Value at which a dashed vertical line will be drawn. If NULL, no line is drawn.
ci_colors: Named character vector with entries "std" and "ici".
use_log_scale: Logical. If TRUE, uses a log-scaled x axis.
stripe_table: Logical. Draw alternating row stripes in the table.
stripe_figure: Logical. Draw alternating row stripes in the forest plot.
start_stripe: Integer. Row index at which striping begins.
col_nudge: Numeric. Horizontal nudge applied to table text. This is particularly useful for left-aligned columns where the header text and cell text do not line up natively.
table_format_list: Optional named list of formatting functions for data_cols. This defaults to as.character for factors and characters, sprintf("%d") for integers, and sprintf("%.2f") for numerics. The names of the list should match the values of data_cols.
col_align: Optional character vector giving alignment for each table column ("left", "center", "right").
table_header_size: Numeric. Font size for table headers.
table_text_size: Numeric. Font size for table body text.
fig_xlab: Character. X-axis label for the forest plot.
...: Additional arguments passed to geom_forestpoint().

Details

Column arguments are provided as strings and are evaluated safely using .data[[...]].

Examples

Run this code

# Load Packages
library(emmeans)
library(VizTest)
library(dplyr)

# Use built-in Esophageal Cancer Data
data(esoph)

# Aggregate case and control counts by age group
ag_data <- aggregate(esoph[, c("ncases", "ncontrols")],
                     list(age = esoph$age), sum)

# Turn counts into integers (not required, but makes printing nicer)
ag_data$ncases <- as.integer(ag_data$ncases)
ag_data$ncontrols <- as.integer(ag_data$ncontrols)

# Make age into unordered factor
ag_data$age <- factor(as.character(ag_data$age),
                      levels = levels(esoph$age))

# Estimate model of prevalence by age and overall (the summary model)
model1 <- glm(cbind(ncases, ncontrols) ~ age,
              data = ag_data, family = binomial())
model_sum <- glm(cbind(ncases, ncontrols) ~ 1,
                 data = ag_data, family = binomial())

# Make data frame of results for plotting using emmeans
fit <- emmeans(model1, "age")
fit_ci <- confint(fit)

# Add in original count data. Bind to the confint() summary (which carries
# the asymp.LCL / asymp.UCL columns used below) rather than to the raw
# emmeans object, so no implicit coercion is needed.
ag_data <- cbind(fit_ci, ag_data[, c("ncases", "ncontrols")])

# Exponentiate the estimates and confidence limits
# (the example treats these as odds ratios)
ag_data$or <- exp(ag_data$emmean)
ag_data$lower <- exp(ag_data$asymp.LCL)
ag_data$upper <- exp(ag_data$asymp.UCL)

# Make summary data frame that we can use for plotting
fit_sum <- data.frame(age = "Summary", emmean = coef(model_sum),
                      SE = unname(sqrt(vcov(model_sum))),
                      or = exp(coef(model_sum)))
sum_ci <- confint(model_sum)
fit_sum$lower <- exp(sum_ci[1])
fit_sum$upper <- exp(sum_ci[2])
fit_sum$ncases <- sum(ag_data$ncases)
fit_sum$ncontrols <- sum(ag_data$ncontrols)
# drop the coefficient name that data.frame() picked up as a row name
rownames(fit_sum) <- NULL

# Find the optimal visual testing intervals
viztest(fit, include_zero = FALSE, make_plot = FALSE, test_level = .05)

# Add inferential CIs to data (not for summary, though)
# NOTE(review): level = .75 is hard-coded; presumably taken from the
# viztest() output above -- confirm if the data or model change.
fit_ici <- confint(fit, level = .75)
ag_data$lower_ici <- exp(fit_ici$asymp.LCL)
ag_data$upper_ici <- exp(fit_ici$asymp.UCL)

# Bind together the age-specific and summary data frames for plotting;
# the summary row has no inferential CI columns, so those are NA for it
ag_data <- dplyr::bind_rows(ag_data, fit_sum)

# identify summary row
ag_data$is_sum <- ag_data$age == "Summary"

# add point-size weight (inverse variance)
ag_data$pt_size <- 1 / ag_data$SE^2

# make age_label such that ages plot smallest at top and summary at bottom
ag_data$age_label <- factor(ag_data$age, levels = rev(ag_data$age))

# Make gg forest plot
out <- gg_forest(
  ag_data,
  y = "age_label",
  x = "or",
  xmin_std = "lower",
  xmax_std = "upper",
  xmin_ici = "lower_ici",
  xmax_ici = "upper_ici",
  size_prop = "pt_size",
  is_summary = "is_sum",
  use_log_scale = TRUE,
  # names are the table headers, values are the columns of ag_data
  data_cols = c("Age" = "age_label",
                "Controls" = "ncontrols",
                "Cases" = "ncases",
                "OR" = "or"),
  max_size = 5,
  table_header_size = 16,
  table_text_size = 5,
  # one nudge per table column; only the first (Age) column is shifted
  col_nudge = c(-.085, 0, 0, 0),
  # not formal arguments of gg_forest(); forwarded through `...`
  diamond_aspect = 15,
  diamond_row_frac = .9
)

# print plot
# NOTE(review): the original call was plot(out, widths=1, 1), with a stray
# unnamed 1; this assumes `widths` takes one relative width per panel --
# confirm against plot.gg_forest().
plot(out, widths = c(1, 1))