options(digits=3)
set.seed(173)
sex <- factor(sample(c("m","f"), 500, rep=TRUE))
age <- rnorm(500, 50, 5)
treatment <- factor(sample(c("Drug","Placebo"), 500, rep=TRUE))
# Generate a 3-choice variable; each of 3 variables has 5 possible levels
symp <- c('Headache','Stomach Ache','Hangnail',
'Muscle Ache','Depressed')
symptom1 <- sample(symp, 500,TRUE)
symptom2 <- sample(symp, 500,TRUE)
symptom3 <- sample(symp, 500,TRUE)
Symptoms <- mChoice(symptom1, symptom2, symptom3, label='Primary Symptoms')
table(Symptoms)
# Note: In this example, some subjects have the same symptom checked
# multiple times; in practice these redundant selections would be NAs
# mChoice will ignore these redundant selections
#Frequency table sex*treatment, sex*Symptoms
summary(sex ~ treatment + Symptoms, fun=table)
# could also do summary(sex ~ treatment +
# mChoice(symptom1,symptom2,symptom3), fun=table)
#Compute mean age, separately by 3 variables
summary(age ~ sex + treatment + Symptoms)
f <- summary(treatment ~ age + sex + Symptoms, method="reverse", test=TRUE)
f
# trio of numbers represent 25th, 50th, 75th percentile
print(f, long=TRUE)
plot(f)
plot(f, conType='bp', prtest='P')
bpplt() # annotated example showing layout of bp plot
#Compute predicted probability from a logistic regression model
#For different stratifications compute receiver operating
#characteristic curve areas (C-indexes)
predicted <- plogis(.4*(sex=="m")+.15*(age-50))
positive.diagnosis <- ifelse(runif(500)<=predicted, 1, 0)
roc <- function(z) {
x <- z[,1];
y <- z[,2];
n <- length(x);
if(n<2)return(c(ROC=NA));
n1 <- sum(y==1);
c(ROC= (mean(rank(x)[y==1])-(n1+1)/2)/(n-n1) );
}
y <- cbind(predicted, positive.diagnosis)
options(digits=2)
summary(y ~ age + sex, fun=roc)
options(digits=3)
summary(y ~ age + sex, fun=roc, method="cross")
#Use stratify() to produce a table in which time intervals go down the
#page and going across 3 continuous variables are summarized using
#quartiles, and are stratified by two treatments
set.seed(1)
d <- expand.grid(visit=1:5, treat=c('A','B'), reps=1:100)
d$sysbp <- rnorm(100*5*2, 120, 10)
label(d$sysbp) <- 'Systolic BP'
d$diasbp <- rnorm(100*5*2, 80, 7)
d$diasbp[1] <- NA
d$age <- rnorm(100*5*2, 50, 12)
g <- function(y) {
N <- apply(y, 2, function(w) sum(!is.na(w)))
h <- function(x) {
qu <- quantile(x, c(.25,.5,.75), na.rm=TRUE)
names(qu) <- c('Q1','Q2','Q3')
c(N=sum(!is.na(x)), qu)
}
w <- as.vector(apply(y, 2, h))
names(w) <- as.vector( outer(c('N','Q1','Q2','Q3'), dimnames(y)[[2]],
function(x,y) paste(y,x)))
w
}
#Use na.rm=FALSE to count NAs separately by column
s <- summary(cbind(age,sysbp,diasbp) ~ visit + stratify(treat),
na.rm=FALSE, fun=g, data=d)
#The result is very wide. Re-do, putting treatment vertically
x <- with(d, factor(paste('Visit', visit, treat)))
summary(cbind(age,sysbp,diasbp) ~ x, na.rm=FALSE, fun=g, data=d)
#Compose LaTeX code directly
g <- function(y) {
h <- function(x) {
qu <- format(round(quantile(x, c(.25,.5,.75), na.rm=TRUE),1),nsmall=1)
paste('{\\scriptsize(',sum(!is.na(x)),
')} \\hfill{\\scriptsize ', qu[1], '} \\textbf{', qu[2],
'} {\\scriptsize ', qu[3],'}', sep='')
}
apply(y, 2, h)
}
s <- summary(cbind(age,sysbp,diasbp) ~ visit + stratify(treat),
na.rm=FALSE, fun=g, data=d)
# latex(s, prn=FALSE)
## need option in latex to not print n
#Put treatment vertically
s <- summary(cbind(age,sysbp,diasbp) ~ x, fun=g, data=d, na.rm=FALSE)
# latex(s, prn=FALSE)
#Plot estimated mean life length (assuming an exponential distribution)
#separately by levels of 4 other variables. Repeat the analysis
#by levels of a stratification variable, drug. Automatically break
#continuous variables into tertiles.
#We are using the default, method='response'
## Not run:
# life.expect <- function(y) c(Years=sum(y[,1])/sum(y[,2]))
# attach(pbc)
# S <- Surv(follow.up.time, death)
# s2 <- summary(S ~ age + albumin + ascites + edema + stratify(drug),
# fun=life.expect, g=3)
#
#
# #Note: You can summarize other response variables using the same
# #independent variables using e.g. update(s2, response~.), or you
# #can change the list of independent variables using e.g.
# #update(s2, response ~.- ascites) or update(s2, .~.-ascites)
# #You can also print, typeset, or plot subsets of s2, e.g.
# #plot(s2[c('age','albumin'),]) or plot(s2[1:2,])
#
#
# s2 # invokes print.summary.formula.response
#
#
# #Plot results as a separate dot chart for each of the 3 strata levels
# par(mfrow=c(2,2))
# plot(s2, cex.labels=.6, xlim=c(0,40), superposeStrata=FALSE)
#
#
# #Typeset table, creating s2.tex
# w <- latex(s2, cdec=1)
# #Typeset table but just print LaTeX code
# latex(s2, file="") # useful for Sweave
#
#
# #Take control of groups used for age. Compute 3 quartiles for
# #both cholesterol and bilirubin (excluding observations that are missing
# #on EITHER ONE)
#
#
# age.groups <- cut2(age, c(45,60))
# g <- function(y) apply(y, 2, quantile, c(.25,.5,.75))
# y <- cbind(Chol=chol,Bili=bili)
# label(y) <- 'Cholesterol and Bilirubin'
# #You can give new column names that are not legal S names
# #by enclosing them in quotes, e.g. 'Chol (mg/dl)'=chol
#
#
# s <- summary(y ~ age.groups + ascites, fun=g)
#
#
# par(mfrow=c(1,2), oma=c(3,0,3,0)) # allow outer margins for overall
# for(ivar in 1:2) { # title
# isub <- (1:3)+(ivar-1)*3 # *3=number of quantiles/var.
# plot(s3, which=isub, main='',
# xlab=c('Cholesterol','Bilirubin')[ivar],
# pch=c(91,16,93)) # [, closed circle, ]
# }
# mtext(paste('Quartiles of', label(y)), adj=.5, outer=TRUE, cex=1.75)
# #Overall (outer) title
#
#
# prlatex(latex(s3, trios=TRUE))
# # trios -> collapse 3 quartiles
#
#
# #Summarize only bilirubin, but do it with two statistics:
# #the mean and the median. Make separate tables for the two randomized
# #groups and make plots for the active arm.
#
#
# g <- function(y) c(Mean=mean(y), Median=median(y))
#
#
# for(sub in c("D-penicillamine", "placebo")) {
# ss <- summary(bili ~ age.groups + ascites + chol, fun=g,
# subset=drug==sub)
# cat('\n',sub,'\n\n')
# print(ss)
#
#
# if(sub=='D-penicillamine') {
# par(mfrow=c(1,1))
# plot(s4, which=1:2, dotfont=c(1,-1), subtitles=FALSE, main='')
# #1=mean, 2=median -1 font = open circle
# title(sub='Closed circle: mean; Open circle: median', adj=0)
# title(sub=sub, adj=1)
# }
#
#
# w <- latex(ss, append=TRUE, fi='my.tex',
# label=if(sub=='placebo') 's4b' else 's4a',
# caption=paste(label(bili),' {\\em (',sub,')}', sep=''))
# #Note symbolic labels for tables for two subsets: s4a, s4b
# prlatex(w)
# }
#
#
# #Now consider examples in 'reverse' format, where the lone dependent
# #variable tells the summary function how to stratify all the
# #'independent' variables. This is typically used to make tables
# #comparing baseline variables by treatment group, for example.
#
#
# s5 <- summary(drug ~ bili + albumin + stage + protime + sex +
# age + spiders,
# method='reverse')
# #To summarize all variables, use summary(drug ~., data=pbc)
# #To summarize all variables with no stratification, use
# #summary(~a+b+c) or summary(~.,data=\dots)
#
#
# options(digits=1)
# print(s5, npct='both')
# #npct='both' : print both numerators and denominators
# plot(s5, which='categorical')
# Key(locator(1)) # draw legend at mouse click
# par(oma=c(3,0,0,0)) # leave outer margin at bottom
# plot(s5, which='continuous')
# Key2() # draw legend at lower left corner of plot
# # oma= above makes this default key fit the page better
#
#
# options(digits=3)
# w <- latex(s5, npct='both', here=TRUE)
# # creates s5.tex
#
#
# #Turn to a different dataset and do cross-classifications on possibly
# #more than one independent variable. The summary function with
# #method='cross' produces a data frame containing the cross-
# #classifications. This data frame is suitable for multi-panel
# #trellis displays, although `summarize' works better for that.
#
#
# attach(prostate)
# size.quartile <- cut2(sz, g=4)
# bone <- factor(bm,labels=c("no mets","bone mets"))
#
#
# s7 <- summary(ap>1 ~ size.quartile + bone, method='cross')
# #In this case, quartiles are the default so could have said sz + bone
#
#
# options(digits=3)
# print(s7, twoway=FALSE)
# s7 # same as print(s7)
# w <- latex(s7, here=TRUE) # Make s7.tex
#
#
# library(trellis,TRUE)
# invisible(ps.options(reset=TRUE))
# trellis.device(postscript, file='demo2.ps')
#
#
# dotplot(S ~ size.quartile|bone, data=s7, #s7 is name of summary stats
# xlab="Fraction ap>1", ylab="Quartile of Tumor Size")
# #Can do this more quickly with summarize:
# # s7 <- summarize(ap>1, llist(size=cut2(sz, g=4), bone), mean,
# # stat.name='Proportion')
# # dotplot(Proportion ~ size | bone, data=s7)
#
#
# summary(age ~ stage, method='cross')
# summary(age ~ stage, fun=quantile, method='cross')
# summary(age ~ stage, fun=smean.sd, method='cross')
# summary(age ~ stage, fun=smedian.hilow, method='cross')
# summary(age ~ stage, fun=function(x) c(Mean=mean(x), Median=median(x)),
# method='cross')
# #The next statements print real two-way tables
# summary(cbind(age,ap) ~ stage + bone,
# fun=function(y) apply(y, 2, quantile, c(.25,.75)),
# method='cross')
# options(digits=2)
# summary(log(ap) ~ sz + bone,
# fun=function(y) c(Mean=mean(y), quantile(y)),
# method='cross')
#
#
# #Summarize an ordered categorical response by all of the needed
# #cumulative proportions
# summary(cumcategory(disease.severity) ~ age + sex)
#
# ## End(Not run)
Run the code above in your browser using DataLab