data(lsa)
### Example 1: only means, SD and variances for each country
### We only consider domain 'reading'
rd <- lsa[which(lsa[,"domain"] == "reading"),]
### We only consider the first "nest".
rdN1 <- rd[which(rd[,"nest"] == 1),]
### First, we only consider year 2010
rdN1y10<- rdN1[which(rdN1[,"year"] == 2010),]
# \donttest{
### mean estimation
means1 <- repMean(datL = rdN1y10, ID="idstud", wgt="wgt", type = "JK2", PSU = "jkzone",
repInd = "jkrep", imp="imp", groups = "country", dependent = "score",
na.rm=FALSE, doCheck=TRUE, engine = "BIFIEsurvey")
### reporting function: the function does not know which content domain is being considered,
### so the user may add new columns in the output using the 'add' argument
res1 <- report(means1, add = list(domain = "reading"))
### Example 1a: Additionally to example 1, we decide to estimate whether
### each country's mean differ significantly from the overall mean as well
### as from the individual means of the other contries
means1a<- repMean(datL = rdN1y10, ID="idstud", wgt="wgt", type = "JK2", PSU = "jkzone",
repInd = "jkrep", imp="imp", groups = "country", group.splits = 0:1,
group.differences.by = "country", cross.differences = TRUE,
dependent = "score", na.rm=FALSE, doCheck=TRUE, hetero=FALSE)
res1a <- report(means1a, add = list(domain = "reading"))
### See that the means of all countries significantly differ from the overall mean.
print(res1a[intersect(which(res1a[,"comparison"] == "crossDiff"),
which(res1a[,"parameter"] == "mean")),], digits = 3)
### Example 2: Sex differences by country. Assume equally weighted cases by omitting
### 'wgt' argument.
means2 <- repMean(datL = rdN1y10, ID="idstud", type = "JK2", PSU = "jkzone",
repInd = "jkrep", imp="imp", groups = c("country", "sex"), group.splits = 0:2,
group.differences.by="sex", dependent = "score", na.rm=FALSE, doCheck=TRUE,
cross.differences =TRUE, crossDiffSE.engine= "lm")
res2 <- report(means2,add = list(domain = "reading"))
### Example 2a: Additionally to example 2, we decide to estimate whether
### each country's mean differ significantly from the overall mean. (Note: by default,
### such cross level differences are estimated using 'weighted effect coding'. Use the
### 'crossDiffSE' argument to choose alternative methods.) Moreover, we estimate whether
### each country's sex difference differ significantly from the sex difference in the
### whole population.
means2a<- repMean(datL = rdN1y10, ID="idstud", wgt="wgt", type = "JK2", PSU = "jkzone",
repInd = "jkrep", imp="imp", groups = c("country", "sex"), group.splits = 0:2,
group.differences.by="sex", cross.differences = list(c(0,1), c(0,2)),
dependent = "score", na.rm=FALSE, doCheck=TRUE,
crossDiffSE.engine= "lm", clusters = "idclass")
res2a <- report(means2a,add = list(domain = "reading"))
### Third example: like example 2a, but using nested imputations of dependent variable,
### and additionally estimating trend: use 'rd' instead of 'rdN1y10'
### assume equally weighted cases by omitting 'wgt' argument
### ignoring jackknife by omitting 'type', 'PSU' and 'repInd' argument
means3T<- repMean(datL = rd, ID="idstud", imp="imp", nest="nest",
groups = c("country", "sex"), group.splits = 0:2, group.differences.by="sex",
cross.differences = list(c(0,1), c(0,2)), dependent = "score", na.rm=FALSE,
doCheck=TRUE, trend = "year", linkErr = "leScore",
crossDiffSE = "wec", crossDiffSE.engine= "lavaan")
res3T <- report(means3T, add = list(domain = "reading"))
### Example 3a: like example 3, but providing linking errors in an additional data.frame
### This is optional for two measurement occasions but mandatory if the analysis contains
### more than two measurement occasions
linkErr<- data.frame ( trendLevel1 = 2010, trendLevel2 = 2015, depVar = "score",
parameter = "mean", unique(lsa[,c("domain", "leScore")]),
stringsAsFactors = FALSE)
colnames(linkErr) <- car::recode(colnames(linkErr), "'leScore'='linkingError'")
### note that the linking errors for the specified domain have to be chosen via
### subsetting
means3a<- repMean(datL = rd, ID="idstud", imp="imp", nest="nest",
groups = c("country", "sex"),
group.splits = 0:2, group.differences.by="sex",
cross.differences = list(c(0,1), c(0,2)),
dependent = "score", na.rm=FALSE, doCheck=TRUE, trend = "year",
linkErr = linkErr[which(linkErr[,"domain"] == "reading"),],
crossDiffSE = "wec", crossDiffSE.engine= "lavaan")
res3a <- report(means3a, add = list(domain = "reading"))
### Fourth example: using a loop do analyse 'reading' and 'listening' comprehension
### in one function call. Again with group and cross differences and trends, and
### trend differences
### we use weights but omit jackknife analysis by omitting 'type', 'PSU' and 'repInd'
### argument
means4T<- by ( data = lsa, INDICES = lsa[,"domain"], FUN = function (sub.dat) {
repMean(datL = sub.dat, ID="idstud", wgt="wgt", imp="imp", nest="nest",
groups = c("country", "sex"), group.splits = 0:2,
group.differences.by="sex",
cross.differences = list(c(0,1), c(0,2)), dependent = "score",
na.rm=FALSE, doCheck=TRUE,
trend = "year", linkErr = "leScore", crossDiffSE.engine= "lm") })
ret4T <- do.call("rbind", lapply(names(means4T), FUN = function ( domain ) {
report(means4T[[domain]], add = list(domain = domain))}))
### Fifth example: compute adjusted means, also with trend estimation
### Note: all covariates must be numeric or 0/1 dichotomous
rdN1[,"mignum"] <- as.numeric(rdN1[,"mig"])
rdN1[,"sexnum"] <- car::recode(rdN1[,"sex"], "'male'=0; 'female'=1", as.numeric=TRUE,
as.factor=FALSE)
means5 <- repMean(datL = rdN1, ID="idstud", wgt="wgt", type = "JK2", PSU = "jkzone",
repInd = "jkrep", imp="imp", groups = "country",
adjust = c("sexnum", "ses", "mignum"), useEffectLiteR = FALSE,
dependent = "score", na.rm=FALSE, doCheck=TRUE, trend = "year",
linkErr = "leScore")
res5 <- report(means5, add = list(domain = "reading"))
if (FALSE) {
############################################################################################
# Example 6: R code for running the PISA 2015 science example to compare group means #
# with the total mean using weighted effect coding #
############################################################################################
# Warning: large PISA data set requires at least 16 GB free working memory (RAM):
### define necessary directories (note: writing permissions required)
folder <- tempdir()
### download PISA 2015 zipped student questionnaire data (420 MB) to a folder with
### writing permissions
download.file(url = "https://webfs.oecd.org/pisa/PUF_SPSS_COMBINED_CMB_STU_QQQ.zip",
destfile = file.path(folder, "pisa2015.zip"))
### unzip PISA 2015 student questionnaire data (1.5 GB) to temporary folder
zip::unzip(zipfile = file.path(folder, "pisa2015.zip"), files= "CY6_MS_CMB_STU_QQQ.sav",
exdir=folder)
### read data
pisa <- foreign::read.spss(file.path (folder, "CY6_MS_CMB_STU_QQQ.sav"),
to.data.frame=TRUE, use.value.labels = FALSE, use.missings = TRUE)
# dependent variables
measure.vars <- paste0("PV", 1:10, "SCIE")
### choose desired variables and reshape into the long format
# 'CNTSTUID' = individual student identifier
# 'CNT' = country identifier
# 'SENWT' = senate weight (assume a population of 5000 in each country)
# 'W_FSTUWT' = final student weight
# 'OECD' = dummy variable indicating which country is part of the OECD
# 'W_FSTURWT' (1 to 80) = balanced repeated replicate weights
# 'PV1SCIE' to 'PV10SCIE' = 10 plausible values of (latent) science performance
pisaLong <- reshape2::melt(pisa, id.vars = c("CNTSTUID", "CNT", "SENWT", "W_FSTUWT",
"OECD", paste0("W_FSTURWT", 1:80)),
measure.vars = measure.vars, value.name = "value", variable.name="imp",
na.rm=TRUE)
### choose OECD countries
oecd <- pisaLong[which(pisaLong[,"OECD"] == 1),]
### analyze data
### analysis takes approximately 30 minutes on an Intel i5-6500 machine with 32 GB RAM
means <- repMean( datL = oecd, # data.frame in the long format
ID = "CNTSTUID", # student identifier
dependent = "value", # the dependent variable in the data
groups = "CNT", # the grouping variable
wgt = "SENWT", # (optional) weighting variable. We use senate
# weights (assume a population of 5000 in each
# country)
type = "Fay", # type of replication method. Corresponding to
# the PISA sampling method, we use "Fay"
rho = 0.5, # shrinkage factor for weights in Fay's method
scale = NULL, # scaling constant for variance, set to NULL
# according to PISA's sampling method
rscales = NULL, # scaling constant for variance, set to NULL
# according to PISA's sampling method
repWgt = paste0("W_FSTURWT", 1:80), # the replicate weights,
# provided by the OECD
imp = "imp", # the imputation variable
mse = FALSE, # if TRUE, compute variances based on sum of
# squares around the point estimate, rather
# than the mean of the replicates.
group.splits = 0:1, # defining the 'levels' for which means should
# be computed. 0:1 implies that means for the
# whole sample (level 0) as well as for groups
# (level 1) are computed
cross.differences = TRUE, # defines whether (and which) cross level mean
# differences should be computed. TRUE means
# that all cross level mean differences are
# computed
crossDiffSE = "wec", # method for standard errors of mean
# differences
crossDiffSE.engine = "lm", # software implementation for standard
# errors of mean differences
hetero = TRUE, # assume heteroscedastic group variances
stochasticGroupSizes = FALSE # assume fixed group sizes
)
### call a reporting function to generate user-friendly output
results <- report(means, exclude = c("Ncases", "NcasesValid", "var", "sd"))
}# }
Run the code above in your browser using DataLab