JoSAE-package: Provides functions for small area estimation and the estimate of variances (mean squared errors).

Description

This package currently implements the GREG and unit level EBLUP estimators as well as their variance estimators. It also contains data and extensive examples that explain its use.

Arguments

Details

ll{ Package: JoSAE Type: Package Version: 0.1 Date: 2011-07-14 License: GPL-2 LazyLoad: yes } The aim in the analysis of survey samples is frequently to derive estimates of subpopulation characteristics. Often, the sample available for the subpopulation is, however, too small to allow a reliable estimate. If an auxiliary variable exists that is correlated with the variable of interest, small area estimation (SAE) provides methods to solve the problem (Rao 2003). The purpose of this package is primarily to document the functions used in the publication of Breidenbach and Astrup (2011). The data used in that study are also provided. You might wonder why this package is called JoSAE. Well, first of all, JoSAE sounds good. The other problem was that the packages SAE and SAE2 already exist (Gomez-Rubio, 2008). They are, however, not available on CRAN and unmaintained (as of July 2011). They also do not seem to implement the variance estimators that we needed. So I just combined SAE with the first part of my name.

References

Breidenbach, J. and Astrup, R., (submitted 2011), Small area estimation of forest attributes in the Norwegian national forest inventory based on canopy height models derived from digital aerial images. European Journal of Forest Research. Rao, J.N.K. (2003), Small area estimation. Wiley. Gomez-Rubio, (2008), Tutorial on small area estimation, UseR conference 2008, August 12-14, Technische Universitat Dortmund, Germany

Examples

Run this code

#mean auxiliary variables for the populations in the domains
data(JoSAE.domain.data)
	#data for the sampled elements
data(JoSAE.sample.data)
plot(biomass.ha~mean.canopy.ht,JoSAE.sample.data)


##small area estimation
    #get an overview of the domains
        #mean of the response and predictor variables from the sample. For the response this is the sample mean estimator.
tmp <- aggregate(JoSAE.sample.data[,c("biomass.ha", "mean.canopy.ht")]
                            , by=list(domain.ID=JoSAE.sample.data$domain.ID), mean)
names(tmp)[2:3] <- paste(names(tmp)[2:3], ".bar.sample", sep="")

        #number of samples within the domains
tmp1 <- aggregate(cbind(n.i=JoSAE.sample.data$biomass.ha)
                  , by=list(domain.ID=JoSAE.sample.data$domain.ID), length)

        #combine it with the population information of the domains
overview.domains <- cbind(JoSAE.domain.data, tmp[,-1], n.i=tmp1[,-1])


    #fit the models
        #lm - the auxiliary variable explains forest biomass rather good
summary(fit.lm <- lm(biomass.ha ~ mean.canopy.ht, data=JoSAE.sample.data))

        #lme
summary(fit.lme <- lme(biomass.ha ~ mean.canopy.ht, data=JoSAE.sample.data
                       , random=~1|domain.ID))

    #mean lm residual -- needed for GREG
overview.domains$mean.resid.lm <- aggregate(resid(fit.lm)
                                            , by=list(domain.ID=JoSAE.sample.data$domain.ID)
                                            , mean)[,2]

    #mean lme residual -- needed for EBLUP.var
overview.domains$mean.resid.lme <- aggregate(resid(fit.lme, level=0)
                                             , by=list(domain.ID=JoSAE.sample.data$domain.ID)
                                             , mean)[,2]

    #synthetic estimate
overview.domains$synth <- predict(fit.lm
                                  , newdata=
                                  data.frame(mean.canopy.ht=JoSAE.domain.data$mean.canopy.ht.bar))



    #GREG estimate
overview.domains$GREG <- overview.domains$synth +  overview.domains$mean.resid.lm

    #EBLUP estimate
overview.domains$EBLUP <- predict(fit.lme
                                     , newdata=data.frame(mean.canopy.ht=JoSAE.domain.data$mean.canopy.ht.bar
                                       , domain.ID=JoSAE.domain.data$domain.ID)
                                     , level=1)

    #gamma
overview.domains$gamma.i <- JoSAE.gamma.i.f(lme.obj=fit.lme
                                          , n.i=overview.domains$n.i)

    #variance of the sample mean estimate
overview.domains$sample.var <-
    aggregate(JoSAE.sample.data$biomass.ha
          , by=list(domain.ID=JoSAE.sample.data$domain.ID), var)[,-1]/overview.domains$n.i

    #variance of the GREG estimate
overview.domains$GREG.var <-
    aggregate(resid(fit.lm)
          , by=list(domain.ID=JoSAE.sample.data$domain.ID),var)[,-1]/overview.domains$n.i

    #variance of the EBLUP
        #compute the A.i matrices for all domains (only needed once)
domain.ID <- JoSAE.domain.data$domain.ID
            #initialize the result vector
a.i.mats <- vector(mode="list", length=length(domain.ID))
for(i in 1:length(domain.ID)){
    print(i)
    a.i.mats[[i]] <- eblup.mse.f.c2.ai(gamma.i=overview.domains$gamma.i[overview.domains$domain.ID==domain.ID[i]]
                                 , n.i=overview.domains$n.i[overview.domains$domain.ID==domain.ID[i]]
                                 , lme.obj=fit.lme
                                 , X.i=as.matrix(cbind(i=1
                                   , x=JoSAE.sample.data[JoSAE.sample.data$domain.ID==domain.ID[i], "mean.canopy.ht"]
                                   ))
         )
}
                #add all the matrices
sum.A.i.mats <- Reduce("+", a.i.mats)

        #the assymptotic var-cov matrix
asy.var.cov.mat <- eblup.mse.f.c3.asyvarcovarmat(n.i=overview.domains$n.i
                                           , lme.obj=fit.lme)


        #put together the variance components
		###### Some changes are required here, if you apply it to own data!
result <- NULL
for(i in 1:length(domain.ID)){
    print(i)
    #first comp
    mse.c1.tmp <- eblup.mse.f.c1(gamma.i=overview.domains$gamma.i[overview.domains$domain.ID==domain.ID[i]]
                           , n.i=overview.domains$n.i[overview.domains$domain.ID==domain.ID[i]]
                           , lme.obj=fit.lme)
    #second comp
    mse.c2.tmp <- eblup.mse.f.c2(gamma.i=overview.domains$gamma.i[overview.domains$domain.ID==domain.ID[i]]
                           , X.i=as.matrix(cbind(i=1##cbind!!
                                   , x=JoSAE.sample.data[JoSAE.sample.data$domain.ID==domain.ID[i]
                                     , "mean.canopy.ht"]##change to other varnames if necessary
                             ))
                           , X.bar.i =as.matrix(rbind(i=1##rbind!!
                                   , x=JoSAE.domain.data[JoSAE.domain.data$domain.ID==domain.ID[i]
                                     , "mean.canopy.ht.bar"]##change to other varnames if necessary
                             ))
                           , sum.A.i = sum.A.i.mats
                           )
    #third comp
    mse.c3.tmp <- eblup.mse.f.c3(n.i=overview.domains$n.i[overview.domains$domain.ID==domain.ID[i]]
                           , lme.obj=fit.lme
                           , asympt.var.covar=asy.var.cov.mat)
    #third star comp
    mse.c3.star.tmp <- eblup.mse.f.c3.star( n.i=overview.domains$n.i[overview.domains$domain.ID==domain.ID[i]]
                                     , lme.obj=fit.lme
                                     , mean.resid.i=overview.domains$mean.resid.lme[overview.domains$domain.ID==domain.ID[i]]
                                     , asympt.var.covar=asy.var.cov.mat)
    #save result
    result <- rbind(result, data.frame(kommune=domain.ID[i]
                                       , n.i=overview.domains$n.i[overview.domains$domain.ID==domain.ID[i]]
                                       , c1=as.numeric(mse.c1.tmp), c2=as.numeric(mse.c2.tmp)
                                       , c3=as.numeric(mse.c3.tmp), c3star=as.numeric(mse.c3.star.tmp)))
}

            #derive the actual EBLUP variances
overview.domains$EBLUP.var.1 <- result$c1 + result$c2 + 2* result$c3star
overview.domains$EBLUP.var.2 <- result$c1 + result$c2 + result$c3 + result$c3star


    #display the estimates and the sampling error (sqrt(var)) in two tables
round(data.frame(overview.domains[,c("domain.ID", "n.i", "N.i")],
                 overview.domains[,c("biomass.ha.bar.sample", "GREG", "synth", "EBLUP")]),2)
        #the sampling errors of the eblup is mostly smaller than the one of the greg estimate.
        #both are always smaller than the sample mean variance.
round(data.frame(overview.domains[,c("domain.ID", "n.i", "N.i")],
                 sqrt(overview.domains[,c("sample.var", "GREG.var", "EBLUP.var.1", "EBLUP.var.2")])),2)

##END: small area estimation



## aggregate the results to the large study area that contains all domains

    #variance of the sample mean for the whole area
var.all.sampmean <- var(JoSAE.sample.data$biomass.ha)/sum(overview.domains$n.i)

    #variance of the GREG estimate for the whole area
var.all.greg <- var(resid(fit.lm))/sum(overview.domains$n.i)

    #EBLUP variance for the whole area
        #gamma
gamma.all <- JoSAE.gamma.i.f(lme.obj=fit.lme, n.i=sum(overview.domains$n.i))

        #first comp
mse.c1.tmp <- eblup.mse.f.c1(gamma.i=gamma.all
                           , n.i=sum(overview.domains$n.i)
                           , lme.obj=fit.lme)

        #second comp
mse.c2.tmp <- eblup.mse.f.c2(gamma.i=gamma.all
                           , X.i=as.matrix(cbind(i=1##cbind!!
                                   , x=mean(JoSAE.sample.data[#mean does not need to be weighted because unit-level observations
                                     , "mean.canopy.ht"])##change to other varnames if necessary
                             ))
                           , X.bar.i =as.matrix(rbind(i=1##rbind!!
                                   , x=weighted.mean(JoSAE.domain.data[
                                     , "mean.canopy.ht.bar"], w=JoSAE.domain.data$N.i)##change to other varnames if necessary
                             ))
                           , sum.A.i = sum.A.i.mats
                           )

        #third comp
mse.c3.tmp <- eblup.mse.f.c3(n.i=sum(overview.domains$n.i)
                           , lme.obj=fit.lme
                           , asympt.var.covar=asy.var.cov.mat)

        #third star comp
mse.c3.star.tmp <- eblup.mse.f.c3.star( n.i=sum(overview.domains$n.i)
                                     , lme.obj=fit.lme
                                     , mean.resid.i=weighted.mean(overview.domains$mean.resid.lme, w=overview.domains$n.i)
                                     , asympt.var.covar=asy.var.cov.mat)

        #combine to final variance
var.all.eblup <- mse.c1.tmp + mse.c2.tmp + mse.c3.tmp + mse.c3.star.tmp



    #combine the 3 estimators:
    #EBLUP estimate is almost the same as sample mean -> design consitency
    #GREG sampling error (SE) is slightly smaller than EBLUP SE. Both are considerably smaller
    #than the sample mean SE.
tmp <- data.frame(
      cbind(sample.mean=weighted.mean(overview.domains$biomass.ha.bar.sample, w=overview.domains$N.i)
            , GREG.mean=weighted.mean(overview.domains$GREG, w=overview.domains$N.i)
            , EBLUP.mean=weighted.mean(overview.domains$EBLUP, w=overview.domains$N.i))
      ,
      sqrt(cbind(SE.sample=var.all.sampmean, SE.GREG=var.all.greg, SE.EBLUP=var.all.eblup[,1]))
      )

tmp
##END: aggregate the results to the large study area that contains all domains

Run the code above in your browser using DataLab

Description

Arguments

Details

References

See Also

Examples