
landmap (version 0.0.13)

train.spLearner,SpatialPointsDataFrame,ANY,SpatialPixelsDataFrame-method: Train a spatial prediction and/or interpolation model using Ensemble Machine Learning

Description

Automated spatial prediction and/or interpolation using Ensemble Machine Learning. Extends the functionality of the mlr package. Suitable for predicting numeric, binomial and factor-type variables.
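
A minimal sketch of the typical workflow (fit a model, then predict over the covariate grid), using the same meuse call as in the Examples below:

library(sp)
library(mlr)
library(landmap)
demo(meuse, echo=FALSE)
## fit an ensemble model for zinc from two gridded covariates:
m <- train.spLearner(meuse["zinc"], covariates=meuse.grid[,c("dist","ffreq")],
        lambda=0, parallel=FALSE)
## predict over the grid (returns predictions and prediction errors):
meuse.zinc <- predict(m)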

Usage

# S4 method for SpatialPointsDataFrame,ANY,SpatialPixelsDataFrame
train.spLearner(
  observations,
  formulaString,
  covariates,
  SL.library,
  family = stats::gaussian(),
  method = "stack.cv",
  predict.type,
  super.learner = "regr.lm",
  subsets = 5,
  lambda = 0.5,
  cov.model = "exponential",
  subsample = 10000,
  parallel = "multicore",
  oblique.coords = TRUE,
  nearest = FALSE,
  buffer.dist = FALSE,
  theta.list = seq(0, 180, length.out = 14) * pi/180,
  spc = TRUE,
  id = NULL,
  weights = NULL,
  n.obs = 10,
  ...
)
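
Both observations and covariates have to be sp-class objects (SpatialPointsDataFrame and SpatialPixelsDataFrame). A minimal sketch of coercing plain data frames, following the Ebergotzen pattern from the Examples below (the coordinate column names and the EPSG code are taken from that example):

library(sp)
data(eberg); data(eberg_grid)
coordinates(eberg) <- ~X+Y                      ## points -> SpatialPointsDataFrame
proj4string(eberg) <- CRS("+init=epsg:31467")
gridded(eberg_grid) <- ~x+y                     ## grid -> SpatialPixelsDataFrame
proj4string(eberg_grid) <- CRS("+init=epsg:31467")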

Arguments

observations

SpatialPointsDataFrame with the training (point) observations.

formulaString

ANY; the model formula (it can be omitted, as in the Examples below).

covariates

SpatialPixelsDataFrame with the gridded covariates.

SL.library

List of learners, e.g. c("regr.rpart", "regr.nnet", "regr.glm").

family

Model family, e.g. stats::gaussian().

method

Ensemble stacking method (see mlr::makeStackedLearner); usually "stack.cv".

predict.type

Prediction type, 'prob' or 'response'.

super.learner

Ensemble stacking model (meta-learner); usually "regr.lm".

subsets

Number of subsets for repeated cross-validation.

lambda

Target variable transformation parameter (0.5 or 1).

cov.model

Covariance model for variogram fitting.

subsample

For large datasets, consider randomly subsetting the training data to this number of points.

parallel

Initiate parallel processing ("multicore" by default); set to FALSE to disable.

oblique.coords

Specify whether to use oblique coordinates as covariates.

nearest

Specify whether to use nearest values and distances, i.e. the method of Sekulić et al. (2020).

buffer.dist

Specify whether to use buffer distances to points as covariates.

theta.list

List of angles (in radians) used to derive oblique coordinates (a sketch follows this argument list).

spc

Specify whether to apply a principal components transformation to the covariates.

id

ID column name used to control clusters of data.

weights

Optional per-row weights that learners can use to account for variable data quality.

n.obs

Number of nearest observations to be found by meteo::near.obs (10 by default).

...

Other arguments that can be passed on to mlr::makeStackedLearner.
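
For orientation, a small self-contained sketch of how the default theta.list maps to oblique geographic coordinates in the sense of Møller et al. (2020); the one-liner x*cos(theta) + y*sin(theta) used here is an illustration, not the package-internal routine:

theta.list <- seq(0, 180, length.out = 14) * pi/180   ## 14 rotation angles between 0 and 180 degrees
xy <- cbind(x = runif(10), y = runif(10))              ## dummy point coordinates
## one derived covariate per angle: the coordinate along the axis rotated by theta
OGC <- sapply(theta.list, function(theta) xy[,"x"]*cos(theta) + xy[,"y"]*sin(theta))
dim(OGC)   ## 10 points x 14 angles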

Value

Object of class "spLearner", which contains the fitted model, the variogram model and the spatial grid used for cross-validation.
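
A short sketch of how a fitted object m (as produced in the Examples below) can be inspected; the slot names follow the calls used in those Examples:

## ensemble (meta-learner) model:
summary(m@spModel$learner.model$super.model$learner.model)
## regression matrix used for variogram fitting:
str(m@vgmModel$observations@data)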

References

  • Møller, A. B., Beucher, A. M., Pouladi, N., and Greve, M. H. (2020). Oblique geographic coordinates as covariates for digital soil mapping. SOIL, 6, 269–289. doi:10.5194/soil-6-269-2020

  • Hengl, T., Nussbaum, M., Wright, M. N., Heuvelink, G. B., and Gräler, B. (2018). Random Forest as a generic framework for predictive modeling of spatial and spatio-temporal variables. PeerJ, 6:e5518. doi:10.7717/peerj.5518

  • Lu, B., & Hardin, J. (2021). A unified framework for random forest prediction error estimation. Journal of Machine Learning Research, 22(8), 1–41. https://jmlr.org/papers/v22/18-558.html

  • Meinshausen, N. (2006). Quantile regression forests. Journal of Machine Learning Research, 7(Jun), 983–999. https://jmlr.org/papers/v7/meinshausen06a.html

  • Sekulić, A., Kilibarda, M., Heuvelink, G. B., Nikolić, M., & Bajat, B. (2020). Random Forest Spatial Interpolation. Remote Sensing, 12, 1687. doi:10.3390/rs12101687

Examples

# NOT RUN {
library(rgdal)
library(mlr)
library(rpart)
library(nnet)
demo(meuse, echo=FALSE)
## Regression:
sl = c("regr.rpart", "regr.nnet", "regr.glm")
system.time( m <- train.spLearner(meuse["lead"],
      covariates=meuse.grid[,c("dist","ffreq")],
      oblique.coords = FALSE, lambda=0,
      parallel=FALSE, SL.library=sl) )
summary(m@spModel$learner.model$super.model$learner.model)
# }
# NOT RUN {
library(plotKML)
## regression-matrix:
str(m@vgmModel$observations@data)
meuse.y <- predict(m, error.type="weighted.sd")
plot(raster::raster(meuse.y$pred["response"]), col=plotKML::R_pal[["rainbow_75"]][4:20],
   main="Predictions spLearner", axes=FALSE, box=FALSE)

## Regression with default settings:
m <- train.spLearner(meuse["zinc"], covariates=meuse.grid[,c("dist","ffreq")],
        parallel=FALSE, lambda = 0)
## Ensemble model (meta-learner):
summary(m@spModel$learner.model$super.model$learner.model)
meuse.y <- predict(m)
## Plot of predictions and prediction error (RMSPE)
op <- par(mfrow=c(1,2), oma=c(0,0,0,1), mar=c(0,0,4,3))
plot(raster::raster(meuse.y$pred["response"]), col=plotKML::R_pal[["rainbow_75"]][4:20],
   main="Predictions spLearner", axes=FALSE, box=FALSE)
points(meuse, pch="+")
plot(raster::raster(meuse.y$pred["model.error"]), col=rev(bpy.colors()),
   main="Prediction errors", axes=FALSE, box=FALSE)
points(meuse, pch="+")
par(op)
while (!is.null(dev.list())) dev.off()
## Plot of prediction intervals:
pts = list("sp.points", meuse, pch = "+", col="black")
spplot(meuse.y$pred[,c("q.lwr","q.upr")], col.regions=plotKML::R_pal[["rainbow_75"]][4:20],
   sp.layout = list(pts),
   main="Prediction intervals (alpha = 0.318)")
while (!is.null(dev.list())) dev.off()

## Method from https://doi.org/10.3390/rs12101687
#library(meteo)
mN <- train.spLearner(meuse["zinc"], covariates=meuse.grid[,c("dist","ffreq")],
        parallel=FALSE, lambda=0, nearest=TRUE)
meuse.N <- predict(mN)
## Plot of predictions and prediction error (RMSPE)
op <- par(mfrow=c(1,2), oma=c(0,0,0,1), mar=c(0,0,4,3))
plot(raster::raster(meuse.N$pred["response"]), col=plotKML::R_pal[["rainbow_75"]][4:20],
   main="Predictions spLearner meteo::near.obs", axes=FALSE, box=FALSE)
points(meuse, pch="+")
plot(raster::raster(meuse.N$pred["model.error"]), col=rev(bpy.colors()),
   main="Prediction errors", axes=FALSE, box=FALSE)
points(meuse, pch="+")
par(op)
while (!is.null(dev.list())) dev.off()

## Classification:
SL.library <- c("classif.ranger", "classif.xgboost", "classif.nnTrain")
mC <- train.spLearner(meuse["soil"], covariates=meuse.grid[,c("dist","ffreq")],
   SL.library = SL.library, super.learner = "classif.glmnet", parallel=FALSE)
meuse.soil <- predict(mC)
spplot(meuse.soil$pred[grep("prob.", names(meuse.soil$pred))],
        col.regions=plotKML::SAGA_pal[["SG_COLORS_YELLOW_RED"]], zlim=c(0,1))
spplot(meuse.soil$pred[grep("error.", names(meuse.soil$pred))],
         col.regions=rev(bpy.colors()))

## SIC1997
data("sic1997")
X <- sic1997$swiss1km[c("CHELSA_rainfall","DEM")]
mR <- train.spLearner(sic1997$daily.rainfall, covariates=X, lambda=1,
         nearest = TRUE, parallel=FALSE)
summary(mR@spModel$learner.model$super.model$learner.model)
rainfall1km <- predict(mR, what="mspe")
op <- par(mfrow=c(1,2), oma=c(0,0,0,1), mar=c(0,0,4,3))
plot(raster::raster(rainfall1km$pred["response"]), col=plotKML::R_pal[["rainbow_75"]][4:20],
    main="Predictions spLearner", axes=FALSE, box=FALSE)
points(sic1997$daily.rainfall, pch="+")
plot(raster::raster(rainfall1km$pred["model.error"]), col=rev(bpy.colors()),
    main="Prediction errors", axes=FALSE, box=FALSE)
points(sic1997$daily.rainfall, pch="+")
par(op)
while (!is.null(dev.list())) dev.off()

## Ebergotzen data set
data(eberg_grid)
gridded(eberg_grid) <- ~x+y
proj4string(eberg_grid) <- CRS("+init=epsg:31467")
data(eberg)
eb.s <- sample.int(nrow(eberg), 1400)
eberg <- eberg[eb.s,]
coordinates(eberg) <- ~X+Y
proj4string(eberg) <- CRS("+init=epsg:31467")
## Binomial variable
summary(eberg$TAXGRSC)
eberg$Parabraunerde <- ifelse(eberg$TAXGRSC=="Parabraunerde", 1, 0)
X <- eberg_grid[c("PRMGEO6","DEMSRT6","TWISRT6","TIRAST6")]
mB <- train.spLearner(eberg["Parabraunerde"], covariates=X,
   family=binomial(), cov.model = "nugget", parallel=FALSE)
eberg.Parabraunerde <- predict(mB)
plot(raster::raster(eberg.Parabraunerde$pred["prob.1"]),
   col=plotKML::SAGA_pal[["SG_COLORS_YELLOW_RED"]], zlim=c(0,1))
points(eberg["Parabraunerde"], pch="+")

## Factor variable:
data(eberg)
coordinates(eberg) <- ~X+Y
proj4string(eberg) <- CRS("+init=epsg:31467")
X <- eberg_grid[c("PRMGEO6","DEMSRT6","TWISRT6","TIRAST6")]
mF <- train.spLearner(eberg["TAXGRSC"], covariates=X, parallel=FALSE)
TAXGRSC <- predict(mF)
plot(raster::stack(TAXGRSC$pred[grep("prob.", names(TAXGRSC$pred))]),
    col=plotKML::SAGA_pal[["SG_COLORS_YELLOW_RED"]], zlim=c(0,1))
plot(raster::stack(TAXGRSC$pred[grep("error.", names(TAXGRSC$pred))]),
    col=plotKML::SAGA_pal[["SG_COLORS_YELLOW_BLUE"]], zlim=c(0,0.45))
while (!is.null(dev.list())) dev.off()
# }
