library(lmomco)
data(annMax)
# Goodness of Fit is measured by RMSE of cumulated distribution function and ?ecdf
dlf <- distLfit(annMax, cdf=TRUE, nbest=17)
distLplot(dlf, cdf=TRUE, sel=c("wak", "revgum"))
dlf$gof
x <- sort(annMax)
segments(x0=x, y0=plmomco(x, dlf$parameter$revgum), y1=ecdf(annMax)(x), col=2)
segments(x0=x, y0=plmomco(x, dlf$parameter$wak), y1=ecdf(annMax)(x), col=4, lwd=2)
plot(ecdf(annMax), add=TRUE)
# RMSE: root of average of ( errors squared ) , errors = line distances
# weights by three different weighting schemes
distLgofPlot(dlf, ranks=FALSE, weights=TRUE)
distLgof(dlf, ks=TRUE, plot=FALSE)$gof
# Kolmogorov-Smirnov Tests return slightly different values:
ks.test(annMax, "pnorm", mean(annMax), sd(annMax) )$p.value
ks.test(annMax, "cdfnor", parnor(lmoms(annMax)))$p.value
# GOF: how well do the distributions fit the original data?
pairs(dlf$gof[,1:3]) # measures of goodness of fit are correlated quite well here.
# In the next example, however, we see that it does matter which one is used.
# compare Ranks with different proportions used for GOF
par(mfrow=c(1,2))
distLgofPlot(dlf, weights=FALSE)
d <- distLgof(dlf, gofProp=0.8, plot=TRUE, weights=FALSE)
par(mfrow=c(1,1))
# effect of Proportion of values used to calculate RMSE
dlf100 <- distLfit(annMax, gofProp=1, nbest=19, breaks=10) # the default gofProp: 100%
distLgofPlot(dlf100)
dlf50 <- distLgof(dlf100, gofProp=0.5)
# so revgum, nor and rice do well on the upper half by R2, but bad by RMSE
distLplot(dlf50, breaks=10)
# The red dashed line shows the cut above which the data were used to get R2/rmse
distLplot(dlf=dlf50, cdf=TRUE)
distLplot(dlf=dlf50, selection=c("pe3","wei", "rice", "nor", "revgum"),
xlim=c(60,120), ylim=c(0.5, 1), cdf=TRUE, col=1)
dlf50$gof
compranks <- function(d)
{
gofProp <- 0.5
x <- sort(annMax, decreasing=TRUE)[ 1:(gofProp*length(annMax)) ]
tcdfs <- plmomco(x,dlf50$parameter[[d]])
ecdfs <- ecdf(annMax)(x) # Empirical CDF
# Root Mean Square Error, R squared:
berryFunctions::linReg(tcdfs, ecdfs, lwd=1, pch=16, main=d, digits=5, xlim=c(0.5, 1),
ylim=c(0.5, 1), pos1="topleft")
abline(a=0,b=1, lty=3)
c(berryFunctions::rmse(tcdfs, ecdfs), berryFunctions::rsquare(tcdfs, ecdfs))
}
dn <- rownames(dlf50$gof)
op <- par(mfrow=c(5,4), mar=rep(1,4), xaxt="n", yaxt="n")
for(i in dn) compranks(i)
par(op)
# so revgum, nor and rice systematically deviate from ECDF.
# RMSE is better to sort by than R2
## Not run: ## to save CRAN check computing time
#
# # custom weights
# cw <- c("gpa"=7, "gev"=3, "wak"=6, "wei"=4, "kap"=3.5, "gum"=3, "ray"=2.1,
# "ln3"=2, "pe3"=2.5, "gno"=4, "gam"=5)
# dlf <- distLfit(annMax, plot=FALSE, weightc=cw)
# distLgofPlot(dlf, ranks=TRUE)
#
#
# dev.new()
# distLplot(dlf50, cdf=TRUE, sel=c("pe3", "rice", "revgum"), order=T)
# x <- sort(annMax, decreasing=TRUE)[ 1:(0.5*length(annMax)) ]
# tcdfs <- plmomco(x,dlf50$parameter[["revgum"]])
# ecdfs <- ecdf(annMax)(x) # Empirical CDF
# plot(x, tcdfs, type="o", col=2)
# points(x, ecdfs)
# dev.new()
# linReg(tcdfs, ecdfs, type="o")
# abline(a=0,b=1, lty=3)
#
# dev.new(record=TRUE)
# for(i in 1:10/10) distLgof(dlf100, gofProp=i, weights=FALSE,
# main=paste("upper", i*100, "% used for R2"))
# # depending on which proportion of the data the GOF is calculated with, different
# # distributions are selected to be the 5 best.
# dev.off()
# ## End(Not run) # end dontrun
Run the code above in your browser using DataLab