Learn R Programming

extremeStat (version 0.6.0)

distLgof: Quality of distribution fits

Description

Calculate goodness of fit for several distributions, plot rank comparison.

Usage

distLgof(dlf, gofProp, plot = TRUE, progbars = length(dlf$dat) > 200, ks = TRUE, weightc = NA, quiet = FALSE, ...)

Arguments

dlf
List as returned by distLfit, containing the elements dat, datname, parameter, gofProp
gofProp
Overrides value in list. Proportion (0:1) of highest values in dat to compute goodness of fit (dist / ecdf) with. This enables to focus on the dist tail
plot
Call distLgofPlot? DEFAULT: TRUE
progbars
Show progress bars for each loop? DEFAULT: TRUE if n > 200
ks
Include ks.test results in dlf$gof? Computing is much faster when FALSE. DEFAULT: TRUE
weightc
Optional: a named vector with custom weights for each distribution. Are internally normalized to sum=1 after removing nonfitted dists. Names must match the parameter names from distLfit. DEFAULT: NA
quiet
Suppress notes? DEFAULT: FALSE
...
Further arguments passed to distLgofPlot

Value

List as explained in extremeStat. The added element is gof, a data.frame the root mean square error (RMSE) and R squared (R2) for the top gofProp of dat, if ks=TRUE, the p and D values from a simple ks.test, as well as weights by three different approaches for each distribution function. The weights are inverse to RMSE, weight1 for all dists, weight2 places zero weight on the worst function, weight3 on the worst half of functions.

See Also

distLgofPlot, distLfit. More complex estimates of quality of fits: Fard, M.N.P. and Holmquist, B. (2013, Chilean Journal of Statistics): Powerful goodness-of-fit tests for the extreme value distribution. http://chjs.mat.utfsm.cl/volumes/04/01/Fard_Holmquist(2013).pdf

Examples

Run this code

library(lmomco)
data(annMax)

# Goodness of Fit is measured by RMSE of cumulated distribution function and ?ecdf
dlf <- distLfit(annMax, cdf=TRUE, nbest=17)
distLplot(dlf, cdf=TRUE, sel=c("wak", "revgum"))
dlf$gof
x <- sort(annMax)
segments(x0=x, y0=plmomco(x, dlf$parameter$revgum), y1=ecdf(annMax)(x), col=2)
segments(x0=x, y0=plmomco(x, dlf$parameter$wak), y1=ecdf(annMax)(x), col=4, lwd=2)
plot(ecdf(annMax), add=TRUE)
# RMSE: root of average of ( errors squared )  ,   errors = line distances

# weights by three different weighting schemes
distLgofPlot(dlf, ranks=FALSE, weights=TRUE)
distLgof(dlf, ks=TRUE, plot=FALSE)$gof

# Kolmogorov-Smirnov Tests return slightly different values:
ks.test(annMax, "pnorm", mean(annMax), sd(annMax) )$p.value
ks.test(annMax, "cdfnor", parnor(lmoms(annMax)))$p.value


# GOF: how well do the distributions fit the original data?
pairs(dlf$gof[,1:3]) # measures of goodness of fit are correlated quite well here.
# In the next example, however, we see that it does matter which one is used.

# compare Ranks with different proportions used for GOF
par(mfrow=c(1,2))
distLgofPlot(dlf, weights=FALSE)
d <- distLgof(dlf, gofProp=0.8, plot=TRUE, weights=FALSE)
par(mfrow=c(1,1))


# effect of Proportion of values used to calculate RMSE
dlf100 <- distLfit(annMax, gofProp=1, nbest=19, breaks=10) # the default gofProp: 100%
distLgofPlot(dlf100)

dlf50 <- distLgof(dlf100, gofProp=0.5)
# so revgum, nor and rice do well on the upper half by R2, but bad by RMSE
distLplot(dlf50, breaks=10)
# The red dashed line shows the cut above which the data were used to get R2/rmse

distLplot(dlf=dlf50, cdf=TRUE)
distLplot(dlf=dlf50, selection=c("pe3","wei", "rice", "nor", "revgum"),
          xlim=c(60,120), ylim=c(0.5, 1), cdf=TRUE, col=1)
dlf50$gof

compranks <- function(d)
{
gofProp <- 0.5
x <- sort(annMax, decreasing=TRUE)[  1:(gofProp*length(annMax))  ]
tcdfs <- plmomco(x,dlf50$parameter[[d]])
ecdfs <- ecdf(annMax)(x) # Empirical CDF
# Root Mean Square Error, R squared:
berryFunctions::linReg(tcdfs, ecdfs, lwd=1, pch=16, main=d, digits=5, xlim=c(0.5, 1),
       ylim=c(0.5, 1), pos1="topleft")
abline(a=0,b=1, lty=3)
c(berryFunctions::rmse(tcdfs, ecdfs), berryFunctions::rsquare(tcdfs, ecdfs))
}
dn <- rownames(dlf50$gof)

op <- par(mfrow=c(5,4), mar=rep(1,4), xaxt="n", yaxt="n")
for(i in dn) compranks(i)
par(op)
# so revgum, nor and rice systematically deviate from ECDF.
# RMSE is better to sort by than R2


## Not run:  ## to save CRAN check computing time
# 
# # custom weights
# cw <- c("gpa"=7, "gev"=3, "wak"=6, "wei"=4, "kap"=3.5, "gum"=3, "ray"=2.1,
#         "ln3"=2, "pe3"=2.5, "gno"=4, "gam"=5)
# dlf <- distLfit(annMax, plot=FALSE, weightc=cw)
# distLgofPlot(dlf, ranks=TRUE)
# 
# 
# dev.new()
# distLplot(dlf50, cdf=TRUE, sel=c("pe3", "rice", "revgum"), order=T)
# x <- sort(annMax, decreasing=TRUE)[  1:(0.5*length(annMax))  ]
# tcdfs <- plmomco(x,dlf50$parameter[["revgum"]])
# ecdfs <- ecdf(annMax)(x) # Empirical CDF
# plot(x, tcdfs, type="o", col=2)
# points(x, ecdfs)
# dev.new()
# linReg(tcdfs, ecdfs, type="o")
# abline(a=0,b=1, lty=3)
# 
# dev.new(record=TRUE)
# for(i in 1:10/10) distLgof(dlf100, gofProp=i, weights=FALSE,
#                            main=paste("upper", i*100, "% used for R2"))
# # depending on which proportion of the data the GOF is calculated with, different
# # distributions are selected to be the 5 best.
# dev.off()
# ## End(Not run) # end dontrun

Run the code above in your browser using DataLab