get_Y_AB_bounds_bootstrap: Estimate Y_A and Y_B bounds with bootstrap

Description

Estimate the confidence intervals for the cumulative distributions of Y_A and Y_B using bootstrap. Much slower than the Dvoretzky–Kiefer–Wolfowitz approach.

Usage

get_Y_AB_bounds_bootstrap(
  X_A_observed,
  X_B_observed,
  alpha = 0.05,
  EPSILON = 1e-20,
  nOfBootstrapSamples = 1000,
  ignoreMinimumLengthCheck = FALSE
)

Value

Returns a list with the following fields:

- p: values in the interval [0,1] that represent the nOfEstimationPoints points in which the densities are estimated. Useful for plotting.

- Y_A_cumulative_estimation: an array with the estimated cumulative diustribution function of Y_A from 0 to p[[i]].

- Y_A_cumulative_upper: an array with the upper bounds of confidence 1 - alpha of the cumulative density of Y_A

- Y_A_cumulative_lower: an array with the lower bounds of confidence 1 - alpha of the cumulative density of Y_A

- Y_B_cumulative_estimation: The same as Y_A_cumulative_estimation for Y_B.

- Y_B_cumulative_upper: The same as Y_A_cumulative_upper for Y_B

- Y_B_cumulative_lower: The same as Y_A_cumulative_lower for Y_B

- diff_estimation: Y_A_cumulative_estimation - Y_B_cumulative_estimation

- diff_upper: an array with the upper bounds of confidence 1 - alpha of the difference between the cumulative distributions

- diff_lower: an array with the lower bounds of confidence 1 - alpha of the difference between the cumulative distributions

Arguments

X_A_observed: array of the observed samples (real values) of X_A.
X_B_observed: array of the observed samples (real values) of X_B, it needs to have the same length as X_A.
alpha: (optional, default value 0.05) the error of the confidence interval. If alpha = 0.05 then we have 95 percent confidence interval.
EPSILON: (optional, default value 1e-20) minimum difference between two values to be considered different.
nOfBootstrapSamples: (optional, default value 1e3) how many bootstrap samples to average. Increases computation time.
ignoreMinimumLengthCheck: (optional, default value FALSE) wether to check for a minimum length in X_A and X_B.

Examples

Run this code

library(ggplot2)

### Example 1 ###
X_A_observed <- rnorm(100, mean = 2, sd = 1)
X_B_observed <- rnorm(100, mean = 2.1, sd = 0.5)
# \donttest{
 res <- get_Y_AB_bounds_bootstrap(X_A_observed, X_B_observed)
# }
# \dontshow{
# easier on computation for testing.
res <- get_Y_AB_bounds_bootstrap(X_A_observed, X_B_observed, nOfBootstrapSamples=1e2)
# }
fig1 = plot_Y_AB(res, plotDifference=FALSE)+ ggplot2::ggtitle("Example 1")
print(fig1)


# \donttest{
### Example 2 ###
# Comparing the estimations with the actual distributions for two normal distributions.
###################################
## sample size = 100 ##############
###################################
X_A_observed <- rnorm(100,mean = 1, sd = 1)
X_B_observed <- rnorm(100,mean = 1.3, sd = 0.5)
res <- get_Y_AB_bounds_bootstrap(X_A_observed, X_B_observed)

X_A_observed_large_sample <- sort(rnorm(1e4, mean = 1, sd = 1))
X_B_observed_large_sample <- sort(rnorm(1e4, mean = 1.3, sd = 0.5))
actualDistributions <- getEmpiricalCumulativeDistributions(
        X_A_observed_large_sample,
        X_B_observed_large_sample,
        nOfEstimationPoints=1e4,
        EPSILON=1e-20)


actualDistributions$Y_A_cumulative_estimation <- lm(Y_A_cumulative_estimation ~
        p + I(p^2) + I(p^3)+ I(p^4)+ I(p^5)+ I(p^6)+I(p^7)+ I(p^8),
        data = actualDistributions)$fitted.values
actualDistributions$Y_B_cumulative_estimation <- lm(Y_B_cumulative_estimation ~
        p + I(p^2) + I(p^3)+ I(p^4)+ I(p^5)+ I(p^6)+I(p^7)+ I(p^8),
        data = actualDistributions)$fitted.values

fig = plot_Y_AB(res, plotDifference=FALSE) +

geom_line(data=as.data.frame(actualDistributions),
aes(x=p, y=Y_A_cumulative_estimation, colour = "Actual Y_A", linetype="Actual Y_A")) +

geom_line(data=as.data.frame(actualDistributions),
aes(x=p, y=Y_B_cumulative_estimation, colour = "Actual Y_B", linetype="Actual Y_B")) +

scale_colour_manual("", breaks = c("X_A", "X_B","Actual Y_A", "Actual Y_B"),
values = c("X_A"="#00BFC4", "X_B"="#F8766D", "Actual Y_A"="#0000FF", "Actual Y_B"="#FF0000"))+

scale_linetype_manual("", breaks = c("X_A", "X_B","Actual Y_A", "Actual Y_B"),
values = c("X_A"="solid", "X_B"="dashed", "Actual Y_A"="solid", "Actual Y_B"="solid"))+

ggtitle("100 samples used in the estimation")
print(fig)

###################################
## sample size = 300 ##############
###################################
X_A_observed <- rnorm(300,mean = 1, sd = 1)
X_B_observed <- rnorm(300,mean = 1.3, sd = 0.5)
res <- get_Y_AB_bounds_bootstrap(X_A_observed, X_B_observed)

X_A_observed_large_sample <- sort(rnorm(1e4, mean = 1, sd = 1))
X_B_observed_large_sample <- sort(rnorm(1e4, mean = 1.3, sd = 0.5))
actualDistributions <- getEmpiricalCumulativeDistributions(
        X_A_observed_large_sample,
        X_B_observed_large_sample,
        nOfEstimationPoints=1e4,
        EPSILON=1e-20)


actualDistributions$Y_A_cumulative_estimation <- lm(Y_A_cumulative_estimation ~
        p + I(p^2) + I(p^3)+ I(p^4)+ I(p^5)+ I(p^6)+I(p^7)+ I(p^8),
        data = actualDistributions)$fitted.values

actualDistributions$Y_B_cumulative_estimation <- lm(Y_B_cumulative_estimation ~
       p + I(p^2) + I(p^3)+ I(p^4)+ I(p^5)+ I(p^6)+I(p^7)+ I(p^8),
       data = actualDistributions)$fitted.values

fig = plot_Y_AB(res, plotDifference=FALSE) +

geom_line(data=as.data.frame(actualDistributions),
aes(x=p, y=Y_A_cumulative_estimation, colour = "Actual Y_A", linetype="Actual Y_A")) +

geom_line(data=as.data.frame(actualDistributions),
aes(x=p, y=Y_B_cumulative_estimation, colour = "Actual Y_B", linetype="Actual Y_B")) +

scale_colour_manual("", breaks = c("X_A", "X_B","Actual Y_A", "Actual Y_B"),
values = c("X_A"="#00BFC4", "X_B"="#F8766D", "Actual Y_A"="#0000FF", "Actual Y_B"="#FF0000"))+

scale_linetype_manual("", breaks = c("X_A", "X_B","Actual Y_A", "Actual Y_B"),
values = c("X_A"="solid", "X_B"="dashed", "Actual Y_A"="solid", "Actual Y_B"="solid"))+

ggtitle("300 samples used in the estimation")
print(fig)
# }

Run the code above in your browser using DataLab