grkw: Gradient of the Negative Log-Likelihood for the Kumaraswamy (Kw) Distribution

Description

Computes the gradient vector (vector of first partial derivatives) of the negative log-likelihood function for the two-parameter Kumaraswamy (Kw) distribution with parameters alpha ($\alpha$) and beta ($\beta$). This provides the analytical gradient often used for efficient optimization via maximum likelihood estimation.

Usage

grkw(par, data)

Value

Returns a numeric vector of length 2 containing the partial derivatives of the negative log-likelihood function $-\ell(\theta | \mathbf{x})$ with respect to each parameter: $(-\partial \ell/\partial \alpha, -\partial \ell/\partial \beta)$. Returns a vector of NaN if any parameter values are invalid according to their constraints, or if any value in data is not in the interval (0, 1).

Arguments

par: A numeric vector of length 2 containing the distribution parameters in the order: alpha ($\alpha > 0$), beta ($\beta > 0$).
data: A numeric vector of observations. All values must be strictly between 0 and 1 (exclusive).

Author

Lopes, J. E.

Details

The components of the gradient vector of the negative log-likelihood ($-\nabla \ell(\theta | \mathbf{x})$) for the Kw model are:

$$ -\frac{\partial \ell}{\partial \alpha} = -\frac{n}{\alpha} - \sum_{i=1}^{n}\ln(x_i) + (\beta-1)\sum_{i=1}^{n}\frac{x_i^{\alpha}\ln(x_i)}{v_i} $$ $$ -\frac{\partial \ell}{\partial \beta} = -\frac{n}{\beta} - \sum_{i=1}^{n}\ln(v_i) $$

where $v_i = 1 - x_i^{\alpha}$. These formulas represent the derivatives of $-\ell(\theta)$, consistent with minimizing the negative log-likelihood. They correspond to the relevant components of the general GKw gradient (grgkw) evaluated at $\gamma=1, \delta=0, \lambda=1$.

References

Kumaraswamy, P. (1980). A generalized probability density function for double-bounded random processes. Journal of Hydrology, 46(1-2), 79-88.

Jones, M. C. (2009). Kumaraswamy's distribution: A beta-type distribution with some tractability advantages. Statistical Methodology, 6(1), 70-81.

(Note: Specific gradient formulas might be derived or sourced from additional references).

Examples

Run this code

# \donttest{
## Example 1: Basic Gradient Evaluation

# Generate sample data
set.seed(123)
n <- 1000
true_params <- c(alpha = 2.5, beta = 3.5)
data <- rkw(n, alpha = true_params[1], beta = true_params[2])

# Evaluate gradient at true parameters
grad_true <- grkw(par = true_params, data = data)
cat("Gradient at true parameters:\n")
print(grad_true)
cat("Norm:", sqrt(sum(grad_true^2)), "\n")

# Evaluate at different parameter values
test_params <- rbind(
  c(1.5, 2.5),
  c(2.0, 3.0),
  c(2.5, 3.5),
  c(3.0, 4.0)
)

grad_norms <- apply(test_params, 1, function(p) {
  g <- grkw(p, data)
  sqrt(sum(g^2))
})

results <- data.frame(
  Alpha = test_params[, 1],
  Beta = test_params[, 2],
  Grad_Norm = grad_norms
)
print(results, digits = 4)


## Example 2: Gradient in Optimization

# Optimization with analytical gradient
fit_with_grad <- optim(
  par = c(2, 2),
  fn = llkw,
  gr = grkw,
  data = data,
  method = "BFGS",
  hessian = TRUE,
  control = list(trace = 0)
)

# Optimization without gradient
fit_no_grad <- optim(
  par = c(2, 2),
  fn = llkw,
  data = data,
  method = "BFGS",
  hessian = TRUE,
  control = list(trace = 0)
)

comparison <- data.frame(
  Method = c("With Gradient", "Without Gradient"),
  Alpha = c(fit_with_grad$par[1], fit_no_grad$par[1]),
  Beta = c(fit_with_grad$par[2], fit_no_grad$par[2]),
  NegLogLik = c(fit_with_grad$value, fit_no_grad$value),
  Iterations = c(fit_with_grad$counts[1], fit_no_grad$counts[1])
)
print(comparison, digits = 4, row.names = FALSE)


## Example 3: Verifying Gradient at MLE

mle <- fit_with_grad$par
names(mle) <- c("alpha", "beta")

# At MLE, gradient should be approximately zero
gradient_at_mle <- grkw(par = mle, data = data)
cat("\nGradient at MLE:\n")
print(gradient_at_mle)
cat("Max absolute component:", max(abs(gradient_at_mle)), "\n")
cat("Gradient norm:", sqrt(sum(gradient_at_mle^2)), "\n")


## Example 4: Numerical vs Analytical Gradient

# Manual finite difference gradient
numerical_gradient <- function(f, x, data, h = 1e-7) {
  grad <- numeric(length(x))
  for (i in seq_along(x)) {
    x_plus <- x_minus <- x
    x_plus[i] <- x[i] + h
    x_minus[i] <- x[i] - h
    grad[i] <- (f(x_plus, data) - f(x_minus, data)) / (2 * h)
  }
  return(grad)
}

# Compare at several points
test_points <- rbind(
  c(1.5, 2.5),
  c(2.0, 3.0),
  mle,
  c(3.0, 4.0)
)

cat("\nNumerical vs Analytical Gradient Comparison:\n")
for (i in 1:nrow(test_points)) {
  grad_analytical <- grkw(par = test_points[i, ], data = data)
  grad_numerical <- numerical_gradient(llkw, test_points[i, ], data)
  
  cat("\nPoint", i, ": alpha =", test_points[i, 1], 
      ", beta =", test_points[i, 2], "\n")
  
  comparison <- data.frame(
    Parameter = c("alpha", "beta"),
    Analytical = grad_analytical,
    Numerical = grad_numerical,
    Abs_Diff = abs(grad_analytical - grad_numerical),
    Rel_Error = abs(grad_analytical - grad_numerical) / 
                (abs(grad_analytical) + 1e-10)
  )
  print(comparison, digits = 8)
}


## Example 5: Gradient Path Visualization

# Create grid
alpha_grid <- seq(mle[1] - 1, mle[1] + 1, length.out = 20)
beta_grid <- seq(mle[2] - 1, mle[2] + 1, length.out = 20)
alpha_grid <- alpha_grid[alpha_grid > 0]
beta_grid <- beta_grid[beta_grid > 0]

# Compute gradient vectors
grad_alpha <- matrix(NA, nrow = length(alpha_grid), ncol = length(beta_grid))
grad_beta <- matrix(NA, nrow = length(alpha_grid), ncol = length(beta_grid))

for (i in seq_along(alpha_grid)) {
  for (j in seq_along(beta_grid)) {
    g <- grkw(c(alpha_grid[i], beta_grid[j]), data)
    grad_alpha[i, j] <- -g[1]  # Negative for gradient ascent
    grad_beta[i, j] <- -g[2]
  }
}

# Plot gradient field

plot(mle[1], mle[2], pch = 19, col = "#8B0000", cex = 1.5,
     xlim = range(alpha_grid), ylim = range(beta_grid),
     xlab = expression(alpha), ylab = expression(beta),
     main = "Gradient Vector Field", las = 1)

# Subsample for clearer visualization
step <- 2
for (i in seq(1, length(alpha_grid), by = step)) {
  for (j in seq(1, length(beta_grid), by = step)) {
    arrows(alpha_grid[i], beta_grid[j],
           alpha_grid[i] + 0.05 * grad_alpha[i, j],
           beta_grid[j] + 0.05 * grad_beta[i, j],
           length = 0.05, col = "#2E4057", lwd = 1)
  }
}

points(true_params[1], true_params[2], pch = 17, col = "#006400", cex = 1.5)
legend("topright",
       legend = c("MLE", "True"),
       col = c("#8B0000", "#006400"),
       pch = c(19, 17), bty = "n")
grid(col = "gray90")


## Example 6: Score Test Statistic

# Score test for H0: theta = theta0
theta0 <- c(2, 3)
score_theta0 <- -grkw(par = theta0, data = data)  # Score is negative gradient

# Fisher information at theta0 (using Hessian)
fisher_info <- hskw(par = theta0, data = data)

# Score test statistic
score_stat <- t(score_theta0) %*% solve(fisher_info) %*% score_theta0
p_value <- pchisq(score_stat, df = 2, lower.tail = FALSE)

cat("\nScore Test:\n")
cat("H0: alpha = 2, beta = 3\n")
cat("Score vector:", score_theta0, "\n")
cat("Test statistic:", score_stat, "\n")
cat("P-value:", format.pval(p_value, digits = 4), "\n")

# }

Run the code above in your browser using DataLab