PF_EM: EM Estimation with Particle Filters and Smoothers

Description

Method to estimate the hyper parameters with an EM algorithm.

Usage

PF_EM(formula, data, model = "logit", by, max_T, id, a_0, Q_0, Q,
  order = 1, control = PF_control(...), trace = 0, seed = NULL,
  type = "RW", fixed = NULL, random = NULL, Fmat, fixed_effects, G,
  theta, J, K, psi, phi, ...)

Arguments

formula

coxph like formula with Surv(tstart, tstop, event) on the left hand site of ~.

data

data.frame or environment containing the outcome and covariates.

model

either 'logit' for binary outcomes with the logistic link function, 'cloglog' for binary outcomes with the inverse cloglog link function, or 'exponential' for piecewise constant exponential distributed arrival times.

interval length of the bins in which parameters are fixed.

max_T

end of the last interval interval.

vector of ids for each row of the in the design matrix.

a_0

vector $a_0$ for the initial coefficient vector for the first iteration (optional). Default is estimates from static model (see static_glm).

Q_0

covariance matrix for the prior distribution.

initial covariance matrix for the state equation.

order

order of the random walk.

control

see PF_control.

trace

argument to get progress information. Zero will yield no info and larger integer values will yield incrementally more information.

seed

seed to set at the start of every EM iteration. See set.seed.

type

type of state model. Either "RW" for a [R]andom [W]alk or "VAR" for [V]ector [A]uto[R]egression.

fixed

two-sided formula to be used with random instead of formula. It is of the form Surv(tstart, tstop, event) ~ x or Surv(tstart, tstop, event) ~ - 1 for no fixed effects.

random

one-sided formula to be used with fixed instead of formula. It is of the form ~ z.

Fmat

starting value for $F$ when type = "VAR". See 'Details' in PF_EM.

fixed_effects

starting values for fixed effects if any. See ddFixed.

G, theta, J, K, psi, phi

parameters for a restricted type = "VAR" model. See the vignette mentioned in 'Details' of PF_EM and the examples linked to in 'See Also'.

...

optional way to pass arguments to control.

Value

An object of class PF_EM.

Warning

The function is still under development so the output and API may change.

Details

Estimates a state model of the form

$$\alpha_t = F \alpha_t + R\epsilon_t, \qquad \epsilon_t \sim N(0, Q)$$

where $F\in{\rm I\!R}^{p\times p}$ has full rank, $\alpha_t\in {\rm I\!R}^p$, $\epsilon_t\in {\rm I\!R}^r$, $r \leq p$, and $R = (e_{l_1},e_{l_2},\dots,e_{l_r})$ where $e_k$ is column from the $p$ dimensional identity matrix and $l_1<l_2<\dots<l_r$. The time zero state is drawn from

$$\alpha_0\sim N(a_0, Q_0)$$

with $Q_0 \in {\rm I\!R}^{p\times p}$. The latent states, $\alpha_t$, are related to the output through the linear predictors

$$\eta_{it} = X_t(R^+\alpha_t) + Z_t\beta$$

where $X_t\in{\rm I\!R}^{n_t\times r}$ and $Z_t{\rm I\!R}^{n_t\times c}$ are design matrices and the outcome for a individual $i$ at time $t$ is distributed according to an exponential family member given $\eta_{it}$. $\beta$ are constant coefficients.

See vignette("Particle_filtering", "dynamichazard") for details.

Examples

Run this code

# NOT RUN {
#####
# Fit model with lung data set from survival
# Warning: long-ish computation time

library(dynamichazard)
.lung <- lung[!is.na(lung$ph.ecog), ]
# standardize
.lung$age <- scale(.lung$age)

# fit
set.seed(43588155)
pf_fit <- PF_EM(
 Surv(time, status == 2) ~ ddFixed(ph.ecog) + age,
 data = .lung, by = 50, id = 1:nrow(.lung),
 Q_0 = diag(1, 2), Q = diag(.5^2, 2),
 max_T = 800,
 control = PF_control(
    N_fw_n_bw = 500, N_first = 2500, N_smooth = 5000,
    n_max = 50, eps = .001, Q_tilde = diag(.2^2, 2), est_a_0 = FALSE,
    n_threads = max(parallel::detectCores(logical = FALSE), 1)))

# Plot state vector estimates
plot(pf_fit, cov_index = 1)
plot(pf_fit, cov_index = 2)

# Plot log-likelihood
plot(pf_fit$log_likes)
# }
# NOT RUN {
######
# example with fixed intercept

# prepare data
temp <- subset(pbc, id <= 312, select=c(id, sex, time, status, edema, age))
pbc2 <- tmerge(temp, temp, id=id, death = event(time, status))
pbc2 <- tmerge(pbc2, pbcseq, id=id, albumin = tdc(day, albumin),
              protime = tdc(day, protime), bili = tdc(day, bili))
pbc2 <- pbc2[, c("id", "tstart", "tstop", "death", "sex", "edema",
                "age", "albumin", "protime", "bili")]
pbc2 <- within(pbc2, {
 log_albumin <- log(albumin)
 log_protime <- log(protime)
 log_bili <- log(bili)
})

# standardize
for(c. in c("age", "log_albumin", "log_protime", "log_bili"))
 pbc2[[c.]] <- drop(scale(pbc2[[c.]]))

# fit model with extended Kalman filter
ddfit <- ddhazard(
 Surv(tstart, tstop, death == 2) ~ ddFixed_intercept() + ddFixed(age) +
   ddFixed(edema) + ddFixed(log_albumin) + ddFixed(log_protime) + log_bili,
 pbc2, Q_0 = 100, Q = 1e-2, by = 100, id = pbc2$id,
 model = "exponential", max_T = 3600,
 control = ddhazard_control(eps = 1e-5, NR_eps = 1e-4, n_max = 1e4))
summary(ddfit)

# fit model with particle filter
set.seed(88235076)
pf_fit <- PF_EM(
  Surv(tstart, tstop, death == 2) ~ ddFixed_intercept() + ddFixed(age) +
    ddFixed(edema) + ddFixed(log_albumin) + ddFixed(log_protime) + log_bili,
  pbc2, Q_0 = 2^2, Q = ddfit$Q * 100, # use estimate from before
  by = 100, id = pbc2$id,
  model = "exponential", max_T = 3600,
  control = PF_control(
    N_fw_n_bw = 500, N_smooth = 2500, N_first = 1000, eps = 1e-3,
    method = "AUX_normal_approx_w_cloud_mean", est_a_0 = FALSE,
    Q_tilde = as.matrix(.1^2),
    n_max = 25, # just take a few iterations as an example
    n_threads = max(parallel::detectCores(logical = FALSE), 1)))

# compare results
plot(ddfit)
plot(pf_fit)
sqrt(ddfit$Q * 100)
sqrt(pf_fit$Q)
rbind(ddfit$fixed_effects, pf_fit$fixed_effects)
# }
# NOT RUN {
#####
# simulation example with `random` and `fixed` argument and a restricted
# model

# g groups with k individuals in each
g <- 3L
k <- 400L

# matrices for state equation
p <- g + 1L
G <- matrix(0., p^2, 2L)
for(i in 1:p)
  G[i + (i - 1L) * p, 1L + (i == p)] <- 1L

theta <- c(.9, .8)
# coefficients in transition density
(F. <- matrix(as.vector(G %*% theta), 4L, 4L))

J <- matrix(0., ncol = 2L, nrow = p)
J[-p, 1L] <- J[p, 2L] <- 1
psi <- c(log(c(.3, .1)))

K <- matrix(0., p * (p - 1L) / 2L, 2L)
j <- 0L
for(i in (p - 1L):1L){
  j <- j + i
  K[j, 2L] <- 1
}
K[K[, 2L] < 1, 1L] <- 1
phi <- log(-(c(.8, .3) + 1) / (c(.8, .3) - 1))

V <- diag(exp(drop(J %*% psi)))
C <- diag(1, ncol(V))
C[lower.tri(C)] <- 2/(1 + exp(-drop(K %*% phi))) - 1
C[upper.tri(C)] <- t(C)[upper.tri(C)]
(Q <- V %*% C %*% V)     # covariance matrix in transition density
cov2cor(Q)

Q_0 <- get_Q_0(Q, F.)    # time-invariant covariance matrix
beta <- c(rep(-6, g), 0) # all groups have the same long run mean intercept

# simulate state variables
set.seed(56219373)
n_periods <- 300L
alphas <- matrix(nrow = n_periods + 1L, ncol = p)
alphas[1L, ] <- rnorm(p) %*% chol(Q_0)
for(i in 1:n_periods + 1L)
  alphas[i, ] <- F. %*% alphas[i - 1L, ] + drop(rnorm(p) %*% chol(Q))

alphas <- t(t(alphas) + beta)

# plot state variables
matplot(alphas, type = "l", lty = 1)

# simulate individuals' outcome
n_obs <- g * k
df <- lapply(1:n_obs, function(i){
  # find the group
  grp <- (i - 1L) %/% (n_obs / g) + 1L

  # left-censoring
  tstart <- max(0L, sample.int((n_periods - 1L) * 2L, 1) - n_periods + 1L)

  # covariates
  x <- c(1, rnorm(1))

  # outcome (stop time and event indicator)
  osa <- NULL
  oso <- NULL
  osx <- NULL
  y <- FALSE
  for(tstop in (tstart + 1L):n_periods){
    sigmoid <- 1 / (1 + exp(- drop(x %*% alphas[tstop + 1L, c(grp, p)])))
    if(sigmoid > runif(1)){
      y <- TRUE
      break
    }
    if(.01 > runif(1L) && tstop < n_periods){
      # sample new covariate
      osa <- c(osa, tstart)
      tstart <- tstop
      oso <- c(oso, tstop)
      osx <- c(osx, x[2])
      x[2] <- rnorm(1)
    }
  }

  cbind(
    tstart = c(osa, tstart), tstop = c(oso, tstop),
    x = c(osx, x[2]), y = c(rep(FALSE, length(osa)), y), grp = grp,
    id = i)
})
df <- data.frame(do.call(rbind, df))
df$grp <- factor(df$grp)

# fit model. Start with "cheap" iterations
fit <- PF_EM(
  fixed = Surv(tstart, tstop, y) ~ x, random = ~ grp + x - 1,
  data = df, model = "logit", by = 1L, max_T = max(df$tstop),
  Q_0 = diag(1.5^2, p), id = df$id, type = "VAR",
  G = G, theta = c(.5, .5), J = J, psi = log(c(.1, .1)),
  K = K, phi = log(-(c(.4, 0) + 1) / (c(.4, 0) - 1)),
  control = PF_control(
    N_fw_n_bw = 100L, N_smooth = 100L, N_first = 500L,
    method = "AUX_normal_approx_w_cloud_mean",
    nu = 5L, # sample from multivariate t-distribution
    n_max = 100L,  averaging_start = 50L,
    smoother = "Fearnhead_O_N", eps = 1e-4, covar_fac = 1.2,
    n_threads = 4L # depends on your cpu(s)
  ),
  trace = 1L)
plot(fit$log_likes) # log-likelihood approximation at each iterations

# take more iterations with more particles
cl <- fit$call
ctrl <- cl[["control"]]
ctrl[c("N_fw_n_bw", "N_smooth", "N_first", "n_max",
       "averaging_start")] <- list(500L, 2000L, 5000L, 200L, 30L)
cl[["control"]] <- ctrl
cl[c("phi", "psi", "theta")] <- list(fit$phi, fit$psi, fit$theta)
fit_extra <- eval(cl)

plot(fit_extra$log_likes) # log-likelihood approximation at each iteration

# check estimates
sqrt(diag(fit_extra$Q))
sqrt(diag(Q))
cov2cor(fit_extra$Q)
cov2cor(Q)
fit_extra$F
F.

# plot predicted state variables
for(i in 1:p){
  plot(fit_extra, cov_index = i)
  abline(h = 0, lty = 2)
  lines(1:nrow(alphas) - 1, alphas[, i] - beta[i], lty = 3)
}
# }

Run the code above in your browser using DataLab