PeakSegJointFasterOne: PeakSegJointFasterOne

Description

Run the PeakSegJointFaster heuristic optimization algorithm, which gives an approximate solution to a multi-sample Poisson maximum likelihood segmentation problem. Given S samples, this function computes a sequence of S+1 PeakSegJoint models, with 0, ..., S samples with an overlapping peak (maximum of one peak per sample).

Usage

PeakSegJointFasterOne(profiles, 
    bin.factor = 2L)

Arguments

profiles

List of data.frames with columns chromStart, chromEnd, count, or single data.frame with additional column sample.id.

bin.factor

Size of bin pyramid. Bigger values result in slower computation.

Value

List of model fit results, see examples to see how to use it.

Examples

Run this code

# NOT RUN {
library(PeakSegJoint)
data(H3K36me3.TDH.other.chunk1, envir=environment())

some.counts <- subset(
  H3K36me3.TDH.other.chunk1$counts,
  43000000 < chromEnd &
  chromStart < 43200000 &
  sample.id %in% c("McGill0023", "McGill0022", "McGill0016", "McGill0013"))

id.df <- unique(some.counts[, c("cell.type", "sample.id")])
group.list <- split(paste(id.df$sample.id), id.df$cell.type, drop=TRUE)

loss.df.list <- list()
fit.list <- list()
for(bin.factor in 2:7){
  fit.fast <- PeakSegJointFasterOne(some.counts, bin.factor)
  fit.fast$min.loss <- sum(fit.fast$peak_loss_vec)
  fit.fast$sample.loss.diff.vec <- sort(with(fit.fast, structure(
    peak_loss_vec-flat_loss_vec, names=sample.id)))
  fit.fast$group.loss.diff.vec <- sort(sapply(group.list, function(sid.vec){
    sum(fit.fast$sample.loss.diff.vec[sid.vec])
  }))
  fit.fast$sample.loss.vec <- with(fit.fast, structure(
    sum(flat_loss_vec)+cumsum(c(0, sample.loss.diff.vec)),
    names=paste0(0:length(sample.loss.diff.vec), "samples")))
  fit.fast$group.loss.vec <- with(fit.fast, structure(
    sum(flat_loss_vec)+cumsum(c(0, group.loss.diff.vec)),
    names=paste0(0:length(group.loss.diff.vec), "groups")))
  loss.df.list[[paste(bin.factor)]] <- with(fit.fast, data.frame(
    bin.factor,
    loss=sample.loss.vec,
    peaks=0:length(sample.loss.diff.vec)))
  fit.list[[paste(bin.factor)]] <- fit.fast
}
loss.df <- do.call(rbind, loss.df.list)
fit.best <- fit.list[[which.min(sapply(fit.list, "[[", "min.loss"))]]

norm.list <- list()
profile.list <- split(some.counts, some.counts$sample.id, drop=TRUE)
for(sample.id in names(profile.list)){
  one <- profile.list[[sample.id]]
  max.count <- max(one$count)
  one$count.norm <- one$count/max.count
  norm.list[[sample.id]] <- one
}
norm.df <- do.call(rbind, norm.list)

if(interactive() && require(ggplot2)){
  
  peaks.df.list <- list()
  for(n.samples in 1:length(fit.best$sample.loss.diff.vec)){
    peaks.df.list[[paste(n.samples)]] <- with(fit.best, data.frame(
      samples=n.samples,
      sample.id=names(sample.loss.diff.vec)[1:n.samples],
      chromStart=peak_start_end[1],
      chromEnd=peak_start_end[2]))
  }
  peaks <- do.call(rbind, peaks.df.list)
  best.peaks <- transform(peaks, y=samples*-0.1, what="peaks")
  ggplot()+
    ggtitle("model for each sample")+
    scale_color_manual(values=c(data="grey50",
                         peaks="deepskyblue",
                         bins="black", segments="green"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=data.frame(norm.df, what="data"))+
    geom_segment(aes(chromStart/1e3, y,
                     xend=chromEnd/1e3, yend=y,
                     color=what),
                 size=1,
                 data=best.peaks)+
    geom_text(aes(chromStart/1e3, y,
                  label=paste0(samples, " sample",
                    ifelse(samples==1, "", "s"), " "),
                  color=what),
              hjust=1,
              size=3,
              vjust=0.5,
              data=best.peaks)+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free")

  ## same thing but for each group.
  peaks.df.list <- list()
  for(n.groups in 1:length(fit.best$group.loss.diff.vec)){
    group.vec <- names(fit.best$group.loss.diff.vec[1:n.groups])
    meta.df <- do.call(rbind, lapply(group.vec, function(cell.type){
      data.frame(cell.type, sample.id=group.list[[cell.type]])
    }))
    peaks.df.list[[paste(n.groups)]] <- with(fit.best, data.frame(
      groups=n.groups,
      meta.df,
      chromStart=peak_start_end[1],
      chromEnd=peak_start_end[2]))
  }
  peaks <- do.call(rbind, peaks.df.list)
  best.peaks <- transform(peaks, y=groups*-0.1, what="peaks")
  ggplot()+
    ggtitle("model for each group")+
    scale_color_manual(values=c(data="grey50",
                         peaks="deepskyblue",
                         bins="black", segments="green"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=data.frame(norm.df, what="data"))+
    geom_segment(aes(chromStart/1e3, y,
                     xend=chromEnd/1e3, yend=y,
                     color=what),
                 size=1,
                 data=best.peaks)+
    geom_text(aes(chromStart/1e3, y,
                  label=paste0(groups, " group",
                    ifelse(groups==1, "", "s"), " "),
                  color=what),
              hjust=1,
              size=3,
              vjust=0.5,
              data=best.peaks)+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id + cell.type ~ ., scales="free")

  min.df <- subset(loss.df, peaks==max(peaks))
  ggplot()+
    geom_line(aes(peaks, loss, group=bin.factor), data=loss.df)+
    geom_text(aes(peaks, loss, label=bin.factor), data=min.df, hjust=0)

  if(require(microbenchmark)){

    N.samples.vec <- 10^seq(1, 3, by=0.5)
    max.N <- max(N.samples.vec)
    N.bases <- 10
    rmat <- function(Nr, Nc, mu){
      matrix(rpois(Nr*Nc, mu), Nr, Nc)
    }
    set.seed(1)
    big.mat <- cbind(
      rmat(max.N, N.bases, 5),
      rmat(max.N, N.bases, 10),
      rmat(max.N, N.bases, 5))
    big.df <- data.frame(
      sample.id=as.integer(row(big.mat)),
      chromStart=as.integer(col(big.mat)-1),
      chromEnd=as.integer(col(big.mat)),
      count=as.integer(big.mat))
    full.list <- ProfileList(big.df)
    time.df.list <- list()
    for(N.samples in N.samples.vec){
      partial.list <- full.list[1:N.samples]
      result <- microbenchmark(
        Heuristic=PeakSegJointHeuristicStep2(partial.list, 2L),
        Faster=PeakSegJointFasterOne(partial.list, 2L),
        times=2L)
      time.df.list[[paste(N.samples)]] <- data.frame(
        N.samples,
        result)
    }
    time.df <- do.call(rbind, time.df.list)

    ggplot()+
      geom_point(aes(
        N.samples, time/1e9, color=expr),
        data=time.df)+
      scale_x_log10()+
      scale_y_log10("seconds")
    
  }

}

# }

Run the code above in your browser using DataLab