if (all(require(Rsamtools) &&
require(RNAseqData.HNRNPC.bam.chr14) &&
require(GenomicAlignments) &&
require(ShortRead) &&
.Platform$OS.type != "windows")) {
## ----------------------------------------------------------------------
## Iterating through a BAM file
## ----------------------------------------------------------------------
## Select a single file and set 'yieldSize' in the BamFile object.
fl <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[[1]]
bf <- BamFile(fl, yieldSize = 300000)
## bamIterator() is initalized with a BAM file and returns a function.
## The return function requires no arguments and iterates through the
## file returning data chunks the size of yieldSize.
bamIterator <- function(bf) {
done <- FALSE
if (!isOpen( bf))
open(bf)
function() {
if (done)
return(NULL)
yld <- readGAlignments(bf)
if (length(yld) == 0L) {
close(bf)
done <<- TRUE
NULL
} else yld
}
}
## Initalize the iterator.
ITER <- bamIterator(bf)
## Create a FUN that counts reads in a region of interest.
roi <- GRanges("chr14", IRanges(seq(19e6, 107e6, by = 10e6), width = 10e6))
counter <- function(reads, roi, ...) {
countOverlaps(query = roi, subject = reads)
}
## Create a MulticoreParam and call bpiterate().
bpparam <- MulticoreParam(workers = 2)
res <- bpiterate(ITER, counter, BPPARAM = bpparam, roi = roi)
## The result length is the same as the number of data chunks.
length(res)
colSums(do.call(rbind, res))
## ----------------------------------------------------------------------
## Iterating through a FASTA file
## ----------------------------------------------------------------------
## Set data chunk size with 'n' in the FastqStreamer object.
sp <- SolexaPath(system.file('extdata', package = 'ShortRead'))
fl <- file.path(analysisPath(sp), "s_1_sequence.txt")
fqs <- FastqStreamer(fl, n = 100)
## Create an iterator that returns data chunks the size of 'n'.
fastqIterator <- function(fqs) {
done <- FALSE
if (!isOpen(fqs))
open(fqs)
function() {
if (done)
return(NULL)
yld <- yield(fqs)
if (length(yld) == 0L) {
close(fqs)
done <<- TRUE
NULL
} else yld
}
}
## Initialize the iterator.
ITER <- fastqIterator(fqs)
## The processor summarizes the number of times each sequence occurs.
summary <- function(reads, ...) {
tables(reads, n = 0)$distribution
}
bpparam <- MulticoreParam(workers = 2)
bpiterate(ITER, summary, BPPARAM = bpparam)
## Results from the workers are combined on the fly when a
## REDUCE function is provided. Collapsing the data in this
## way can substantially reduce memory requirements.
fqs <- FastqStreamer(fl, n = 100)
ITER <- fastqIterator(fqs)
bpiterate(ITER, summary, BPPARAM = bpparam, REDUCE = merge, all = TRUE)
## ----------------------------------------------------------------------
## Multiple files
## ----------------------------------------------------------------------
## Currently bpiterate() is only implemented for the multi-core
## environment (i.e., MulticoreParam). A single machine may not
## have enough memory to process multiple files. In this case the
## files can be distributed over a snow cluster with bplapply()
## then processed with bpiterate() on each cluster node.
## Select a subset of files, create a BamFileList.
fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1:3]
bfl <- BamFileList(fls, yieldSize = 200000)
## Cluster size is defined by the number of files.
snowp <- SnowParam(workers = length(bfl))
## Currently bpiterate() is only supported in the multi-core environment
## and must use MulticoreParam.
myFUN <- function(file, bamIterator, counter, ...) {
library(BiocParallel) ## for bpiterate
library(Rsamtools) ## for Bam file manipulation
library(GenomicAlignments) ## for readGAlignments
ITER <- bamIterator(file)
bpiterate(ITER, counter, BPPARAM = MulticoreParam(workers=2), roi = roi)
}
## Distribute the files across the snow cluster workers with bplapply().
## Each cluster node acts as a master and spawns 2 children.
res <- bplapply(bfl, myFUN, BPPARM = snowp, bamIterator = bamIterator,
counter = counter, roi = roi)
## The result is of length 3 (# of files).
## > length(res)
## [1] 3
## Each list element is of length 5 (# of iterations of size 'yieldSize').
## > elementLengths(res)
## ERR127306 ERR127307 ERR127308
## 5 5 5
}
Run the code above in your browser using DataLab