## ---------------------------------------------------------------------
## A. readGAlignments()
## ---------------------------------------------------------------------
## Simple use:
bamfile <- system.file("extdata", "ex1.bam", package="Rsamtools",
mustWork=TRUE)
gal1 <- readGAlignments(bamfile)
gal1
names(gal1)
## Using the 'use.names' arg:
gal2 <- readGAlignments(bamfile, use.names=TRUE)
gal2
head(names(gal2))
## Using the 'param' arg to drop PCR or optical duplicates as well as
## secondary alignments, and to load additional BAM fields:
param <- ScanBamParam(flag=scanBamFlag(isDuplicate=FALSE,
isSecondaryAlignment=FALSE),
what=c("qual", "flag"))
gal3 <- readGAlignments(bamfile, param=param)
gal3
mcols(gal3)
## Using the 'param' arg to load alignments from particular regions.
## Note that if we weren't providing a 'what' argument here, all the
## BAM fields would be loaded:
which <- RangesList(seq1=IRanges(1000, 1100),
seq2=IRanges(c(1546, 1555, 1567), width=10))
param <- ScanBamParam(which=which)
gal4 <- readGAlignments(bamfile, use.names=TRUE, param=param)
gal4
## IMPORTANT NOTE: A given record is loaded one time for each region
## it overlaps with. We call this "duplicated record selection" (this
## is a scanBam() feature, readGAlignments() is based on scanBam()):
which <- IRangesList(seq2=IRanges(c(1555, 1567), width=10))
param <- ScanBamParam(which=which)
gal5 <- readGAlignments(bamfile, use.names=TRUE, param=param)
gal5 # record EAS114_26:7:37:79:581 was loaded twice
## This becomes clearer if we use 'with.which_label=TRUE' to identify
## the region in 'which' where each element in 'gal5' originates from.
gal5 <- readGAlignments(bamfile, use.names=TRUE, param=param,
with.which_label=TRUE)
gal5
## Not surprisingly, we also get "duplicated record selection" when
## 'which' contains repeated or overlapping regions. Using the same
## regions as we did for 'gal4' above, except that now we're
## repeating the region on seq1:
which <- RangesList(seq1=rep(IRanges(1000, 1100), 2),
seq2=IRanges(c(1546, 1555, 1567), width=10))
param <- ScanBamParam(which=which)
gal4b <- readGAlignments(bamfile, use.names=TRUE, param=param)
length(gal4b) # > length(gal4), because all the records overlapping
# with bases 1000 to 1100 on seq1 are now duplicated
## The "duplicated record selection" will artificially increase the
## coverage or affect other downstream results. It can be mitigated
## (but not completely eliminated) by first "reducing" the set of
## regions:
which <- reduce(which)
which
param <- ScanBamParam(which=which)
gal4c <- readGAlignments(bamfile, use.names=TRUE, param=param)
length(gal4c) # < length(gal4), because the 2 first original regions
# on seq2 were merged into a single one
## Note that reducing the set of regions didn't completely eliminate
## "duplicated record selection". Records that overlap the 2 reduced
## regions on seq2 (which$seq2) are loaded twice (like for 'gal5'
## above).
## Using the 'param' arg to load tags. Except for MF and Aq, the tags
## specified below are predefined tags (see the SAM Spec for the list
## of predefined tags and their meaning).
param <- ScanBamParam(tag=c("MF", "Aq", "NM", "UQ", "H0", "H1"),
what="isize")
gal6 <- readGAlignments(bamfile, param=param)
mcols(gal6) # "tag" cols always after "what" cols
## With a BamViews object:
fls <- system.file("extdata", "ex1.bam", package="Rsamtools",
mustWork=TRUE)
bv <- BamViews(fls,
bamSamples=DataFrame(info="test", row.names="ex1"),
auto.range=TRUE)
## Note that the "readGAlignments" method for BamViews objects
## requires the ShortRead package to be installed.
aln <- readGAlignments(bv)
aln
aln[[1]]
aln[colnames(bv)]
mcols(aln)
## ---------------------------------------------------------------------
## B. readGAlignmentPairs()
## ---------------------------------------------------------------------
galp1 <- readGAlignmentPairs(bamfile)
head(galp1)
names(galp1)
## Using the 'param' arg to drop PCR or optical duplicates as well as
## secondary alignments (dropping secondary alignments can help make the
## pairing algorithm run significantly faster):
param <- ScanBamParam(flag=scanBamFlag(isDuplicate=FALSE,
isSecondaryAlignment=FALSE))
galp2 <- readGAlignmentPairs(bamfile, use.names=TRUE, param=param)
galp2
head(galp2)
head(names(galp2))
## ---------------------------------------------------------------------
## C. readGAlignmentsList()
## ---------------------------------------------------------------------
library(pasillaBamSubset)
## 'file' as character.
fl <- untreated3_chr4()
galist1 <- readGAlignmentsList(fl)
galist1[1:3]
length(galist1)
table(elementLengths(galist1))
## When 'file' is a BamFile, 'asMates' must be TRUE. If FALSE,
## the data are treated as single-end and each list element of the
## GAlignmentsList will be of length 1. For single-end data
## use readGAlignments().
bf <- BamFile(fl, yieldSize=3, asMates=TRUE)
readGAlignmentsList(bf)
## Use a 'param' to fine tune the results.
param <- ScanBamParam(flag=scanBamFlag(isProperPair=TRUE))
galist2 <- readGAlignmentsList(fl, param=param)
length(galist2)
## ---------------------------------------------------------------------
## D. readGappedReads()
## ---------------------------------------------------------------------
greads1 <- readGappedReads(bamfile)
greads1
names(greads1)
qseq(greads1)
greads2 <- readGappedReads(bamfile, use.names=TRUE)
head(greads2)
head(names(greads2))
Run the code above in your browser using DataLab