library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
## ---------------------------------------------------------------------
## Variants in all gene regions
## ---------------------------------------------------------------------
## Read variants from a VCF file.
fl <- system.file("extdata", "gl_chr1.vcf",
package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
## Often the seqlevels in the VCF file do not match those in the TxDb.
head(seqlevels(vcf))
head(seqlevels(txdb))
intersect(seqlevels(vcf), seqlevels(txdb))
## Rename seqlevels with renameSeqlevesl().
vcf <- renameSeqlevels(vcf, paste0("chr", seqlevels(vcf)))
## Confirm.
intersect(seqlevels(vcf), seqlevels(txdb))
## Overlaps for all possible variant locations.
loc_all <- locateVariants(vcf, txdb, AllVariants())
table(loc_all$LOCATION)
## ---------------------------------------------------------------------
## Variants in intergenic regions
## ---------------------------------------------------------------------
## Intergenic variants do not overlap a gene range in the
## annotation and therefore 'GENEID' is always NA. Flanking genes
## that fall within the 'upstream' and 'downstream' distances are
## reported as PRECEDEID and FOLLOWID.
region <- IntergenicVariants(upstream=70000, downstream=70000)
loc_int <- locateVariants(vcf, txdb, region)
mcols(loc_int)[c("LOCATION", "PRECEDEID", "FOLLOWID")]
## Distance to the flanking genes can be computed for variants that
## have PRECEDEID(s) or FOLLOWID(s). Each variant can have multiple
## flanking id's so we first expand PRECEDEID and the corresponding
## variant ranges.
p_ids <- unlist(loc_int$PRECEDEID, use.names=FALSE)
exp_ranges <- rep(loc_int, elementNROWS(loc_int$PRECEDEID))
## Provide the variant GRanges as 'x' and the TxDb annotation as 'y'
## to distance(). The 'id' and 'type' arguments describe an id found
## in the TxDb. See the ?'nearest-methods' man page in GenomicFeatures
## for details.
p_dist <- distance(exp_ranges, txdb, id=p_ids, type="gene")
head(p_dist)
## Expanded view of ranges, gene id and distance:
exp_ranges$PRECEDE_DIST <- p_dist
exp_ranges
## Collapsed view of ranges, gene id and distance:
loc_int$PRECEDE_DIST <- relist(p_dist, loc_int$PRECEDEID)
loc_int
## ---------------------------------------------------------------------
## GRangesList as subject
## ---------------------------------------------------------------------
## When 'subject' is a GRangesList the GENEID is unavailable and
## will always be reported as NA. This is because the GRangesList
## objects are extractions of region-by-transcript, not region-by-gene.
## Not run:
# cdsbytx <- cdsBy(txdb)
# locateVariants(vcf, cdsbytx, CodingVariants())
#
# intbytx <- intronsByTranscript(txdb)
# locateVariants(vcf, intbytx, IntronVariants())
# ## End(Not run)
## ---------------------------------------------------------------------
## Using the cache
## ---------------------------------------------------------------------
## When processing multiple VCF files, the 'cache' can be used
## to store the extracted components of the TxDb
## (i.e., cds by tx, introns by tx etc.). This avoids having to
## re-extract these GRangesLists during each loop.
## Not run:
# myenv <- new.env()
# files <- list(vcf1, vcf2, vcf3)
# lapply(files,
# function(fl) {
# vcf <- readVcf(fl, "hg19")
# ## modify seqlevels to match TxDb
# seqlevels(vcf_mod) <- paste0("chr", seqlevels(vcf))
# locateVariants(vcf_mod, txdb, AllVariants(), cache=myenv)
# })
# ## End(Not run)
## ---------------------------------------------------------------------
## Parallel implmentation
## ---------------------------------------------------------------------
## Not run:
# library(BiocParallel)
#
# ## A connection to a TxDb object is established when
# ## the package is loaded. Because each process reading from an
# ## sqlite db must have a unique connection the TxDb
# ## object cannot be passed as an argument when running in
# ## parallel. Instead the package must be loaded on each worker.
#
# ## The overhead of the multiple loading may defeat the
# ## purpose of running the job in parallel. An alternative is
# ## to instead pass the appropriate GRangesList as an argument.
# ## The details section on this man page under the heading
# ## 'Subject as GRangesList' explains what GRangesList is
# ## appropriate for each variant type.
#
# ## A. Passing a GRangesList:
#
# fun <- function(x, subject, ...)
# locateVariants(x, subject, IntronVariants())
#
# library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# grl <- intronsByTranscript(TxDb.Hsapiens.UCSC.hg19.knownGene)
# mclapply(c(vcf, vcf), fun, subject=grl)
#
#
# ## B. Passing a TxDb:
#
# ## Forking:
# ## In the case of forking, the TxDb cannot be loaded
# ## in the current workspace.
# ## To detach the NAMESPACE:
# ## unloadNamespace("TxDb.Hsapiens.UCSC.hg19.knownGene")
#
# fun <- function(x) {
# library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# locateVariants(x, TxDb.Hsapiens.UCSC.hg19.knownGene,
# IntronVariants())
# }
# mclapply(c(vcf, vcf), fun)
#
# ## Clusters:
# cl <- makeCluster(2, type = "SOCK")
# fun <- function(query, subject, region) {
# library(VariantAnnotation)
# library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# locateVariants(query, TxDb.Hsapiens.UCSC.hg19.knownGene, region)
# }
# parLapply(cl, c(vcf, vcf), fun, region=IntronVariants())
# stopCluster(cl)
# ## End(Not run)
Run the code above in your browser using DataLab