Learn R Programming

systemPipeR (version 1.6.0)

filterVars: Filter VCF files

Description

Convenience function for filtering VCF files based on user definable quality parameters. The function imports each VCF file into R, applies the filtering on an internally generated VRanges object and then writes the results to a new VCF file.

Usage

filterVars(args, filter, varcaller, organism)

Arguments

args
Object of class SYSargs. The paths of the input VCF files are specified under infile1(args) and the paths of the output files under outfile1(args).
filter
Character vector of length one specifying the filter syntax that will be applied to the internally created VRanges object.
varcaller
Character vector of length one specifying the variant caller used for generating the input VCFs. Currently, this argument can be assigned 'gatk', 'bcftools' or 'vartools'.
organism
Character vector specifying the organism name of the reference genome.

Value

Output files in VCF format. Their paths can be obtained with outpaths(args).

See Also

variantReport combineVarReports, varSummar

Examples

Run this code
## Alignment with BWA (sequentially on single machine)
param <- system.file("extdata", "bwa.param", package="systemPipeR")
targets <- system.file("extdata", "targets.txt", package="systemPipeR")
args <- systemArgs(sysma=param, mytargets=targets)
sysargs(args)[1]

## Not run: 
# system("bwa index -a bwtsw ./data/tair10.fasta")
# bampaths <- runCommandline(args=args)
# 
# ## Alignment with BWA (parallelized on compute cluster)
# resources <- list(walltime="20:00:00", nodes=paste0("1:ppn=", cores(args)), memory="10gb")
# reg <- clusterRun(args, conffile=".BatchJobs.R", template="torque.tmpl", Njobs=18, runid="01",
#                   resourceList=resources)
# 
# ## Variant calling with GATK
# ## The following creates in the inital step a new targets file
# ## (targets_bam.txt). The first column of this file gives the paths to
# ## the BAM files created in the alignment step. The new targets file and the
# ## parameter file gatk.param are used to create a new SYSargs
# ## instance for running GATK. Since GATK involves many processing steps, it is
# ## executed by a bash script gatk_run.sh where the user can specify the
# ## detailed run parameters. All three files are expected to be located in the
# ## current working directory. Samples files for gatk.param and
# ## gatk_run.sh are available in the subdirectory ./inst/extdata/ of the 
# ## source file of the systemPipeR package. 
# writeTargetsout(x=args, file="targets_bam.txt")
# system("java -jar CreateSequenceDictionary.jar R=./data/tair10.fasta O=./data/tair10.dict")
# # system("java -jar /opt/picard/1.81/CreateSequenceDictionary.jar R=./data/tair10.fasta O=./data/tair10.dict")
# args <- systemArgs(sysma="gatk.param", mytargets="targets_bam.txt")
# resources <- list(walltime="20:00:00", nodes=paste0("1:ppn=", 1), memory="10gb")
# reg <- clusterRun(args, conffile=".BatchJobs.R", template="torque.tmpl", Njobs=18, runid="01",
#                   resourceList=resources)
# writeTargetsout(x=args, file="targets_gatk.txt")
# 
# ## Variant calling with BCFtools
# ## The following runs the variant calling with BCFtools. This step requires in
# ## the current working directory the parameter file sambcf.param and the 
# ## bash script sambcf_run.sh.
# args <- systemArgs(sysma="sambcf.param", mytargets="targets_bam.txt")
# resources <- list(walltime="20:00:00", nodes=paste0("1:ppn=", 1), memory="10gb")
# reg <- clusterRun(args, conffile=".BatchJobs.R", template="torque.tmpl", Njobs=18, runid="01",
#                   resourceList=resources)
# writeTargetsout(x=args, file="targets_sambcf.txt")
# 
# ## Filtering of VCF files generated by GATK
# args <- systemArgs(sysma="filter_gatk.param", mytargets="targets_gatk.txt")
# filter <- "totalDepth(vr) >= 2 & (altDepth(vr) / totalDepth(vr) >= 0.8) & rowSums(softFilterMatrix(vr))==4"
# # filter <- "totalDepth(vr) >= 20 & (altDepth(vr) / totalDepth(vr) >= 0.8) & rowSums(softFilterMatrix(vr))==6"
# filterVars(args, filter, varcaller="gatk", organism="A. thaliana")
# writeTargetsout(x=args, file="targets_gatk_filtered.txt")
# 
# ## Filtering of VCF files generated by BCFtools
# args <- systemArgs(sysma="filter_sambcf.param", mytargets="targets_sambcf.txt")
# filter <- "rowSums(vr) >= 2 & (rowSums(vr[,3:4])/rowSums(vr[,1:4]) >= 0.8)"
# # filter <- "rowSums(vr) >= 20 & (rowSums(vr[,3:4])/rowSums(vr[,1:4]) >= 0.8)"
# filterVars(args, filter, varcaller="bcftools", organism="A. thaliana")
# writeTargetsout(x=args, file="targets_sambcf_filtered.txt")
# 
# ## Annotate filtered variants from GATK
# args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_gatk_filtered.txt")
# txdb <- loadDb("./data/tair10.sqlite")
# fa <- FaFile(systemPipeR::reference(args))
# variantReport(args=args, txdb=txdb, fa=fa, organism="A. thaliana")
# 
# ## Annotate filtered variants from BCFtools
# args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_sambcf_filtered.txt")
# txdb <- loadDb("./data/tair10.sqlite")
# fa <- FaFile(systemPipeR::reference(args))
# variantReport(args=args, txdb=txdb, fa=fa, organism="A. thaliana")
# 
# ## Combine results from GATK
# args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_gatk_filtered.txt")
# combineDF <- combineVarReports(args, filtercol=c(Consequence="nonsynonymous"))
# write.table(combineDF, "./results/combineDF_nonsyn_gatk.xls", quote=FALSE, row.names=FALSE, sep="\t")
# 
# ## Combine results from BCFtools
# args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_sambcf_filtered.txt")
# combineDF <- combineVarReports(args, filtercol=c(Consequence="nonsynonymous"))
# write.table(combineDF, "./results/combineDF_nonsyn_sambcf.xls", quote=FALSE, row.names=FALSE, sep="\t")
# 
# ## Summary for GATK
# args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_gatk_filtered.txt")
# write.table(varSummary(args), "./results/variantStats_gatk.xls", quote=FALSE, col.names = NA, sep="\t")
# 
# ## Summary for BCFtools
# args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_sambcf_filtered.txt")
# write.table(varSummary(args), "./results/variantStats_sambcf.xls", quote=FALSE, col.names = NA, sep="\t")
# 
# ## Venn diagram of variants
# args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_gatk_filtered.txt")
# varlist <- sapply(names(outpaths(args))[1:4], function(x) as.character(read.delim(outpaths(args)[x])$VARID))
# vennset_gatk <- overLapper(varlist, type="vennsets")
# args <- systemArgs(sysma="annotate_vars.param", mytargets="targets_sambcf_filtered.txt")
# varlist <- sapply(names(outpaths(args))[1:4], function(x) as.character(read.delim(outpaths(args)[x])$VARID))
# vennset_bcf <- overLapper(varlist, type="vennsets")
# vennPlot(list(vennset_gatk, vennset_bcf), mymain="", mysub="GATK: red; BCFtools: blue", colmode=2, ccol=c("blue", "red"))
# ## End(Not run)

Run the code above in your browser using DataLab