metaseqr(counts, sample.list, exclude.list = NULL,
file.type = c("auto", "sam", "bam", "bed"),
path = NULL, contrast = NULL, libsize.list = NULL,
id.col = 4, gc.col = NA, name.col = NA, bt.col = NA,
annotation = c("download", "embedded"),
org = c("hg18", "hg19", "hg38", "mm9", "mm10", "rn5", "dm3",
"danrer7", "pantro4", "susscr3", "tair10", "custom"),
refdb = c("ensembl", "ucsc", "refseq"),
count.type = c("gene", "exon"),
exon.filters = list(min.active.exons = list(exons.per.gene = 5,
min.exons = 2, frac = 1/5)),
gene.filters = list(length = list(length = 500),
avg.reads = list(average.per.bp = 100, quantile = 0.25),
expression = list(median = TRUE, mean = FALSE, quantile = NA,
known = NA, custom = NA),
biotype = get.defaults("biotype.filter", org[1])),
when.apply.filter = c("postnorm", "prenorm"),
normalization = c("edaseq", "deseq", "edger", "noiseq", "nbpseq",
"each", "none"),
norm.args = NULL,
statistics = c("deseq", "edger", "noiseq", "bayseq", "limma",
"nbpseq"),
stat.args = NULL,
adjust.method = sort(c(p.adjust.methods, "qvalue")),
meta.p = if (length(statistics) > 1) c("simes", "bonferroni", "fisher",
"dperm.min", "dperm.max", "dperm.weight", "fperm", "whitlock",
"minp", "maxp", "weight", "pandora", "none") else "none",
weight = rep(1/length(statistics), length(statistics)),
nperm = 10000, reprod=TRUE, pcut = NA, log.offset = 1,
preset = NULL,
qc.plots = c("mds", "biodetection", "countsbio", "saturation",
"readnoise", "filtered", "correl", "pairwise", "boxplot",
"gcbias", "lengthbias", "meandiff", "meanvar", "rnacomp",
"deheatmap", "volcano", "biodist"),
fig.format = c("png", "jpg", "tiff", "bmp", "pdf", "ps"),
out.list = FALSE, export.where = NA,
export.what = c("annotation", "p.value", "adj.p.value",
"meta.p.value", "adj.meta.p.value", "fold.change",
"stats", "counts","flags"),
export.scale = c("natural", "log2", "log10", "vst", "rpgm"),
export.values = c("raw", "normalized"),
export.stats = c("mean", "median", "sd", "mad", "cv",
"rcv"),
export.counts.table = FALSE,
restrict.cores = 0.6, report = TRUE, report.top = 0.1,
report.template = "default", save.gene.model = TRUE,
verbose = TRUE, run.log = TRUE, ...)
annotation
parameter should strictly be
"download"
or an external file in proper format.
ii) The first n columns should contain gene or exon
annotation elements like chromosomal locations, gene
accessions, exon accessions, GC content etc. In that
case, the annotation
parameter can also be
"embedded"
. The ideal embedded annotation contains
8 columns, chromosome, gene or exon start, gene or exon
end, gene or exon accession, GC-content (fraction or
percentage), strand, HUGO gene symbol and gene biotype
(e.g. "protein_coding" or "ncRNA"). When the
annotation
parameter is "embedded", certain of
these features are mandatory (co-ordinates and
accessions). If they are not present, the pipeline will
not run. If additional elements are not present (e.g. GC
content or biotypes), certain features of metaseqr will
not be available. For example, EDASeq normalization will
not be performed based on a GC content covariate but
based on gene length which is not what the authors of
EDASeq suggest. If biotypes are not present, a lot of
diagnostic plots will not be available. If the HUGO gene
symbols are missing, the final annotation will contain
only gene accessions and thus be less comprehensible.
Generally, it's best to set the annotation
parameter to "download"
to ensure the most
comprehensible results. Counts can be a data frame
satisfying the above conditions. It is a data frame
by default when read2count
is used. counts can
also be an .RData file (output of save
function which contains static input elements (list
containing the gene model (exon counts for each gene
constructed by the construct.gene.model
function, gene and exon annotation to avoid
re-downloading and/or gene counts depending on
count.type
). This kind of input facilitates the
re-analysis of the same experiment, using different
filtering, normalization and statistical algorithms.
Finally, counts can be a list representing the gene model
(exon counts for each gene) constructed by the
construct.gene.model
function (provided for
backwards compatibility). This .RData file can be generated
by setting save.gene.model=TRUE
when performing data
analysis for the first time.sample.list <-
list(ConditionA=c("Sample_A1",
"Sample_A2",
"Sample_A3"),
ConditionB=c("Sample_B1",
"Sample_B2"),
ConditionC=c("Sample_C1",
"Sample_C2"))
. The names of the samples in list members
MUST match the column names containing the read counts in
the counts file. If they do not match, the pipeline will
either crash or at best, ignore several of your samples.
Alternative, sample.list
can be a small
tab-delimited file structured as follows: the first line
of the external tab delimited file should contain column
names (names are not important). The first column MUST
contain UNIQUE sample names and the second column MUST
contain the biological condition where each of the
samples in the first column should belong to. In this
case, the function make.sample.list
is
used. If the counts
argument is missing, the
sample.list
argument MUST be a targets text
tab-delimited file which contains the sample names, the
BAM/BED file names and the biological conditions/groups
for each sample/file. The file should be text
tab-delimited and structured as follows: the first line
of the external tab delimited file should contain column
names (names are not important). The first column MUST
contain UNIQUE sample names. The second column MUST
contain the raw BAM/BED files WITH their full path.
Alternatively, the path
argument should be
provided (see below). The third column MUST contain the
biological condition where each of the samples in the
first column should belong to.sample.list
above."auto"
for auto-guessing, "bed"
for BED
files, "sam"
for SAM files or "bam"
for BAM
files.sample.list
above is contrast
<- c("ConditionA_vs_ConditionB",
"ConditionA_vs_ConditionC",
"ConditionA_vs_ConditionB_vs_ConditionC")
. The
first element of pairwise contrasts (e.g. "ConditionA"
above) MUST be the control condition or any reference
that ConditionB is checked against. metaseqr uses this
convention to properly calculate fold changes. If it's
NULL, a contrast between the first two members of the
sample.list
will be auto-generated.sample.list
) and members are the library sizes
(the sequencing depth) for each sample. For example
libsize.list <- list(Sample_A1=32456913,
Sample_A2=4346818)
.4
which is the standard feature name
column in a BED file.counts
argument, where each gene's GC content is given. If not
provided, GC content normalization provided by EDASeq
will not be available."known"
gene filter will
not be available."biodetection"
, "countsbio"
,
"saturation"
, "filtered"
and
"biodist"
plots will not be available."download"
(default) for automatic downloading of
the annotation for the organism specified by the org
parameter (using biomaRt), ii) "embedded"
if the
annotation elements are embedded in the read counts file
or iv) a file specified by the user which should be as
similar as possible to the "download"
case, in
terms of column structure."hg18"
, "hg19"
or
"hg38"
, for mouse genomes "mm9"
, "mm10"
,
for rat genome "rn5"
, for drosophila genome
"dm3"
, for zebrafish genome "danrer7"
, for
chimpanzee genome "pantro4"
, for pig genome
"susscr3"
and for Arabidopsis thaliana genome
"tair10"
. Finally, "custom"
will instruct
metaseqR to completely ignore the org
argument and
depend solely on annotation file provided by the user."ensembl"
(default),
"ucsc"
or "refseq"
."gene"
or "exon"
.
This is a very important and mandatory parameter as it
defines the course of the workflow."prenorm"
to apply apply
the filters and exclude genes from further processing
before normalization, or "postnorm"
to apply the
filters after normalization (default). In the case of
when.apply.filter="prenorm"
, a first normalization
round is applied to a copy of the gene counts matrix in
order to derive the proper normalized values that will
constitute the several expression-based filtering
cutoffs."edaseq"
(default) for EDASeq normalization,
"deseq"
for the normalization algorithm
(individual options specified by the norm.args
argument) in the DESq package, "edger"
for the
normalization algorithms present in the edgeR package
(specified by the norm.args
argument),
"noiseq"
for the normalization algorithms present
in the NOISeq package (specified by the norm.args
argument), "nbpseq"
for the normalization
algorithms present in the NBPSeq package (specified by
the norm.args
argument) or "none"
to not
normalize the data (highly unrecommended). It can also
be "each"
where in this case, the normalization
applied will be specific to each statistical test used
(i.e. the normalization method bundled with each package
and used in its examples and documentation). The last
choice is for future use!NULL
for the defaults of
normalization
. If normalization="each"
, it
must be a named list of lists, where each sub-list
contains normalization parameters specific to each
statistical test to be used. The last choice is for
future use!"deseq"
(default) to conduct statistical
test(s) implemented in the DESeq package, "edger"
to conduct statistical test(s) implemented in the edgeR
package, "limma"
to conduct the RNA-Seq version of
statistical test(s) implemented in the limma package,
"noiseq"
to conduct statistical test(s)
implemented in the NOISeq package, "bayseq"
to
conduct statistical test(s) implemented in the baySeq
package and "nbpseq"
to conduct statistical
test(s) implemented in the NBPSeq package. In any case
individual algorithm parameters are controlled by the
contents of the stat.args
list.NULL
for the defaults of
statistics
.p.adjust.methods
or "qvalue"
from
the qvalue Bioconductor package. Defaults to "BH"
for Benjamini-Hochberg correction."simes"
(default), "bonferroni"
,
"minp"
, "maxp"
, "weight"
, "pandora"
,
"dperm.min"
, "dperm.max"
, "dperm.weight"
,
"fisher"
, "fperm"
, "whitlock"
or
"none"
. For the "fisher"
and "fperm"
methods, see the documentation of the R package MADAM. For
the "whitlock"
method, see the documentation of the
survcomp Bioconductor package. With the "maxp"
option, the final p-value is the maximum p-value out of
those returned by each statistical test. This is
equivalent to an "intersection" of the results derived
from each algorithm so as to have a final list with the
common genes returned by all statistical tests. Similarly,
when meta.p="minp"
, is equivalent to a "union" of
the results derived from each algorithm so as to have a
final list with all the genes returned by all statistical
tests. The latter can be used as a very lose statistical
threshold to aggregate results from all methods regardless
of their False Positive Rate. With the "simes"
option, the method proposed by Simes (Simes, R. J., 1986)
is used. With the "dperm.min"
, "dperm.max"
,
"dperm.weight"
options, a permutation procedure is
initialed, where nperm
permutations are performed
across the samples of the normalized counts matrix,
producing nperm
permuted instances of the initital
dataset. Then, all the chosen statistical tests are
re-executed for each permutation. The final p-value is
the number of times that the p-value of the permuted
datasets is smaller than the original dataset.
The p-value of the original dataset is created based on
the choice of one of dperm.min
, dperm.max
or dperm.weight
options. In case of
dperm.min
, the intial p-value vector is consists
of the minimum p-value resulted from the applied
statistical tests for each gene. The maximum p-value
is used with the dperm.max
option. With the
dperm.weight
option, the weight
weighting vector for each statistical test is used to
weight each p-value according to the power of
statistical tests (some might work better for a
specific dataset). Be careful as the permutation
procedure usually requires a lot of time. However,
it should be the most accurate. This method will NOT
work when there are no replicated samples across
biological conditions. In that case, use
meta.p="simes"
instead. Finally, there are the
"minp"
, "maxp"
and "weight"
options which correspond to the latter three methods
but without permutations. Generally, permutations
would be accurate to use when the experiment includes
>5 samples per condition (or even better 7-10) which
is rather rare in RNA-Seq experiments. Finally,
"pandora"
is the same as "weight"
and is
added to be in accordance with the metaseqR paper.statistics
vector containing a weight for each
statistical test. It should sum to 1. Use with
caution with the dperm.weight
parameter!
Theoretical background is not yet solid and only
experience shows improved results!meta.p="fperm"
or
meta.p="dperm"
. It defaults to 10000.meta.p="dperm.min"
, meta.p="dperm.max"
or meta.p="dperm.weight"
. Ideally one would
want to create the same set of indices for a given
dataset so as to create reproducible p-values. If
reprod=TRUE
, a fixed seed is used by
meta.perm
for all the datasets analyzed with
metaseqr
. If reprod=FALSE
, then the
p-values will not be reproducible, although statistical
significance is not expected to change for a large
number of resambling. Finally, reprod
can be
a numeric vector of seeds with the same length as
nperm
so that the user can supply his/her
own seeds.1
).preset
can be one of "all.basic"
,
"all.normal"
, "all.full"
,
"medium.basic"
, "medium.normal"
,
"medium.full"
, "strict.basic"
,
"strict.normal"
or "strict.full"
, each of
which control the strictness of the analysis and the
amount of data to be exported. For an explanation of the
presets, see the section "Presets" below."mds"
,
"biodetection"
, "rnacomp"
,
"countsbio"
, "saturation"
,
"readnoise"
, "filtered"
, "boxplot"
,
"gcbias"
, "lengthbias"
, "meandiff"
,
"meanvar"
, "deheatmap"
, "volcano"
,
"biodist"
, "venn"
. The "mds"
stands
for Mutlti-Dimensional Scaling and it creates a PCA-like
plot but using the MDS dimensionality reduction instead.
It has been succesfully used for NGS data (e.g. see the
package htSeqTools) and it shows how well samples from
the same condition cluster together. For
"biodetection"
, "countsbio"
,
"saturation"
, "rnacomp"
,
"readnoise"
, "biodist"
see the vignette of
NOISeq package. The "saturation"
case has been
rewritten in order to display more samples in a more
simple way. See the help page of
diagplot.noiseq.saturation
. In addition,
the "readnoise"
plots represent an older version
or the RNA composition plot included in older versions of
NOISeq. For "gcbias"
, "lengthbias"
,
"meandiff"
, "meanvar"
see the vignette of
EDASeq package. "lenghtbias"
is similar to
"gcbias"
but using the gene length instead of the
GC content as covariate. The "boxplot"
option
draws boxplots of log2 transformed gene counts. The
"filtered"
option draws a 4-panel figure with the
filtered genes per chromosome and per biotype, as
absolute numbers and as fractions of the genome. See also
the help page of diagplot.filtered
. The
"deheatmap"
option performs hierarchical
clustering and draws a heatmap of differentially
expressed genes. In the context of diagnostic plots, it's
useful to see if samples from the same groups cluster
together after statistical testing. The "volcano"
option draws a volcano plot for each contrast and if a
report is requested, an interactive volcano plot is
presented in the HTML report. The "venn"
option
will draw an up to 5-way Venn diagram depicting the
common and specific to each statistical algorithm genes
and for each contrast, when meta-analysis is performed.
The "correl"
option creates two correlation
graphs: the first one is a correlation heatmap (a
correlation matrix which depicts all the pairwise
correlations between each pair of samples in the counts
matrix is drawn as a clustered heatmap) and the second
one is a correlogram plot, which summarizes the
correlation matrix in the form of ellipses (for an
explanation please see the vignette/documentation of the
R package corrplot. Set qc.plots=NULL
if you don't
want any diagnostic plots created."png"
,
"jpg"
, "tiff"
, "bmp"
, "pdf"
,
"ps"
. The native format "x11"
(for direct
display) is not provided as an option as it may not
render the proper display of some diagnostic plots in
some devices."annotation"
, to bind the
annoation elements for each gene, "p.value"
, to
bind the p-values of each method, "adj.p.value"
,
to bind the multiple testing adjusted p-values,
"meta.p.value"
, to bind the combined p-value from
the meta-analysis, "adj.meta.p.value"
, to bind the
corrected combined p-value from the meta-analysis,
"fold.change"
, to bind the fold changes of each
requested contrast, "stats"
, to bind several
statistics calclulated on raw and normalized counts (see
the export.stats
argument), "counts"
, to
bind the raw and normalized counts for each sample."natural"
, "log2"
, "log10"
,
"vst"
(Variance Stabilizing Transormation, see the
documentation of DESeq package) and "rpgm"
which
is ratio of mapped reads per gene model (either the gene
length or the sum of exon lengths, depending on
count.type
argument). Note that this is not RPKM
as reads are already normalized for library size using
one of the supported normalization methods. Also,
"rpgm"
might be misleading when normalization
is other than "deseq"
."raw"
to export raw values (counts etc.) and
"normalized"
to export normalized counts."mean"
, "median"
,
"sd"
, "mad"
, "cv"
for the
Coefficient of Variation, "rcv"
for a robust
version of CV where the median and the MAD are used
instead of the mean and the standard deviation.FALSE
.restrict.cores=1
and the system does not have
sufficient RAM, the pipeline running machine might
significantly slow down.TRUE
.0.1
(top 10 genes). Set to NA
or NULL
to append all
the statistically significant genes to the HTML report.export.where/data/gene_model.RData
. This file can
be used as input to metaseqR for exon count based
analysis, in order to avoid the time consuming step of
assembling the counts for each gene from its exonsTRUE
.metaseqr
run using package log4r. Defaults to TRUE
. The
filename will be auto-generated.par
.out.list
is TRUE
, a named list whose
length is the same as the number of requested contrasts.
Each list member is named according to the corresponding
contrast and contains a data frame of differentially
expressed genes for that contrast. The contents of the
data frame are defined by the export.what,
export.scale, export.stats, export.values
parameters. If
report
is TRUE
, the output list contains
two main elements. The first is described above (the
analysis results) and the second contains the same
results but in HTML formatted tables.exon.filters
argument is a named list of
filters, where the names are the filter names and the
members are the filter parameters (named lists with
parameter name, parameter value). See the usage of the
metaseqr
function for an example of how these
lists are structured. The supported exon filters in the
current version are: i) min.active.exons
which
implements a filter for demanding m out of n exons of a
gene to have a certain read presence with parameters
exons.per.gene
, min.exons
and frac
.
The filter is described as follows: if a gene has up to
exons.per.gene
exons, then read presence is
required in at least min.exons
of them, else read
presence is required in a frac
fraction of the
total exons. With the default values, the filter
instructs that if a gene has up to 5 exons, read presence
is required in at least 2, else in at least 20 exons, in order to be accepted. More filters will be
implemented in future versions and users are encouraged
to propose exon filter ideas to the author by mail. See
metaseqr
usage for the defaults. Set
exon.filters=NULL
to not apply any exon filtering.length
which implements a length filter
where genes are accepted for further analysis if they are
above length
(its parameter) kb. ii)
avg.reads
which implements a filter where a gene
is accepted for further analysis if it has more average
reads than the quantile
of the average count
distribution per average.per.bp
base pairs. In
summary, the reads of each gene are averaged per
average.per.bp
based on each gene's length (in
case of exons, input the "gene's length" is the sum of
the lengths of exons) and the quantile
quantile of
the average counts distribution is calculated for each
sample. Genes passing the filter should have an average
read count larger than the maximum of the vector of the
quantiles calculated above. iii) expression
which
implements a filter based on the overall expression of a
gene. The parameters of this filter are: median
,
where genes below the median of the overall count
distribution are not accepted for further analysis (this
filter has been used to distinguish between "expressed"
and "not expressed" genes in several cases, e.g. (Mokry
et al., NAR, 2011) with a logical as value, mean
which is the same as median
but using the mean,
quantile
which is the same as the previous two but
using a specific quantile of the total counts
distribution, known
, where in this case, a set of
known not-expressed genes in the system under
investigation are used to estimate an expression cutoff.
This can be quite useful, as the genes are filtered based
on a "true biological" cutoff instead of a statistical
cutoff. The value of this filter is a character vector of
HUGO gene symbols (MUST be contained in the annotation,
thus it's better to use annotation="download"
)
whose counts are used to build a "null" expression
distribution. The 90th quantile of this distribution is
then the expression cutoff. This filter can be combined
with any other filter. Be careful with gene names as they
are case sensitive and must match exactly ("Pten" is
different from "PTEN"!). iv) biotype
where in this
case, genes with a certain biotype (MUST be contained in
the annotation, thus it's better to use
annotation="download"
) are excluded from the
analysis. This filter is a named list of logical, where
names are the biotypes in each genome and values are
TRUE
or FALSE
. If the biotype should be
excluded, the value should be TRUE
else
FALSE
. See the result of
get.defaults("biotype.filter","hg19")
for an
example. Finally, in future versions there will be
support for user-defined filters in the form of a
function.normalization="edaseq"
the only parameter names
are within.which
and between.which
,
controlling the withing lane/sample and between
lanes/samples normalization algorithm. In the case
of normalization="nbpseq"
, there is one
additional parameter called main.method
which can
take the calues "nbpseq"
or "nbsmyth"
.
These values correspond to the two different workflows
available in the NBPSeq package. Please, consult the
NBPSeq package documentation for further details. For the
rest of the algorithms, the parameter names are the same
as the names used in the respective packages. For
examples, please use the get.defaults
function.stat.args
is a
named list whose names are the names the algorithms used
(see the statistics
parameter). Each member is
another named list,with parameters to be used for each
statistical algorithm. Again, the names of the member
lists are parameter names and the values of the member
lists are parameter values. You should check the
documentations of DESeq, edgeR, NOISeq, baySeq, limma and
NBPSeq for these parameters. There are a few exceptions
in parameter names: In case of statistics="edger"
,
apart from the rest of the edgeR statistical testing
arguments, there is the argument main.method
which
can be either "classic"
or "glm"
, again
defining whether the binomial test or GLMs will be used
for statistical testing. For examples, please use the
get.defaults
function. When
statistics="nbpseq"
, apart from the rest arguments
of the NBPSeq functions estimate.disp
and
estimate.dispersion
, there is the argument
main.method
which can be "nbpseq"
or
"nbsmyth"
. This argument determines the parameters
to be used by the estimate.dispersion
function or
by the estimate.disp
function to estimate RNA-Seq
count dispersions. The difference between the two is that
they constitute different starting points for the two
workflows in the package NBPSeq. The first worklfow (with
main.method="nbpseq"
and the
estimate.dispersion
function is NBPSeq package
specific, while the second (with
main.method="nbsmyth"
and the estimate.disp
function is similar to the workflow of the edgeR package.
For additional information regarding the statistical
testing in NBPSeq, please consult the documentation of
the NBPSeq package. Additinally, please note that
there is currently a problem with the NBPSeq package and
the workflow that is specific to the NBPSeq package. The
problem has to do with function exporting as there are
certain functions which are not recognized from the
package internally. For this reason and until it is
fixed, only the Smyth workflow will be available with the
NBPSeq package (thus
stat.args$main.method="nbpseq"
will not
be available)!"all.basic"
to define an analysis
preset. When using analysis presets, the following
argumentsof metaseqr are overriden: exon.filters
,
gene.filters
, pcut
, export.what
,
export.scale
, export.values
,
exon.stats
. If you want to explicitly control the
above arguments, the preset
argument should be set
to NULL
(default). Following is a synopsis of the
different presets and the values of the arguments they
moderate: "all.basic"
: use all
genes (do not filter) and export all genes and basic
annotation and statistics elements. In this case, the
above described arguments become:exon.filters=NULL
gene.filters=NULL
pcut=1
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change")
export.scale=c("natural","log2")
export.values=c("normalized")
export.stats=c("mean")
"all.normal"
: use all genes (do not filter) and
export all genes and normal annotation and statistics
elements. In this case, the above described arguments
become:exon.filters=NULL
gene.filters=NULL
pcut=1
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change","stats","counts")
export.scale=c("natural","log2")
export.values=c("normalized")
export.stats=c("mean","sd","cv")
exon.filters=NULL
gene.filters=NULL
pcut=1
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change","stats","counts")
export.scale=c("natural","log2","log10","vst")
export.values=c("raw","normalized")
export.stats=c("mean","median","sd","mad","cv","rcv")
"medium.basic"
: apply a medium set of
filters and and export statistically significant genes
and basic annotation and statistics elements. In this
case, the above described arguments become:exon.filters=list(min.active.exons=list(exons.per.gene=5,min.exons=2,frac=1/5))
gene.filters=list(length=list(length=500),
avg.reads=list(average.per.bp=100,quantile=0.25),
expression=list(median=TRUE,mean=FALSE,quantile=NA,known=NA,custom=NA),
biotype=get.defaults("biotype.filter",org[1]))
pcut=0.05
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change")
export.scale=c("natural","log2")
export.values=c("normalized")
export.stats=c("mean")
"medium.normal"
: apply a medium set of filters and
and export statistically significant genes and normal
annotation and statistics elements. In this case, the
above described arguments become:exon.filters=list(min.active.exons=list(exons.per.gene=5,min.exons=2,frac=1/5))
gene.filters=list(length=list(length=500),
avg.reads=list(average.per.bp=100,quantile=0.25),
expression=list(median=TRUE,mean=FALSE,quantile=NA,known=NA,custom=NA),
biotype=get.defaults("biotype.filter",org[1]))
pcut=0.05
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change","stats","counts")
export.scale=c("natural","log2")
export.values=c("normalized")
export.stats=c("mean","sd","cv")
exon.filters=list(min.active.exons=list(exons.per.gene=5,min.exons=2,frac=1/5))
gene.filters=list(length=list(length=500),
avg.reads=list(average.per.bp=100,quantile=0.25),
expression=list(median=TRUE,mean=FALSE,quantile=NA,known=NA,custom=NA),
biotype=get.defaults("biotype.filter",org[1]))
pcut=0.05
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change","stats","counts")
export.scale=c("natural","log2","log10","vst")
export.values=c("raw","normalized")
export.stats=c("mean","median","sd","mad","cv","rcv")
"strict.basic"
: apply a strict set of
filters and and export statistically significant genes
and basic annotation and statistics elements. In this
case, the above described arguments become:exon.filters=list(min.active.exons=list(exons.per.gene=4,min.exons=2,frac=1/4))
gene.filters=list(length=list(length=750),
avg.reads=list(average.per.bp=100,quantile=0.5),
expression=list(median=TRUE,mean=FALSE,quantile=NA,known=NA,custom=NA),
biotype=get.defaults("biotype.filter",org[1]))
pcut=0.01
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change")
export.scale=c("natural","log2")
export.values=c("normalized")
export.stats=c("mean")
"strict.normal"
: apply a strict set of filters and
and export statistically significant genes and normal
annotation and statistics elements. In this case, the
above described arguments become:exon.filters=list(min.active.exons=list(exons.per.gene=4,min.exons=2,frac=1/4))
gene.filters=list(length=list(length=750),
avg.reads=list(average.per.bp=100,quantile=0.5),
expression=list(median=TRUE,mean=FALSE,quantile=NA,known=NA,custom=NA),
biotype=get.defaults("biotype.filter",org[1]))
pcut=0.01
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change","stats","counts")
export.scale=c("natural","log2")
export.values=c("normalized")
export.stats=c("mean","sd","cv")
exon.filters=list(min.active.exons=list(exons.per.gene=4,min.exons=2,frac=1/4))
gene.filters=list(length=list(length=750),
avg.reads=list(average.per.bp=100,quantile=0.5),
expression=list(median=TRUE,mean=FALSE,quantile=NA,known=NA,custom=NA),
biotype=get.defaults("biotype.filter",org[1]))
pcut=0.01
export.what=c("annotation","p.value","adj.p.value","meta.p.value",
"adj.meta.p.value","fold.change","stats","counts")
export.scale=c("natural","log2","log10","vst")
export.values=c("raw","normalized")
export.stats=c("mean","median","sd","mad","cv","rcv")
# An example pipeline with exon counts
data("hg19.exon.data",package="metaseqR")
metaseqr(
counts=hg19.exon.counts,
sample.list=list(normal="normal",paracancerous="paracancerous",cancerous="cancerous"),
contrast=c("normal_vs_paracancerous","normal_vs_cancerous",
"normal_vs_paracancerous_vs_cancerous"),
libsize.list=libsize.list.hg19,
id.col=4,
annotation="download",
org="hg19",
count.type="exon",
normalization="edaseq",
statistics="deseq",
pcut=0.05,
qc.plots=c("mds", "biodetection", "countsbio", "saturation", "rnacomp",
"boxplot", "gcbias", "lengthbias", "meandiff", "readnoise","meanvar",
"readnoise", "deheatmap", "volcano", "biodist", "filtered"),
fig.format=c("png","pdf"),
export.what=c("annotation","p.value","adj.p.value","fold.change","stats",
"counts"),
export.scale=c("natural","log2","log10","vst"),
export.values=c("raw","normalized"),
export.stats=c("mean","median","sd","mad","cv","rcv"),
restrict.cores=0.8,
gene.filters=list(
length=list(
length=500
),
avg.reads=list(
average.per.bp=100,
quantile=0.25
),
expression=list(
median=TRUE,
mean=FALSE
),
biotype=get.defaults("biotype.filter","hg18")
)
)
# An example pipeline with gene counts
data("mm9.gene.data",package="metaseqR")
result <- metaseqr(
counts=mm9.gene.counts,
sample.list=list(e14.5=c("e14.5_1","e14.5_2"), adult_8_weeks=c("a8w_1","a8w_2")),
contrast=c("e14.5_vs_adult_8_weeks"),
libsize.list=libsize.list.mm9,
annotation="download",
org="mm9",
count.type="gene",
normalization="edger",
statistics=c("deseq","edger","noiseq"),
meta.p="fisher",
pcut=0.05,
fig.format=c("png","pdf"),
export.what=c("annotation","p.value","meta.p.value","adj.meta.p.value",
"fold.change"),
export.scale=c("natural","log2"),
export.values="normalized",
export.stats=c("mean","sd","cv"),
export.where=getwd(),
restrict.cores=0.8,
gene.filters=list(
length=list(
length=500
),
avg.reads=list(
average.per.bp=100,
quantile=0.25
),
expression=list(
median=TRUE,
mean=FALSE,
quantile=NA,
known=NA,
custom=NA
),
biotype=get.defaults("biotype.filter","mm9")
),
out.list=TRUE
)
head(result$data[["e14.5_vs_adult_8_weeks"]])
Run the code above in your browser using DataLab