recoup: Create genomic signal profiles in predefined or custom areas using short sequence reads

Description

This function calculates and plots signal profiles created from short sequence reads derived from Next Generation Sequencing technologies. The profiles provided are either sumarized curve profiles or heatmap profiles. Currently, recoup supports genomic profile plots for reads derived from ChIP-Seq and RNA-Seq experiments. The function uses ggplot2 and ComplexHeatmap graphics facilities for curve and heatmap coverage profiles respectively. The output list object can be reused as input to this function which will automatically recognize which profile elements needto be recalculated, saving time.

Usage

recoup(
        input,
        design = NULL,
        region = c("genebody", "tss", "tes", "custom"),
        type = c("chipseq", "rnaseq"),
        genome = c("hg18", "hg19", "hg38", "mm9" ,"mm10",
            "rn5", "dm3", "danrer7", "pantro4", "susscr3"),
        refdb = c("ensembl", "ucsc", "refseq"),
        flank = c(2000, 2000),
        fraction = 1,
        orderBy = list(
            what = c("none", "suma", "sumn", 
            "maxa","maxn", "avga", "avgn", "hcn"), 
            order = c("descending", "ascending"),
            custom = NULL
        ),
        binParams = list(
            flankBinSize = 0,
            regionBinSize = 0,
            sumStat = c("mean", "median"),
            interpolation = c("auto", "spline", "linear", 
                "neighborhood"),
            forceHeatmapBinning = TRUE,
            forcedBinSize = c(50, 200)
        ),
        selector = NULL,
        preprocessParams = list(
            normalize = c("none", "linear", 
                "downsample","sampleto"),
            sampleTo = 1e+6,
            spliceAction = c("split", "keep", "remove"),
            spliceRemoveQ = 0.75,
            bedGenome = NA,
            seed = 42
        ),
        plotParams = list(
            plot = TRUE,
            profile = TRUE,
            heatmap = TRUE,
            correlation = TRUE,
            signalScale = c("natural", "log2"),
            heatmapScale = c("common", "each" ),
            heatmapFactor = 1,
            corrScale = c("normalized", "each"),
            sumStat = c("mean", "median"),
            smooth = TRUE,
            corrSmoothPar = ifelse(is.null(design), 0.1, 
                0.5),
            singleFacet = c("none", "wrap", "grid"),
            multiFacet = c("wrap", "grid"),
            conf = TRUE,
            device = c("x11", "png", "jpg", "tiff", "bmp",
                "pdf", "ps"),
            outputDir = ".",
            outputBase = NULL
        ),
        saveParams = list(
            ranges = TRUE,
            coverage = TRUE,
            profile = TRUE,
            profilePlot = TRUE,
            heatmapPlot = TRUE,
            correlationPlot = TRUE
        ),
        kmParams = list(
            k = 0,
            nstart = 20,
            algorithm = c("Hartigan-Wong", 
                "Lloyd", "Forgy", "MacQueen"),
            iterMax = 20,
            reference = NULL,
            seed = 42
        ),
        strandedParams = list(
            strand = NULL, 
            ignoreStrand = TRUE
        ),
        ggplotParams = list(
            title = element_text(size = 12),
            axis.title.x = element_text(size = 10, 
                face = "bold"),
            axis.title.y = element_text(size = 10, 
                face = "bold"),
            axis.text.x = element_text(size = 9, 
                face = "bold"),
            axis.text.y = element_text(size = 10, 
                face = "bold"),
            strip.text.x = element_text(size = 10, 
                face = "bold"),
            strip.text.y = element_text(size = 10, 
                face = "bold"),
            legend.position = "bottom",
            panel.margin = grid::unit(1, "lines")
        ),
        complexHeatmapParams = list(
            main = list(
                cluster_rows = ifelse(length(grep(
                    "hc", orderBy$what)) > 0, TRUE, FALSE),
                cluster_columns = FALSE,
                column_title_gp = grid::gpar(fontsize = 10, 
                    font = 2),
                show_row_names = FALSE,
                show_column_names = FALSE,
                heatmap_legend_param = list(
                    color_bar = "continuous"
                )
            ),
            group=list(
                cluster_rows = ifelse(length(grep(
                    "hc", orderBy$what)) > 0, TRUE, FALSE),
                cluster_columns = FALSE,
                column_title_gp = grid::gpar(fontsize = 10, 
                    font = 2),
                show_row_names = FALSE,
                show_column_names = FALSE,
                row_title_gp = grid::gpar(fontsize = 8,
                    font = 2),
                gap = unit(5, "mm"),
                heatmap_legend_param = list(
                    color_bar = "continuous"
                )
            )
        ),
        bamParams = NULL,
        onTheFly = FALSE,
        localDbHome = file.path(path.expand("~"), ".recoup"), 
        rc = NULL
    )

Arguments

input

the main input to recoup can be either a list or a configuration file (with essentially the same contents as the list). In case of list input, it is a list of n lists, where n the number of samples. See Details for the inner list contents. Alternatively, input can be a text tab delimited file with a specific header (the same fields as each inner list when input is a list) and one row for each sample. Again, see Details section for the field specifications.

design

either a data frame with grouping factors as columns (e.g. two grouping factors can be strand, and Ensembl biotype) or a tab delimited text file with the same content (grouping factors in columns). If a data frame, the row.names attribute must correspond to the names (e.g. rownames) of the genome argument, or be a superset or subset of them. If a file, the first column must correspond to the names (e.g. rownames) of the genome argument or be a superset or subset of them.

region

one of "tss", "tes", "genebody", "custom".

type

one of "chipseq", "rnaseq".

genome

when region is "tss", "tes" or "genebody", genome can be one of "hg38", "hg19", "hg18", "mm10", "mm9", "dm3", "rn5", "danrer7", "pantro4", "susscr3" for human, mouse, fruitfly, rat, zebrafish and chimpanzee genomes respectively. When when region is one of the above or "custom", genome can be a tab delimited BED-like text file or a data frame.

refdb

one of "ensembl", "ucsc" or "refseq". It will be used to retrieve genomic reference regions when genome argument is one of the supported organisms.

flank

a vector of length two with the number of base pairs to flank upstream and downstream the region. Minimum flank is 0bp and maximum is 50kb. It is always expressed in bp.

fraction

a number from 0 to 1 (default) denoting the fraction of total data to be used. See Details for further information.

orderBy

a named list whose members control the order of the genomic regions (related to the genome and region arguments as they appear in heatmap profiles. The list has the following fields:

what: one of"none"(default),"suma","sumn","maxa","maxn","avga","avgn","hcn", wherenin"sumn","maxn","hcn"is the index of the profile which sould be used as reference. See Details for further information.
order: either"descending"(default) for ordering coverages from highest to lowest, or"ascending"for the opposite.
custom: a numeric vector of custom values (e.g. RNA abundance) that will be used to sort all the profiles. If provided,whatwill be ignored. Defaults toNULL.

binParams

a named list whose members control the resolution of the coverage profiles. The list has the following fields:

flankBinSize: the number of intervals (bins) into which the upstream and downstream regions are split and the per-base coverage is averaged across. If0(default), no binning is performed and the profiles are calculated based at the base-pair level (the highest possible resolution).
regionBinSize: the number of intervals (bins) into which the main region is split and the per-base coverage is averaged across. If0(default), no binning is performed and the profiles are calculated based at the base-pair level (the highest possible resolution).
sumStat: the statistic which is used to summarize the bin coverage. Can be"mean"(default) or"median".
interpolation: the interpolation method to be used for coverage interpolation when the reference regions are of unequal lengths (e.g. gene bodies) and theregionBinSizeis larger than some of the former. Can be"auto"(default),"spline","linear"or"neighborhood". See Details for further explanations of each option.
forceHeatmapBinning: ifTRUE(default) and the profile resolution is very high (seeflankBinSizeandregionBinSizeabove), binning is applied prior to heatmap profile generation, otherwise the heatmaps will be oversized and will take a lot of time to render. Set toFALSEif bothflankBinSizeandregionBinSizeare not zero so as to avoid unecessary profile recalculations.
forcedBinSize: a vector with two integers representing theflankBinSizeandregionBinSizeto be used withforceHeatmapBinningabove.

See Details for a few further notes in the usage of binParams.

selector

a named list whose members control some subsetting abilities regarding the input reference genomic regions (genome argument) or NULL (default) when the genome argument is/may be custom. When list, it has the following fields:

id: a vector of ids of the same type as those present in thegenomefile or organism type/version.
bioype: a vector of Ensembl biotypes that will be used to filter thegenomewhen the latter is one of the supported organisms. Not used whengenomeis a custom file.
exonType: currently not used.

preprocessParams

a named list whose members control certain preprocessing steps applied to the GRanges objects obtained while or after reading the input BAM/BED files with short reads or BigWig files with processed signals. The list has the following fields:

normalize: one of"none"(default),"linear", "downsample", "sampleto". Controls how the coverages are normalized across samples. See Details for explanation of these options.
sampleTo: a fixed library size for downsampling to be used with"sampleto"option above.
spliceAction: one of"split"(default),"keep", "remove". Controls the action to be performed with spliced reads in the case of RNA-Seq samples. See Details for explanation of these options.
spliceRemoveQ: the quantile of putative joint spliced read length to be used for read filtering whenspliceActionis"remove". See Details for further explanations.
bedGenome: one of the supported genomes, as when reading from bed files, chromosomal lenghts are not available and must be retrieved with another way.
seed: seed for random number generation to be used whenspliceActionis"downsample"or"sampleto". See Details for further explanations. Defaults to 42.

plotParams

a named list whose members control profile (curve and heatmap) plotting parameters. The list has the following fields:

plot: if set toTRUE(default), the plots created with the calculated profiles withrecoupare displayed. Set toFALSEto plot later using the output object.
profile: if set toTRUE(default), the average coverage profile across the genomic regions of preference is calculated. Set toFALSEto suppress this.
heatmap: if set toTRUE(default), the coverage heatmap profile across the genomic regions of preference is calculated. Set toFALSEto suppress this.
correlation: if set toTRUE(default), the plots created with the calculated coverage correlations are displayed. Set toFALSEto plot later using the output object.
signalScale: one of"natural"(default) or"log2"to control the signal scale of the final coverage plots. Hint: uselog2scale for RNA-Seq profiles as it produces much smoother plots.
heatmapScale: one of"common"(default) or"each". When"common", a common heatmap color scale is calculated for all samples. When"each", each heatmap has its own color scale.
heatmapFactor: a positive numeric value by which the upper color scale limit of the heatmap profile is multiplied. Defaults to1. See Details for further information.
corrScale: either"normalized"(default) or"each". Controls the scale display in coverage correlation plots. See Details for further information.
sumStat: the statistic which is used to summarize coverage matrices. Can be"mean"(default) or"median".
smooth: ifTRUE(default), the final curve profiles are smoothed using splines. Set toFALSEfor no smoothing. If the reference genomic regions are many, the differences are minimal.
corrSmoothPar: a numeric value between 0 and 1 which controls the smoothing of correlation plots. Its default value is controlled by the presence ofdesign. See Details for further information.
corrScale: either"normalized"(default) or"each". See Details for further information.
singleFacet: how shouldggplot2should facet the profiles with 1-factor design and only one sample whose profile will be plotted. Can be"none"(default),"wrap"or"grid". When"none", no gridding is applied and design factors are distinguished by colour. With more than one design factors, themultiFacetoption below is used.
multiFacet: how shouldggplot2should facet the profiles with 1-factor design and more than one samples whose profile will be plotted. Can be"wrap"(default) or"grid". 2 or 3 (3rd would be colour) factor designs are faceted with"grid".
conf: plot also confidence intervals usinggeom_ribbonin profile or correlation plots.
device: the R plotting device to redirect the plots to. Can be"x11"(default),"png", "jpg", "tiff", "bmp", "pdf", "ps".
outputDir: the directory to place profiles when the plotting device is not"x11". Defaults to".".
outputBase: the naming template for output files when the plotting device is not"x11". The extensions"_profile"and"_heatmap"will be appended to distinguish each plot type. LeaveNULL(default) for automatic filename generation.

saveParams

a named list which controls the information to be stored in the recoup output list object. The list has the following fields:

ranges: set toTRUE(default) to store theGRangesobject obtained from the BAM/BED files. Set toFALSEfor not saving. Not applicable when input is of type BigWig.
coverage: set toTRUE(default) to store theRlelist object obtained from the coverage calculations. Set toFALSEfor not saving.
profile: set toTRUE(default) to store the profile matrices exracted from coverage summarizations. Set toFALSEfor not saving. It must be present when using therecoupoutput in the plotting functionsrecoupProfileandrecoupHeatmap.
profilePlot: set toTRUE(default) to storeggplotobject containing the average coverage plot. Set toFALSEfor not saving. Must beTRUEif you wish to use therecoupoutput later withrecoupPlot.
heatmapPlot: set toTRUE(default) to storeComplexHeatmapobject containing the coverage heatmap plot. Set toFALSEfor not saving. Must beTRUEif you wish to use therecoupoutput later withrecoupPlot.
correlationPlot: set toTRUE(default) to storeggplotobject containing coverage correlation plot. Set toFALSEfor not saving. Must beTRUEif you wish to use therecoupoutput later withrecoupPlot.

See the Details section for some additional information.

kmParams

a named list which controls the execution of k-means clustering using standard R base function kmeans otherwise. The list has the following fields:

k: the number of clusters for k-means clustering. When0(default), no k-means clustering is performed.
nstart: Seekmeans.
algorithm: Seekmeans.
iterMax: See theiter.maxparameter inkmeans.
reference: which profile to use as reference for the determination of clusters and ordering. The rest of the heatmaps will be ordered according to the reference clustering. It can be either a sample id orNULL(default). If the latter, all the profile matrices are merged into one big matrix and k-means clustering is performed on that matrix.
seed: Seekmeans.

The result of k-means clustering will be appended to design as an additional field. If design is NULL, it will be created and passed to the plotting functions.

strandedParams

a named list which controls how strand information will be treated (if present). The list has the following fields:

strand: if set toNULL(default) then reads from both strands are used from the input BAM/BED files. If"+"or"-", then only the respective strands are used. Not applicable for input of type BigWig.
ignoreStrand:TRUE(default) orFALSE. Passed to theignore.strandargument in thefindOverlapsfunction used during coverage calculations.

ggplotParams

a named list with theme parameters passed to the ggplot function of the ggplot2 package. See the documentation of ggplot2 for further details. Only the parameters mentioned in the function call are used.

complexHeatmapParams

a named list with groups of parameters passed to the Heatmap function of the ComplexHeatmap package. The list has the following fields:

main:Heatmapparameters applied to each non-split (according todesign) heatmap. See therecoupfunction call for supported parameters andHeatmapfor further details.
main:Heatmapparameters applied to each split (according todesign) heatmap. See therecoupfunction call for supported parameters andHeatmapfor further details.

bamParams

BAM file read parameters passed to BamFile. See the related function. Currently this is not used.

onTheFly

Read short reads directly from BAM files when input contains paths to BAM files. In this case the storage of short reads in the output list object as a GRanges object is not possible. This argument is not used in this release and is a placeholder for the next release.

localDbHome

local path to retrieve annotation from when genome is one of the supported organisms. See also buildAnnotationStore.

fraction (0-1) of cores to use in a multicore system. It defaults to NULL (no parallelization).

Value

a named list with five members:
- data: theinputargument if it was a list or the resulting list from the unexported internalreadConfigfunction, with theranges,coverageandprofilefields filled according tosaveParams. Thisdatamember can be used again as an argument torecoup. Thecoverageandprofilefields will be recalculated according torecoupparameters but therangeswill be resued if the input files are not changed.
- design: the design data frame which is used to facet the profiles. %\item \code{plotopts}: plotting options read from %\code{\link{recoupProfile}} and %\code{\link{recoupHeatmap}} functions in order to %create the respective plots. It is a simple list %object and users can modify the parameters which %are mostly self-explanatory.
- plots: theggplot2and/orHeatmapobjects created byrecoup.
- callopts: the majority ofrecoupcall parameters. Their storage serves the reuse of arecouplist object so that only certain elements of plots are recalculated.

Details

When input is a list, is should contain as many sublists as the number of samples. Each sublist must have at least the following fields:

id: a unique identifier for each sample which should not contain whitespaces and preferably no special characters.
file: the full path to the BAM, BED of BigWig file. If the path to the BAM is a hyperlink, the BAM file must be indexed. BigWigs are already indexed.
format: one of"bam","bed"or"bigwig".

Additionally, each sublist may also contain the following fields:

name: a sample name which will appear in plots.
color: either an R color (see thecolorsfunction) or a hexadecimal color (e.g."#FF0000").

When input is a text file, this should be strictly tab-delimited (no other delimiters like comma), should contain a header with the same names (case sensitive) as the sublist fields above (id, file, format are mandatory and name, color are optional). When genome is not one of the supported organisms, it should be a text tab delimited file (only tabs supported) with a header line, or a data frame, where the basic BED field must be present, that means that at least "chromosome", "start", "end", a unique identifier column and "strand" must be present, preferably in this order. This file is read in a data frame and then passed to the makeGRangesFromDataFrame function from the GenomicRanges package which takes care of the rest. See also the makeGRangesFromDataFrame's documentation. When genome is one of the supported organisms, recoup takes care of the rest. When region is "tss", the curve and heatmap profiles are centered around the TSS of the (gene) regions provided with the genome argument, flanked accordind to the flank argument. The same applied for region="tes" where the plots are centered around the transcription end site. When region is "genebody", the profiles consist of two flanking parts (upstream of the TSS and downstream of the TES) and a middle part consisting of the gene body coverage profile. The latter is constructed by creating a fixed number of intervals (bins) along each gene and averaging the coverage of each interval. In some extreme cases (e.g. for small genes), the number of bins may be larger than the gene length. In these cases, a few zeros are distributed randomly across the number of bins to reach the predefined number of gene body intervals. When region is "custom" the behavior depends on the custom regions length. If it contains single-base intervals (e.g. ChIP-Seq peak centers), then the behavior is similar to the TSS behavior above. If it contains genomic intervals of equal or unequal size, the behavior is similar to the gene body case. The fraction parameter controls the total fraction of both total reads and genomic regions to be used for profile creation. This means that the total reads for each sample are randomly downsampled to fraction*100% of the original reads and the same applied to the input genomic regions. This practice is followed by similar packages (like ngs.plot) and serves the purpose of a quick overview of how the actual profiles look before profiling the whole genome. Regarding the orderBy parameters, for the options of the what parameter "sum" type of options order profiles according to i) the sum of coverages of all samples in each genomic region when orderBy$what="suma" or ii) the sum of coverages of sample n (e.g. 2) in each genomic region when orderBy$what="sumn" (e.g. orderBy$what="sum1"). The same apply for the "max" type of options but this time the ordering is performed according to the position of the highest coverage in each genomic profile. Ties in the position of highest coverage are broken randomly and sorting is performed with the default R sort. Similarly for "avg" type of options, the ordering is performed according to the average total coverage of a reference region. For the "hc" type of options, hierarchical clustering is performed on the selected (n) reference profile (e.g. orderBy$what="hc1") and this ordering is applied to the rest of the sample profiles. When what="none", no ordering is performed and the input order is used (genome argument). If any design is present through the design argument or k-means clustering is also performed (through the kmParams argument), the orderBy directives are applied to each sub-profile created by design or k-means clustering. Regarding the flankBinSize field of binParams, it is used only when region="genebody" or region="custom" and the custom regions are not single-base regions. This happens as when the genomic regions to be profiled are single-base regions (e.g. TSSs or ChIP-Seq peak centers), these regions are merged with the flanking areas and alltogether form the main genomic region. In these cases, only the regionBinSize field value is used. Note that when type="rnaseq" or region="genebody" or region="custom" with non single-base regions the values of flankBinSize and regionBinSize offer a fine control over how the flanking and the main regions are presented in the profiles. For example, when flankBinSize=100 and regionBinSize=100 with a gene body profile plot, the outcome will look kind of "unrealistic" as the e.g. 2kb flanking regions will look very similar to the usually larger gene bodies. On the other hand if flankBinSize=50 and regionBinSize=200, this setting will create more "realistic" gene body profiles as the flanking regions will be squished and the gene body area will look expanded. Within the same parameter group is also interpolation. When working with reference regions of different lengths (e.g. gene bodies), it happens very often that their lengths are a little to a lot smaller than the number of bins into which they should be split and averaged in order to be able to create the average curve and heatmap profiles. recoup allows for dynamic resolutions by permitting to the user to set the number of bins into which genomic areas will be binned or by allowing a per-base resolution where possible. The interpolation parameter controls what happens in such cases. When "spline", the R function spline is used, with the default method, to produce a spline interpolation of the same size as the regionBinSize option and is used as the coverage for that region. When "linear", the procedure is the same as above but using approx. When "neighborhood", a number of NA values are distributed randomly across the small area coverage vector, excluding the first and the last two positions, in order to reach regionBinSize. Then, each NA position is filled with the mean value of the two values before and the two values after the NA position, with na.rm=TRUE. This method should be avoided when >20% of the values of the extended vector are NA's as it may cause a crash. However, it should be the most accurate one in the opposite case (few NA's). When "auto" (the default), a hybrid of "spline" and "neighborhood" is applied. If the NA's constitute more than 20% of the extended vector, "spline" is used, otherwise "neighborhood". None of the above is applied to regions of equal length as there is no need for that. Regarding the usage of selector$id field, this requires some careful usage, as if the ids present there and the ids of the genome areas do not match, there will be no genomic regions left to calculate coverage profiles on and the program will crash. Regarding the usage of the preprocessParams argument, the normalize field controls how the GRanges representing the reads extracted from BAM/BED files or the signals extracted from BigWig files will be normalized. When "none", no normalization is applied and external normalization is assumed. When "linear", all the library sizes are divided by the maximum one and a normalization factor is calculated for each sample. The coverage of this sample across the input genomic regions is then multiplied by this factor. When "downsample", all libraries are downsampled to the minimum library size among samples. When "sampleto", all libraries are downsampled to a fixed number of reads. The "seed" field is used to define a seed for the generation of the random indices of reads to be removed for downsampling. It is required for reproducible results. The sampleTo field of preprocessParams tells recoup the fixed number of reads to downsample all libraries when preprocessParams$normalize="sampleTo". It defaults to 1 million reads (1e+6). The spliceAction field of preprocessParams is used to control the action to be taken in the presence of RNA-Seq spliced reads (implies type="rnaseq"). When "keep", no action is performed regarding the spliced reads (represented as very long reads spanning intronic regions in the GRanges object). When "remove", these reads are excluded from coverage calculations according to their length as follows: firstly the length distribution of all reads lengths (using the width function for GRanges) is calculated. Then the quantile defined by the field spliceRemoveQ of preprocessParams is calculated and reads above the length corresponding to this quantile are excluded. When "splice", then splice junction information inferred from CIGAR strings (if) present in the BAM files is used to splice the longer reads and calculate real coverages. This option is not available with BED files, however, BED files can contain pre-spliced reads using for example BEDTools for conversion. It should also be noted that in the case of BigWig files, only linear normalization is supported as there is no information on raw reads. Regarding the heatmapFactor option of plotParams, it controls the color scale of the heatmap as follows: the default value (1) causes the extremes of the heatmap colors to be linearly and equally distributed across the actual coverage profile values. If set smaller than 1, the the upper extreme of the coverage values (which by default maps to the upper color point) is multiplied by this factor and this new value is set as the upper color break (limit). This has the effect of decreasing the brightness of the heatmap as color is saturated before reaching the maximum coverage value. If set greater than 1, then the heatmap brightness is increased. Regarding the correlation option of plotParams, if TRUE then recoup calculates average coverage values for each reference region (row-wise in the profile matrices) instead of the average coverage in each base of the reference regions (column-wise in the profile matrices). This is particularly useful for checking whether total genome profiles for some biological factor/condition correlate with each other. This potential correlation is becoming even clearer when orderBy$what is not "none". Regarding the corrScale option of plotParams, it controls whether the average coverage curves over the set of reference genomic regions (one average coverage vale per genomic region, note the difference with the profiles where the coverage is calculated over the genomic locations themselves) should be normalized to a 0-1 scale or not. This is particularly useful when plotting data from different libraries (e.g. PolII and H3K27me1 occupancy over gene bodies) where other types of normalization (e.g. read downsampling cannot be applied). Regarding the corrSmoothPar option of plotParams, it controls the smoothing parameter for coverage correlation curves. If design is present, spline smoothing is applied (smooth.spline) with spar=0.5 else lowess smoothing is applied (lowess) with f=0.1. corrSmoothPar controls the spar and f respectively. Regarding the usage of saveParams argument, this is useful for several purposes: one is for re-using recoup without re-reading BAM/BED/BigWig files. If the ranges are present in the input object to recoup, they are not re-calculated. If not stored, the memory/storage usage is reduced but the object can be used only for simply replotting the profiles using recoupProfile and/or recoupHeatmap functions. As a note regarding parallel calculations, the number of cores assigned to recoup depends both on the number of cores and the available RAM in your system. The most RAM expensive part of recoup is currently the construction of binned profile matrices. If you have a lot of cores (e.g. 16) but less than 128Gb of RAM for this number of cores, you should avoid using all cores, especially with large BAM files. Half of them would be more appropriate. Finally, the output list of recoup can be provided as input again to recoup with some input parameters changed. recoup will then automatically recognize what has been changed and recalculate some, all or none of the genomic region profiles, depending on what input parameters have changed. For example, if any of the ordering options change (e.g. from no profile ordering to k-means clustering), then no recalculations are performed and the process is very fast. If region binning is changed (binParams$flankBinSize or binParams$regionBinSize), then only profile matrices are recalculated and coverages are maintained. If any of the preprocessParams changes, this causes all object including the short reads to be reimported and profiles recalculated from the beginning.

Examples

Run this code

# Load some sample data
data("recoup_test_data",package="recoup")

# Note: the figures that will be produced will not look 
# realistic and will be "bumpy". This is because package
# size limitations posed by Bioconductor guidelines do not
# allow for a full test dataset. Have a look at the
# vignette on how to test with more realistic data.

# TSS high resolution profile with no design
test.tss <- recoup(
    test.input,
    design=NULL,
    region="tss",
    type="chipseq",
    genome=test.genome,
    flank=c(2000,2000),
    selector=NULL,
    rc=0.5
)
# Genebody low resolution profile with 2-factor design, 
# wide genebody and more narrow flanking
test.gb <- recoup(
    test.input,
    design=test.design,
    region="genebody",
    type="chipseq",
    genome=test.genome,
    flank=c(2000,2000),
    binParams=list(flankBinSize=50,regionBinSize=150),
    orderBy=list(what="hc1"),
    selector=NULL,
    rc=0.5
)

Run the code above in your browser using DataLab