... | ... |
@@ -11,13 +11,10 @@ BSmooth(BSseq, |
11 | 11 |
h = 1000, |
12 | 12 |
maxGap = 10^8, |
13 | 13 |
keep.se = FALSE, |
14 |
- verbose = TRUE, |
|
15 | 14 |
BPPARAM = bpparam(), |
16 |
- BACKEND = getRealizationBackend(), |
|
17 |
- ..., |
|
18 |
- parallelBy = c("sample", "chromosome"), |
|
19 |
- mc.preschedule = FALSE, |
|
20 |
- mc.cores = 1) |
|
15 |
+ chunkdim = NULL, |
|
16 |
+ level = NULL, |
|
17 |
+ verbose = getOption("verbose")) |
|
21 | 18 |
} |
22 | 19 |
\arguments{ |
23 | 20 |
\item{BSseq}{An object of class \code{BSseq}.} |
... | ... |
@@ -25,17 +22,9 @@ BSmooth(BSseq, |
25 | 22 |
\item{h}{The minimum smoothing window, in bases.} |
26 | 23 |
\item{maxGap}{The maximum gap between two methylation loci, before the smoothing is broken across the gap. |
27 | 24 |
The default smoothes each chromosome separately.} |
28 |
- \item{parallelBy}{\strong{Deprecated}. |
|
29 |
- See section 'Parallelization and progress monitoring' for further details.} |
|
30 |
- \item{mc.preschedule}{\strong{Deprecated}. |
|
31 |
- See section 'Parallelization and progress monitoring' for further details.} |
|
32 |
- \item{mc.cores}{\strong{Deprecated}. |
|
33 |
- See section 'Parallelization and progress monitoring' for further details.} |
|
34 | 25 |
\item{keep.se}{Should the estimated standard errors from the smoothing |
35 | 26 |
algorithm be kept. This will make the return object roughly 30 |
36 | 27 |
percent bigger and is currently not be used for anything in \pkg{bsseq}.} |
37 |
- \item{verbose}{\strong{Deprecated}. |
|
38 |
- See section, 'Parallelization and progress monitoring' for further details.} |
|
39 | 28 |
\item{BPPARAM}{An optional \linkS4class{BiocParallelParam} instance |
40 | 29 |
determining the parallel back-end to be used during evaluation. Currently |
41 | 30 |
supported are \linkS4class{SerialParam} (Unix, Mac, Windows), |
... | ... |
@@ -44,13 +33,18 @@ BSmooth(BSseq, |
44 | 33 |
\linkS4class{BatchJobsParam} (Unix, Mac, Windows, only with the in-memory |
45 | 34 |
realization backend). See sections 'Parallelization and progress |
46 | 35 |
monitoring' and 'Realization backends' for further details.} |
47 |
- \item{BACKEND}{\code{NULL} or a single string specifying the name of the |
|
48 |
- realization backend. When the backend is set to \code{NULL}, the |
|
49 |
- \code{coef} and \code{se.coef} assays are realized in memory as ordinary |
|
50 |
- matrices, otherwise these are realized with the given \code{BACKEND}.} |
|
51 |
- \item{...}{Additional arguments passed to the \linkS4class{RealizationSink} |
|
52 |
- constructor. |
|
53 |
- See section 'Realization backends' for further details.} |
|
36 |
+ \item{chunkdim}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.} |
|
37 |
+ The dimensions of the chunks to use for writing the data to |
|
38 |
+ disk. By default, \code{\link[HDF5Array]{getHDF5DumpChunkDim}()} using the |
|
39 |
+ dimensions of the returned \linkS4class{BSseq} object will be used. See |
|
40 |
+ \code{?\link[HDF5Array]{getHDF5DumpChunkDim}} for more information.} |
|
41 |
+ \item{level}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.} |
|
42 |
+ The compression level to use for writing the data to disk. By |
|
43 |
+ default, \code{\link[HDF5Array]{getHDF5DumpCompressionLevel}()} will be |
|
44 |
+ used. See \code{?\link[HDF5Array]{getHDF5DumpCompressionLevel}} for more |
|
45 |
+ information.} |
|
46 |
+ \item{verbose}{A \code{logical(1)} indicating whether progress messages |
|
47 |
+ should be printed (default \code{TRUE}).} |
|
54 | 48 |
} |
55 | 49 |
\details{ |
56 | 50 |
\code{ns} and \code{h} are passed to the \code{locfit} function. The |
... | ... |
@@ -141,14 +141,15 @@ slots for representing smoothed data. This class is an extension of |
141 | 141 |
\item{\code{strandCollapse(BSseq, shift = TRUE)}}{ |
142 | 142 |
|
143 | 143 |
This function operates on a \code{BSseq} objects which has |
144 |
- stranded loci (ie. loci where the strand is one of \sQuote{+} pr |
|
145 |
- \sQuote{-}). It will collapse the methylation and coverage |
|
146 |
- information across the two strands, into one position. The |
|
147 |
- argument \code{shift} indicates whether the positions for the loci |
|
148 |
- on the reverse strand should be shifted one (ie. the positions for |
|
149 |
- these loci are the positions of the \sQuote{G} in the |
|
150 |
- \sQuote{CpG}; this is the case for Bismark output for example.} |
|
144 |
+ stranded loci (i.e. loci where the strand is one of \sQuote{+} or |
|
145 |
+ \sQuote{-}). It will collapse the methylation and coverage |
|
146 |
+ information across the two strands, unstranding the loci in the process |
|
147 |
+ and potentially re-ordering them. |
|
151 | 148 |
|
149 |
+ The argument \code{shift} indicates whether the positions for the loci |
|
150 |
+ on the reverse strand should be shifted one (i.e. the positions for |
|
151 |
+ these loci are the positions of the \sQuote{G} in the |
|
152 |
+ \sQuote{CpG}; this is the case for Bismark output for example).} |
|
152 | 153 |
} |
153 | 154 |
} |
154 | 155 |
\section{Coercion}{ |
... | ... |
@@ -183,18 +184,17 @@ M <- matrix(1:9, 3,3) |
183 | 184 |
colnames(M) <- c("A1", "A2", "A3") |
184 | 185 |
BStest <- BSseq(pos = 1:3, chr = c("chr1", "chr2", "chr1"), M = M, Cov = M + 2) |
185 | 186 |
chrSelectBSseq(BStest, seqnames = "chr1", order = TRUE) |
186 |
-collapseBSseq(BStest, columns = c("A1" = "A", "A2" = "A", "A3" = "B")) |
|
187 |
+collapseBSseq(BStest, group = c("A", "A", "B")) |
|
187 | 188 |
|
188 | 189 |
#------------------------------------------------------------------------------- |
189 | 190 |
# An example using a HDF5-backed BSseq object |
190 | 191 |
# |
191 |
-library(HDF5Array) |
|
192 |
-hdf5_M <- realize(M, "HDF5Array") |
|
193 |
-hdf5_BStest <- BSseq(pos = 1:3, |
|
194 |
- chr = c("chr1", "chr2", "chr1"), |
|
195 |
- M = hdf5_M, |
|
196 |
- Cov = hdf5_M + 2) |
|
192 |
+hdf5_BStest <- realize(BStest, "HDF5Array") |
|
197 | 193 |
chrSelectBSseq(hdf5_BStest, seqnames = "chr1", order = TRUE) |
198 |
-collapseBSseq(hdf5_BStest, columns = c("A1" = "A", "A2" = "A", "A3" = "B")) |
|
194 |
+collapseBSseq( |
|
195 |
+ BSseq = hdf5_BStest, |
|
196 |
+ group = c("A", "A", "B"), |
|
197 |
+ BACKEND = "HDF5Array", |
|
198 |
+ type = "integer") |
|
199 | 199 |
} |
200 | 200 |
\keyword{classes} |
... | ... |
@@ -13,12 +13,12 @@ |
13 | 13 |
rmZeroCov = FALSE, |
14 | 14 |
strandCollapse = TRUE, |
15 | 15 |
BPPARAM = bpparam(), |
16 |
- BACKEND = getRealizationBackend() |
|
17 |
- dir = tempdir(), |
|
16 |
+ BACKEND = getRealizationBackend(), |
|
17 |
+ dir = tempfile("BSseq"), |
|
18 | 18 |
replace = FALSE, |
19 | 19 |
chunkdim = NULL, |
20 | 20 |
level = NULL, |
21 |
- verbose = TRUE) |
|
21 |
+ verbose = getOption("verbose")) |
|
22 | 22 |
} |
23 | 23 |
\arguments{ |
24 | 24 |
\item{files}{The path to the files created by running Bismark's methylation |
... | ... |
@@ -53,7 +53,8 @@ |
53 | 53 |
(Unix, Mac, and Windows, limited to single-machine clusters), and |
54 | 54 |
\linkS4class{BatchJobsParam} (Unix, Mac, Windows, only with the in-memory |
55 | 55 |
realization backend). |
56 |
- See sections 'Parallelization and progress monitoring' and 'Realization backends' for further details.} |
|
56 |
+ See sections 'Parallelization and progress monitoring' and 'Realization |
|
57 |
+ backends' for further details.} |
|
57 | 58 |
\item{BACKEND}{\code{NULL} or a single string specifying the name of the |
58 | 59 |
realization backend. |
59 | 60 |
When the backend is set to \code{NULL}, the \code{M} and \code{Cov} assays |
... | ... |
@@ -80,13 +81,36 @@ |
80 | 81 |
should be printed (default \code{TRUE}).} |
81 | 82 |
} |
82 | 83 |
|
84 |
+\section{File formats}{ |
|
85 |
+ The format of each file is automatically detected using the internal function \code{bsseq:::.guessBismarkFileType()}. |
|
86 |
+ Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically decompressed to \code{\link[base]{tempdir}()}. |
|
87 |
+ \subsection{Supported file formats}{ |
|
88 |
+ Bismark's 'genome wide cytosine report' (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-genome-wide-cytosine-report-optional-is-tab-delimited-in-the-following-format-1-based-coords}) and 'coverage' (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-coverage-output-looks-like-this-tab-delimited-1-based-genomic-coords}) formats are both supported. |
|
89 |
+ If setting \code{loci = NULL}, then we strongly recommend using the 'genome wide cytosine report' output format because this includes strand information for each locus. |
|
90 |
+ The 'coverage' output does not contain strand information and so the \code{\link[BiocGenerics]{strand}} of the returned \linkS4class{BSseq} object will be set to \code{*} unless stranded \code{loci} are supplied. |
|
91 |
+ } |
|
92 |
+ |
|
93 |
+ \subsection{Unsupported file formats}{ |
|
94 |
+ Neither the 'bedGraph' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-bedgraph-output-optional-looks-like-this-tab-delimited-0-based-start-1-based-end-coords}) nor the 'bismark_methylation_extractor' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-bismark_methylation_extractor-output-is-in-the-form-tab-delimited-1-based-coords}) are supported. |
|
95 |
+ The former does not include the required counts of methylated and unmethylated reads hile the is an intermediate file containing read-level, rather than locus-level, data on methylation. |
|
96 |
+ } |
|
97 |
+ \subsection{One-based vs. zero-based genomic co-ordinates}{ |
|
98 |
+ The genomic co-ordinates of the Bismark output files may be zero-based or one-based depending on whether the \code{--zero_based} argument was used when running Bismark's methylation extractor. |
|
99 |
+ Furthermore, the default co-ordinate counting system varies by version of Bismark. |
|
100 |
+ \pkg{bsseq} makes no assumptions about the basis of the genomic co-ordinates and it is left to the user to ensure that the appropriate basis is used in the analysis of their data. |
|
101 |
+ |
|
102 |
+ Since Bioconductor packages typically use one-based co-ordinates, we strongly recommend that your Bismark files are also one-based. |
|
103 |
+ } |
|
104 |
+} |
|
105 |
+ |
|
83 | 106 |
\section{Efficient use of \code{read.bismark()}}{ |
84 | 107 |
We recommend the following to achieve fast and efficient importing of Bismark files: |
85 | 108 |
|
86 | 109 |
\itemize{ |
87 | 110 |
\item Specify the set of methylation loci via the \code{loci} argument. |
111 |
+ \item Use Bismark files in the 'coverage' output format. |
|
88 | 112 |
\item Leave \code{rmZeroCov = FALSE}. |
89 |
- \item Use a \code{BPPARAM} with a large number of workers (cores). |
|
113 |
+ \item Use a \code{BPPARAM} with a moderate number of workers (cores). |
|
90 | 114 |
\item Use \code{BACKEND = "HDF5Array"}. |
91 | 115 |
} |
92 | 116 |
|
... | ... |
@@ -98,24 +122,28 @@ |
98 | 122 |
|
99 | 123 |
\strong{If you are unsure whether the below-described shortcuts apply to your data, leave \code{loci = NULL} and let \code{read.bismark()} identify the set of candidate loci from \code{files}.} |
100 | 124 |
|
101 |
- If all \code{files} are 'genome wide cytosine reports' for samples aligned to the same reference genome, then all \code{files} contain the exact same set of methylation loci. |
|
125 |
+ You may wish to use the \code{\link{findLoci}()} function to find all methylation loci of interest in your reference genome (e.g., all CpGs) and then pass the result via the \code{loci} argument. |
|
126 |
+ |
|
127 |
+ Alternatively, if all \code{files} are 'genome wide cytosine reports' for samples aligned to the same reference genome, then all \code{files} contain the exact same set of methylation loci. |
|
102 | 128 |
In this case, you may wish to first construct \code{loci} using the internal function \code{bsseq:::.readBismarkAsFWGRanges()} applied to a single file, e.g., \code{loci = bsseq:::.readBismarkAsFWGRanges(files[1], rmZeroCov, strandCollapse)}. |
129 |
+ } |
|
103 | 130 |
|
104 |
- % TODO: Uncomment once findCytosines is exported and documented. |
|
105 |
- % Alternatively, you may wish to use the \code{\link{findCytosines}()} function to construct all methylation loci of interest (e.g., all CpGs) in your reference genome and then pass this via the \code{loci} argument. |
|
131 |
+ \subsection{Using the 'coverage' Bismark files}{ |
|
132 |
+ It will generally be faster to parse Bismark files in the 'coverage' output format than those in the 'genome wide cytosine report' format This is because the former only includes loci with non-zero coverage and so the file size is often considerably smaller, particularly for shallowly sequenced samples (e.g., those from single-cell bisulfite sequencing). |
|
106 | 133 |
} |
107 | 134 |
|
108 | 135 |
\subsection{Leaving \code{rmZeroCov = FALSE}}{ |
109 | 136 |
If you set \code{rmZeroCov = TRUE}, then \code{read.bismark()} must first parse all the \code{files} to identify which loci have zero coverage in all samples and then filter these out from the set of candidate loci. |
110 |
- \strong{This will happen regardless of whether \code{loci} is \code{NULL} or is supplied as a \linkS4class{GenomicRanges} of candidate loci.} |
|
137 |
+ \strong{This will happen even if you supply \code{loci} with a \linkS4class{GenomicRanges} of candidate loci.} |
|
111 | 138 |
|
112 | 139 |
Furthermore, any coverage-based filtering of methylation loci is best left until you have constructed your final \linkS4class{BSseq} object. |
113 | 140 |
In our experience, the final \linkS4class{BSseq} object is often the product of combining multiple \linkS4class{BSseq} objects, each constructed with a separate call to \code{read.bismark()}. |
114 |
- In such cases, it is premature to use \code{rmZeroCov = TRUE} when running each \code{read.bismark()} and combining these objects will regretably often lead to an inefficiently constructed final \linkS4class{BSseq} object. |
|
141 |
+ In such cases, it is premature to use \code{rmZeroCov = TRUE} when running each \code{read.bismark()}; regretably, combining these objects will often then lead to an inefficiently stored \linkS4class{BSseq} object. |
|
115 | 142 |
} |
116 | 143 |
|
117 |
- \subsection{Using a \code{BPPARAM} with a large number of workers (cores)}{ |
|
118 |
- Each file can be processed on its own, so you can process in parallel as many files as you have workers. We have good experience using as many as 40 cores via \code{BPPARAM = MulticoreParam(workers = 40)}.} |
|
144 |
+ \subsection{Using a \code{BPPARAM} with a moderate number of workers (cores)}{ |
|
145 |
+ Each file can be processed on its own, so you can process in parallel as many files as you have workers. However, if using the HDF5Array backend, then writing to the HDF5 file cannot be performed in parallel and so becomes the bottleneck. Nonetheless, by using a moderate number of workers (2 - 10), we can ensure there is processed data available to write to disk as soon as the current write is completed. |
|
146 |
+ } |
|
119 | 147 |
|
120 | 148 |
\subsection{Using \code{BACKEND = "HDF5Array"}}{ |
121 | 149 |
By using the HDF5Array realization backend from \pkg{HDF5Array}, we reduce the amount of data that is kept in-memory at any one time. |
... | ... |
@@ -126,28 +154,6 @@ |
126 | 154 |
} |
127 | 155 |
} |
128 | 156 |
|
129 |
-\section{File formats}{ |
|
130 |
- The format of each file is automatically detected using the internal function \code{bsseq:::.guessBismarkFileType()}. |
|
131 |
- Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. |
|
132 |
- \subsection{Supported file formats}{ |
|
133 |
- We strongly recommend that you use the 'genome wide cytosine report' output format |
|
134 |
- (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-genome-wide-cytosine-report-optional-is-tab-delimited-in-the-following-format-1-based-coords}) as this includes trand information for each locus. |
|
135 |
- The 'coverage' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-coverage-output-looks-like-this-tab-delimited-1-based-genomic-coords}) is also supported, although all loci from these files will have their \code{\link[BiocGenerics]{strand}} set to \code{*} in the returned \linkS4class{BSseq} object because this file format does not include strand information. |
|
136 |
- } |
|
137 |
- |
|
138 |
- \subsection{Unsupported file formats}{ |
|
139 |
- Neither the 'bedGraph' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-bedgraph-output-optional-looks-like-this-tab-delimited-0-based-start-1-based-end-coords}) nor the 'bismark_methylation_extractor' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-bismark_methylation_extractor-output-is-in-the-form-tab-delimited-1-based-coords}) are supported. |
|
140 |
- The former does not include the required counts of methylated and unmethylated reads hile the is an intermediate file containing read-level, rather than locus-level, data on methylation. |
|
141 |
- } |
|
142 |
- \subsection{One-based vs. zero-based genomic co-ordinates}{ |
|
143 |
- The genomic co-ordinates of the Bismark output files may be zero-based or one-based depending on whether the \code{--zero_based} argument was used when running Bismark's methylation extractor. |
|
144 |
- Furthermore, the default co-ordinate counting system varies by version of Bismark. |
|
145 |
- \pkg{bsseq} makes no assumptions about the basis of the genomic co-ordinates and it is left to the user to ensure that the appropriate basis is used in the analysis of their data. |
|
146 |
- |
|
147 |
- Since Bioconductor packages typically use one-based co-ordinates, we strongly recommend that your Bismark files are also one-based. |
|
148 |
- } |
|
149 |
-} |
|
150 |
- |
|
151 | 157 |
\section{Realization backends}{ |
152 | 158 |
The \code{read.bismark()} function creates a \linkS4class{BSseq} object with two assays, \code{M} and \code{Cov}. |
153 | 159 |
The choice of \emph{realization backend} controls whether these assays are stored in-memory as an ordinary \link[base]{matrix} or on-disk as a \linkS4class{HDF5Array}, for example. |
... | ... |
@@ -211,10 +217,12 @@ |
211 | 217 |
} |
212 | 218 |
|
213 | 219 |
\seealso{ |
214 |
- \code{\link{read.bsmooth}()} for parsing output from the BSmooth |
|
220 |
+ \itemize{ |
|
221 |
+ \item \code{\link{read.bsmooth}()} for parsing output from the BSmooth |
|
215 | 222 |
alignment suite. |
216 |
- \code{\link{read.umtab}()} for parsing legacy (old) formats from the BSmooth alignment suite. |
|
217 |
- \code{\link{collapseBSseq}()} for collapsing (aggregating) data from sample's with multiple Bismark methylation extractor files (e.g., technical replicates). |
|
223 |
+ \item \code{\link{read.umtab}()} for parsing legacy (old) formats from the BSmooth alignment suite. |
|
224 |
+ \item \code{\link{collapseBSseq}()} for collapsing (aggregating) data from sample's with multiple Bismark methylation extractor files (e.g., technical replicates). |
|
225 |
+ } |
|
218 | 226 |
} |
219 | 227 |
|
220 | 228 |
\examples{ |
... | ... |
@@ -252,3 +260,5 @@ |
252 | 260 |
\author{ |
253 | 261 |
Peter Hickey \email{peter.hickey@gmail.com} |
254 | 262 |
} |
263 |
+ |
|
264 |
+% TODO: Mention and link to findLoci(). |