Browse code

Update docs

Peter Hickey authored on 17/09/2018 11:03:43
Showing 3 changed files

... ...
@@ -11,13 +11,10 @@ BSmooth(BSseq,
11 11
         h = 1000,
12 12
         maxGap = 10^8,
13 13
         keep.se = FALSE,
14
-        verbose = TRUE,
15 14
         BPPARAM = bpparam(),
16
-        BACKEND = getRealizationBackend(),
17
-        ...,
18
-        parallelBy = c("sample", "chromosome"),
19
-        mc.preschedule = FALSE,
20
-        mc.cores = 1)
15
+        chunkdim = NULL,
16
+        level = NULL,
17
+        verbose = getOption("verbose"))
21 18
 }
22 19
 \arguments{
23 20
   \item{BSseq}{An object of class \code{BSseq}.}
... ...
@@ -25,17 +22,9 @@ BSmooth(BSseq,
25 22
   \item{h}{The minimum smoothing window, in bases.}
26 23
   \item{maxGap}{The maximum gap between two methylation loci, before the smoothing is broken across the gap.
27 24
   The default smoothes each chromosome separately.}
28
-  \item{parallelBy}{\strong{Deprecated}.
29
-    See section 'Parallelization and progress monitoring' for further details.}
30
-  \item{mc.preschedule}{\strong{Deprecated}.
31
-    See section 'Parallelization and progress monitoring' for further details.}
32
-  \item{mc.cores}{\strong{Deprecated}.
33
-    See section 'Parallelization and progress monitoring' for further details.}
34 25
   \item{keep.se}{Should the estimated standard errors from the smoothing
35 26
     algorithm be kept. This will make the return object roughly 30
36 27
     percent bigger and is currently not be used for anything in \pkg{bsseq}.}
37
-  \item{verbose}{\strong{Deprecated}.
38
-    See section, 'Parallelization and progress monitoring' for further details.}
39 28
   \item{BPPARAM}{An optional \linkS4class{BiocParallelParam} instance
40 29
     determining the parallel back-end to be used during evaluation. Currently
41 30
     supported are \linkS4class{SerialParam} (Unix, Mac, Windows),
... ...
@@ -44,13 +33,18 @@ BSmooth(BSseq,
44 33
     \linkS4class{BatchJobsParam} (Unix, Mac, Windows, only with the in-memory
45 34
     realization backend). See sections 'Parallelization and progress
46 35
     monitoring' and 'Realization backends' for further details.}
47
-  \item{BACKEND}{\code{NULL} or a single string specifying the name of the
48
-    realization backend. When the backend is set to \code{NULL}, the
49
-    \code{coef} and \code{se.coef} assays are realized in memory as ordinary
50
-    matrices, otherwise these are realized with the given \code{BACKEND}.}
51
-  \item{...}{Additional arguments passed to the \linkS4class{RealizationSink}
52
-    constructor.
53
-    See section 'Realization backends' for further details.}
36
+  \item{chunkdim}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.}
37
+    The dimensions of the chunks to use for writing the data to
38
+    disk. By default, \code{\link[HDF5Array]{getHDF5DumpChunkDim}()} using the
39
+    dimensions of the returned \linkS4class{BSseq} object will be used. See
40
+    \code{?\link[HDF5Array]{getHDF5DumpChunkDim}} for more information.}
41
+  \item{level}{\strong{Only applicable if \code{BACKEND == "HDF5Array"}.}
42
+    The compression level to use for writing the data to disk. By
43
+    default, \code{\link[HDF5Array]{getHDF5DumpCompressionLevel}()} will be
44
+    used. See \code{?\link[HDF5Array]{getHDF5DumpCompressionLevel}} for more
45
+    information.}
46
+  \item{verbose}{A \code{logical(1)} indicating whether progress messages
47
+    should be printed (default \code{TRUE}).}
54 48
 }
55 49
 \details{
56 50
   \code{ns} and \code{h} are passed to the \code{locfit} function. The
... ...
@@ -141,14 +141,15 @@ slots for representing smoothed data. This class is an extension of
141 141
     \item{\code{strandCollapse(BSseq, shift = TRUE)}}{
142 142
 
143 143
       This function operates on a \code{BSseq} objects which has
144
-      stranded loci (ie. loci where the strand is one of \sQuote{+} pr
145
-      \sQuote{-}).  It will collapse the methylation and coverage
146
-      information across the two strands, into one position.  The
147
-      argument \code{shift} indicates whether the positions for the loci
148
-      on the reverse strand should be shifted one (ie. the positions for
149
-      these loci are the positions of the \sQuote{G} in the
150
-      \sQuote{CpG}; this is the case for Bismark output for example.}
144
+      stranded loci (i.e. loci where the strand is one of \sQuote{+} or
145
+      \sQuote{-}). It will collapse the methylation and coverage
146
+      information across the two strands, unstranding the loci in the process
147
+      and potentially re-ordering them.
151 148
 
149
+      The argument \code{shift} indicates whether the positions for the loci
150
+      on the reverse strand should be shifted one (i.e. the positions for
151
+      these loci are the positions of the \sQuote{G} in the
152
+      \sQuote{CpG}; this is the case for Bismark output for example).}
152 153
   }
153 154
 }
154 155
 \section{Coercion}{
... ...
@@ -183,18 +184,17 @@ M <- matrix(1:9, 3,3)
183 184
 colnames(M) <- c("A1", "A2", "A3")
184 185
 BStest <- BSseq(pos = 1:3, chr = c("chr1", "chr2", "chr1"), M = M, Cov = M + 2)
185 186
 chrSelectBSseq(BStest, seqnames = "chr1", order = TRUE)
186
-collapseBSseq(BStest, columns = c("A1" = "A", "A2" = "A", "A3" = "B"))
187
+collapseBSseq(BStest, group = c("A", "A", "B"))
187 188
 
188 189
 #-------------------------------------------------------------------------------
189 190
 # An example using a HDF5-backed BSseq object
190 191
 #
191
-library(HDF5Array)
192
-hdf5_M <- realize(M, "HDF5Array")
193
-hdf5_BStest <- BSseq(pos = 1:3,
194
-                     chr = c("chr1", "chr2", "chr1"),
195
-                     M = hdf5_M,
196
-                     Cov = hdf5_M + 2)
192
+hdf5_BStest <- realize(BStest, "HDF5Array")
197 193
 chrSelectBSseq(hdf5_BStest, seqnames = "chr1", order = TRUE)
198
-collapseBSseq(hdf5_BStest, columns = c("A1" = "A", "A2" = "A", "A3" = "B"))
194
+collapseBSseq(
195
+    BSseq = hdf5_BStest,
196
+    group = c("A", "A", "B"),
197
+    BACKEND = "HDF5Array",
198
+    type = "integer")
199 199
 }
200 200
 \keyword{classes}
... ...
@@ -13,12 +13,12 @@
13 13
              rmZeroCov = FALSE,
14 14
              strandCollapse = TRUE,
15 15
              BPPARAM = bpparam(),
16
-             BACKEND = getRealizationBackend()
17
-             dir = tempdir(),
16
+             BACKEND = getRealizationBackend(),
17
+             dir = tempfile("BSseq"),
18 18
              replace = FALSE,
19 19
              chunkdim = NULL,
20 20
              level = NULL,
21
-             verbose = TRUE)
21
+             verbose = getOption("verbose"))
22 22
 }
23 23
 \arguments{
24 24
   \item{files}{The path to the files created by running Bismark's methylation
... ...
@@ -53,7 +53,8 @@
53 53
     (Unix, Mac, and Windows, limited to single-machine clusters), and
54 54
     \linkS4class{BatchJobsParam} (Unix, Mac, Windows, only with the in-memory
55 55
     realization backend).
56
-    See sections 'Parallelization and progress  monitoring' and 'Realization backends' for further details.}
56
+    See sections 'Parallelization and progress  monitoring' and 'Realization
57
+    backends' for further details.}
57 58
   \item{BACKEND}{\code{NULL} or a single string specifying the name of the
58 59
     realization backend.
59 60
     When the backend is set to \code{NULL}, the \code{M} and \code{Cov} assays
... ...
@@ -80,13 +81,36 @@
80 81
     should be printed (default \code{TRUE}).}
81 82
 }
82 83
 
84
+\section{File formats}{
85
+  The format of each file is automatically detected using the internal function \code{bsseq:::.guessBismarkFileType()}.
86
+  Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically decompressed to \code{\link[base]{tempdir}()}.
87
+  \subsection{Supported file formats}{
88
+    Bismark's 'genome wide cytosine report' (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-genome-wide-cytosine-report-optional-is-tab-delimited-in-the-following-format-1-based-coords}) and 'coverage' (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-coverage-output-looks-like-this-tab-delimited-1-based-genomic-coords}) formats are both supported.
89
+    If setting \code{loci = NULL}, then we strongly recommend using the 'genome wide cytosine report' output format because this includes strand information for each locus.
90
+     The 'coverage' output does not contain strand information and so the \code{\link[BiocGenerics]{strand}} of the returned \linkS4class{BSseq} object will be set to \code{*} unless stranded \code{loci} are supplied.
91
+  }
92
+
93
+  \subsection{Unsupported file formats}{
94
+     Neither the 'bedGraph' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-bedgraph-output-optional-looks-like-this-tab-delimited-0-based-start-1-based-end-coords}) nor the 'bismark_methylation_extractor' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-bismark_methylation_extractor-output-is-in-the-form-tab-delimited-1-based-coords}) are supported.
95
+     The former does not include the required counts of methylated and unmethylated reads hile the is an intermediate file containing read-level, rather than locus-level, data on methylation.
96
+  }
97
+  \subsection{One-based vs. zero-based genomic co-ordinates}{
98
+    The genomic co-ordinates of the Bismark output files may be zero-based or one-based depending on whether the \code{--zero_based} argument was used when running Bismark's methylation extractor.
99
+    Furthermore, the default co-ordinate counting system varies by version of Bismark.
100
+    \pkg{bsseq} makes no assumptions about the basis of the genomic co-ordinates and it is left to the user to ensure that the appropriate basis is used in the analysis of their data.
101
+
102
+    Since Bioconductor packages typically use one-based co-ordinates, we strongly recommend that your Bismark files are also one-based.
103
+  }
104
+}
105
+
83 106
 \section{Efficient use of \code{read.bismark()}}{
84 107
   We recommend the following to achieve fast and efficient importing of Bismark files:
85 108
 
86 109
   \itemize{
87 110
     \item Specify the set of methylation loci via the \code{loci} argument.
111
+    \item Use Bismark files in the 'coverage' output format.
88 112
     \item Leave \code{rmZeroCov = FALSE}.
89
-    \item Use a \code{BPPARAM} with a large number of workers (cores).
113
+    \item Use a \code{BPPARAM} with a moderate number of workers (cores).
90 114
     \item Use \code{BACKEND = "HDF5Array"}.
91 115
   }
92 116
 
... ...
@@ -98,24 +122,28 @@
98 122
 
99 123
     \strong{If you are unsure whether the below-described shortcuts apply to your data, leave \code{loci = NULL} and let \code{read.bismark()} identify the set of candidate loci from \code{files}.}
100 124
 
101
-    If all \code{files} are 'genome wide cytosine reports' for samples aligned to the same reference genome, then all \code{files} contain the exact same set of methylation loci.
125
+    You may wish to use the \code{\link{findLoci}()} function to find all methylation loci of interest in your reference genome (e.g., all CpGs) and then pass the result via the \code{loci} argument.
126
+
127
+    Alternatively, if all \code{files} are 'genome wide cytosine reports' for samples aligned to the same reference genome, then all \code{files} contain the exact same set of methylation loci.
102 128
     In this case, you may wish to first construct \code{loci} using the internal function \code{bsseq:::.readBismarkAsFWGRanges()} applied to a single file, e.g., \code{loci = bsseq:::.readBismarkAsFWGRanges(files[1], rmZeroCov, strandCollapse)}.
129
+  }
103 130
 
104
-    % TODO: Uncomment once findCytosines is exported and documented.
105
-    % Alternatively, you may wish to use the \code{\link{findCytosines}()} function to construct all methylation loci of interest (e.g., all CpGs) in your reference genome and then pass this via the \code{loci} argument.
131
+  \subsection{Using the 'coverage' Bismark files}{
132
+    It will generally be faster to parse Bismark files in the 'coverage' output format than those in the 'genome wide cytosine report' format This is because the former only includes loci with non-zero coverage and so the file size is often considerably smaller, particularly for shallowly sequenced samples (e.g., those from single-cell bisulfite sequencing).
106 133
   }
107 134
 
108 135
   \subsection{Leaving \code{rmZeroCov = FALSE}}{
109 136
     If you set \code{rmZeroCov = TRUE}, then \code{read.bismark()} must first parse all the \code{files} to identify which loci have zero coverage in all samples and then filter these out from the set of candidate loci.
110
-    \strong{This will happen regardless of whether \code{loci} is \code{NULL} or is supplied as a \linkS4class{GenomicRanges} of candidate loci.}
137
+    \strong{This will happen even if you supply \code{loci} with a \linkS4class{GenomicRanges} of candidate loci.}
111 138
 
112 139
     Furthermore, any coverage-based filtering of methylation loci is best left until you have constructed your final \linkS4class{BSseq} object.
113 140
     In our experience, the final \linkS4class{BSseq} object is often the product of combining multiple \linkS4class{BSseq} objects, each constructed with a separate call to \code{read.bismark()}.
114
-    In such cases, it is premature to use \code{rmZeroCov = TRUE} when running each \code{read.bismark()} and combining these objects will regretably often lead to an inefficiently constructed final \linkS4class{BSseq} object.
141
+    In such cases, it is premature to use \code{rmZeroCov = TRUE} when running each \code{read.bismark()}; regretably, combining these objects will often then lead to an inefficiently stored \linkS4class{BSseq} object.
115 142
   }
116 143
 
117
-  \subsection{Using a \code{BPPARAM} with a large number of workers (cores)}{
118
-    Each file can be processed on its own, so you can process in parallel as many files as you have workers. We have good experience using as many as 40 cores via \code{BPPARAM = MulticoreParam(workers = 40)}.}
144
+  \subsection{Using a \code{BPPARAM} with a moderate number of workers (cores)}{
145
+    Each file can be processed on its own, so you can process in parallel as many files as you have workers. However, if using the HDF5Array backend, then writing to the HDF5 file cannot be performed in parallel and so becomes the bottleneck. Nonetheless, by using a moderate number of workers (2 - 10), we can ensure there is processed data available to write to disk as soon as the current write is completed.
146
+  }
119 147
 
120 148
   \subsection{Using \code{BACKEND = "HDF5Array"}}{
121 149
     By using the HDF5Array realization backend from \pkg{HDF5Array}, we reduce the amount of data that is kept in-memory at any one time.
... ...
@@ -126,28 +154,6 @@
126 154
     }
127 155
 }
128 156
 
129
-\section{File formats}{
130
-  The format of each file is automatically detected using the internal function \code{bsseq:::.guessBismarkFileType()}.
131
-  Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed.
132
-  \subsection{Supported file formats}{
133
-     We strongly recommend that you use the 'genome wide cytosine report' output format
134
-     (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-genome-wide-cytosine-report-optional-is-tab-delimited-in-the-following-format-1-based-coords}) as this includes trand information for each locus.
135
-     The 'coverage' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-coverage-output-looks-like-this-tab-delimited-1-based-genomic-coords}) is also supported, although all loci from these files will have their \code{\link[BiocGenerics]{strand}} set to \code{*} in the returned \linkS4class{BSseq} object because this file format does not include strand information.
136
-  }
137
-
138
-  \subsection{Unsupported file formats}{
139
-     Neither the 'bedGraph' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-bedgraph-output-optional-looks-like-this-tab-delimited-0-based-start-1-based-end-coords}) nor the 'bismark_methylation_extractor' output format (\url{https://github.com/FelixKrueger/Bismark/tree/master/Docs#the-bismark_methylation_extractor-output-is-in-the-form-tab-delimited-1-based-coords}) are supported.
140
-     The former does not include the required counts of methylated and unmethylated reads hile the is an intermediate file containing read-level, rather than locus-level, data on methylation.
141
-  }
142
-  \subsection{One-based vs. zero-based genomic co-ordinates}{
143
-    The genomic co-ordinates of the Bismark output files may be zero-based or one-based depending on whether the \code{--zero_based} argument was used when running Bismark's methylation extractor.
144
-    Furthermore, the default co-ordinate counting system varies by version of Bismark.
145
-    \pkg{bsseq} makes no assumptions about the basis of the genomic co-ordinates and it is left to the user to ensure that the appropriate basis is used in the analysis of their data.
146
-
147
-    Since Bioconductor packages typically use one-based co-ordinates, we strongly recommend that your Bismark files are also one-based.
148
-  }
149
-}
150
-
151 157
 \section{Realization backends}{
152 158
   The \code{read.bismark()} function creates a \linkS4class{BSseq} object with two assays, \code{M} and \code{Cov}.
153 159
   The choice of \emph{realization backend} controls whether these assays are stored in-memory as an ordinary \link[base]{matrix} or on-disk as a \linkS4class{HDF5Array}, for example.
... ...
@@ -211,10 +217,12 @@
211 217
 }
212 218
 
213 219
 \seealso{
214
-  \code{\link{read.bsmooth}()} for parsing output from the BSmooth
220
+  \itemize{
221
+    \item \code{\link{read.bsmooth}()} for parsing output from the BSmooth
215 222
   alignment suite.
216
-  \code{\link{read.umtab}()} for parsing legacy (old) formats from the BSmooth alignment suite.
217
-  \code{\link{collapseBSseq}()} for collapsing (aggregating) data from sample's with multiple Bismark methylation extractor files (e.g., technical replicates).
223
+    \item \code{\link{read.umtab}()} for parsing legacy (old) formats from the BSmooth alignment suite.
224
+    \item \code{\link{collapseBSseq}()} for collapsing (aggregating) data from sample's with multiple Bismark methylation extractor files (e.g., technical replicates).
225
+  }
218 226
 }
219 227
 
220 228
 \examples{
... ...
@@ -252,3 +260,5 @@
252 260
 \author{
253 261
   Peter Hickey \email{peter.hickey@gmail.com}
254 262
 }
263
+
264
+% TODO: Mention and link to findLoci().