Browse code

improvements to read.bismark and combineList

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/bsseq@73977 bc3139a8-67e5-0310-9ffc-ced21a209358

khansen authored on 06/03/2013 19:03:51
Showing 8 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: bsseq
2
-Version: 0.7.4
2
+Version: 0.7.5
3 3
 Title: Analyze, manage and store bisulfite sequencing data
4 4
 Description: Tools for analyzing and visualizing bisulfite sequencing data
5 5
 Author: Kasper Daniel Hansen
... ...
@@ -251,15 +251,32 @@ combineList <- function(x, ...) {
251 251
     stopifnot(all(sapply(x, class) == "BSseq"))
252 252
     gr <- getBSseq(x[[1]], "gr")
253 253
     trans <- getBSseq(x[[1]], "trans")
254
-    ok <- sapply(x[-1], function(xx) {
255
-        identical(gr, getBSseq(xx, "gr")) &&
256
-            identical(trans, getBSseq(xx, "trans"))
254
+    sameTrans <- sapply(x[-1], function(xx) {
255
+        identical(trans, getBSseq(xx, "trans"))
257 256
     })
258
-    if(!all(ok))
259
-        stop("all elements of '...' in combineList needs to have the same granges and trans")
260
-    M <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "M")))
261
-    Cov <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "Cov")))
262
-    if(any(!sapply(x, hasBeenSmoothed))) {
257
+    if(!all(sameTrans))
258
+        stop("all elements of '...' in combineList needs to have the same trans")
259
+    sameGr <- sapply(x[-1], function(xx) {
260
+        identical(trans, getBSseq(xx, "gr"))
261
+    })
262
+    if(all(sameGr)) {
263
+        M <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "M")))
264
+        Cov <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "Cov")))
265
+    } else {
266
+        gr <- sort(reduce(do.call(c, unname(lapply(x, granges))), min.gapwidth = 0L))
267
+        sampleNames <- do.call(c, lapply(x, sampleNames))
268
+        M <- matrix(0, ncol = length(sampleNames), nrow = length(gr))
269
+        colnames(M) <- sampleNames
270
+        Cov <- M
271
+        idxes <- split(seq(along = sampleNames), rep(seq(along = x), times = sapply(x, ncol)))
272
+        for(ii in seq(along = idxes)) {
273
+            ov <- findOverlaps(gr, granges(x[[ii]]))
274
+            idx <- idxes[[ii]]
275
+            M[queryHits(ov), idx] <- getBSseq(x[[ii]], "M")[subjectHits(ov),]
276
+            Cov[queryHits(ov), idx] <- getBSseq(x[[ii]], "Cov")[subjectHits(ov),]
277
+        }
278
+    }
279
+    if(any(!sapply(x, hasBeenSmoothed)) || !(all(sameGr))) {
263 280
         coef <- NULL
264 281
         se.coef <- NULL
265 282
         trans <- NULL
... ...
@@ -56,7 +56,7 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg
56 56
     lines(xx, yy, col = col, lty = lty, lwd = lwd)
57 57
 }
58 58
 
59
-.bsPlotPoints <- function(x, y, z, col) {
59
+.bsPlotPoints <- function(x, y, z, col, pointsMinCov) {
60 60
     points(x[z>pointsMinCov], y[z>pointsMinCov], col = col, pch = 16, cex = 0.5)
61 61
 }
62 62
 
... ...
@@ -147,7 +147,7 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg
147 147
 
148 148
 .plotSmoothData <- function(BSseq, region, extend, addRegions, col, lty, lwd, regionCol,
149 149
                             addTicks, addPoints, pointsMinCov, highlightMain) {
150
-    gr <- .bsGetGr(Bsseq, region, extend)
150
+    gr <- .bsGetGr(BSseq, region, extend)
151 151
     BSseq <- subsetByOverlaps(BSseq, gr)
152 152
     
153 153
     ## Extract basic information
... ...
@@ -186,7 +186,7 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg
186 186
     if(addPoints) {
187 187
         sapply(sampleNames(BSseq), function(samp) {
188 188
             .bsPlotPoints(positions, rawPs[, samp], coverage[, samp],
189
-                          col = colEtc$col[samp])
189
+                          col = colEtc$col[samp], pointsMinCov = pointsMinCov)
190 190
         })
191 191
     }
192 192
 }
... ...
@@ -1,69 +1,64 @@
1
-read.bismark <- function(files, sampleNames, returnRaw = FALSE, rmZeroCov = FALSE, verbose = TRUE){
2
-  ## Argument checking
3
-  if (anyDuplicated(files)){
4
-    stop("duplicate entries in 'files'")
5
-  }
6
-  if (length(sampleNames) != length(files) | anyDuplicated(sampleNames)){
7
-    stop("argument 'sampleNames' has to have the same length as argument 'files', without duplicate entries")
8
-  }
9
-  ## Process each file
10
-  idxes <- seq_along(files)
11
-  names(idxes) <- sampleNames
12
-  allOut <- lapply(idxes, function(ii){
13
-    if (verbose) {
14
-      cat(sprintf("Reading file '%s' ... ", files[ii]))
1
+read.bismark <- function(files, sampleNames, rmZeroCov = FALSE, verbose = TRUE){
2
+    ## Argument checking
3
+    if (anyDuplicated(files)){
4
+        stop("duplicate entries in 'files'")
15 5
     }
16
-    stime <- system.time({
17
-      if (returnRaw) {
18
-        out <- read.bismarkFileRaw(thisfile = files[ii])
19
-      } else{
20
-        raw <- read.bismarkFileRaw(thisfile = files[ii])
21
-        M <- matrix(elementMetadata(raw)[, "mCount"], ncol = 1)
22
-        Cov <- M + elementMetadata(raw)[, "uCount"]
23
-        elementMetadata(raw) <- NULL
24
-        out <- BSseq(gr = raw, M = M, Cov = Cov, sampleNames = sampleNames[ii], rmZeroCov = rmZeroCov)
25
-      }
26
-    })[3]
27
-    if (verbose) {
28
-      cat(sprintf("in %.1f secs\n", stime))  
6
+    if (length(sampleNames) != length(files) | anyDuplicated(sampleNames)){
7
+        stop("argument 'sampleNames' has to have the same length as argument 'files', without duplicate entries")
29 8
     }
30
-    out  
31
-  })
32
-  if (!returnRaw) {
9
+    ## Process each file
10
+    idxes <- seq_along(files)
11
+    names(idxes) <- sampleNames
12
+    allOut <- lapply(idxes, function(ii){
13
+        if (verbose) {
14
+            cat(sprintf("Reading file '%s' ... ", files[ii]))
15
+        }
16
+        stime <- system.time({
17
+            raw <- read.bismarkFileRaw(thisfile = files[ii])
18
+            M <- matrix(elementMetadata(raw)[, "mCount"], ncol = 1)
19
+            Cov <- M + elementMetadata(raw)[, "uCount"]
20
+            elementMetadata(raw) <- NULL
21
+            out <- BSseq(gr = raw, M = M, Cov = Cov,
22
+                         sampleNames = sampleNames[ii], rmZeroCov = rmZeroCov)
23
+        })[3]
24
+        if (verbose) {
25
+            cat(sprintf("in %.1f secs\n", stime))  
26
+        }
27
+        out  
28
+    })
33 29
     if (verbose) {
34
-      cat(sprintf("Joining samples ... "))
30
+        cat(sprintf("Joining samples ... "))
35 31
     }
36 32
     stime <- system.time({
37
-      allOut <- Reduce("combine", allOut)
33
+        allOut <- combineList(allOut)
38 34
     })[3]
39 35
     if (verbose) {
40
-      cat(sprintf("in %.1f secs\n", stime))
36
+        cat(sprintf("in %.1f secs\n", stime))
41 37
     }
42
-  }
43
-  allOut
38
+    allOut
44 39
 }
45 40
 
46 41
 read.bismarkFileRaw <- function(thisfile, verbose = TRUE){
47
-  ## Set up the 'what' argument for scan()
48
-  columnHeaders <- c("chr", "start", "end", "mPerc", "mCount", "uCount")
49
-  what0 <- replicate(length(columnHeaders), character(0))
50
-  names(what0) <- columnHeaders
51
-  int <- which(columnHeaders %in% c("start", "end", "mCount", "uCount"))
52
-  what0[int] <- replicate(length(int), integer(0))
53
-  dbl <- which(columnHeaders %in% "mPerc")
54
-  what0[dbl] <- replicate(length(dbl), double(0))
55
-  ## Read in the file
56
-  if (grepl("\\.gz$", thisfile)) 
57
-    con <- gzfile(thisfile)
58
-  else 
59
-    con <- file(thisfile, open = "r")
60
-  out <- scan(file = con, what = what0, sep = "\t", quote = "", na.strings = "NA", quiet = TRUE)
61
-  close(con)
62
-  ## Create GRanges instance from 'out'
63
-  gr <- GRanges(seqnames = out[["chr"]], ranges = IRanges(start = out[["start"]], width = 1))
64
-  out[["chr"]] <- out[["start"]] <- out[["end"]] <- NULL
65
-  out <- out[!sapply(out, is.null)]
66
-  df <- DataFrame(out)
67
-  elementMetadata(gr) <- df
68
-  gr
42
+    ## Set up the 'what' argument for scan()
43
+    columnHeaders <- c("chr", "start", "end", "mPerc", "mCount", "uCount")
44
+    what0 <- replicate(length(columnHeaders), character(0))
45
+    names(what0) <- columnHeaders
46
+    int <- which(columnHeaders %in% c("start", "end", "mCount", "uCount"))
47
+    what0[int] <- replicate(length(int), integer(0))
48
+    null <- which(columnHeaders %in% "mPerc")
49
+    what0[null] <- replicate(length(null), NULL)
50
+    ## Read in the file
51
+    if (grepl("\\.gz$", thisfile)) 
52
+        con <- gzfile(thisfile)
53
+    else 
54
+        con <- file(thisfile, open = "r")
55
+    out <- scan(file = con, what = what0, sep = "\t", quote = "", na.strings = "NA", quiet = TRUE)
56
+    close(con)
57
+    ## Create GRanges instance from 'out'
58
+    gr <- GRanges(seqnames = out[["chr"]], ranges = IRanges(start = out[["start"]], width = 1))
59
+    out[["chr"]] <- out[["start"]] <- out[["end"]] <- NULL
60
+    out <- out[!sapply(out, is.null)]
61
+    df <- DataFrame(out)
62
+    elementMetadata(gr) <- df
63
+    gr
69 64
 }
... ...
@@ -401,20 +401,17 @@ read.bsmoothDirRaw <- function(dir, seqnames = NULL, keepCycle = FALSE, keepFilt
401 401
 
402 402
 
403 403
 sampleRawToBSseq <- function(gr, qualityCutoff = 20, sampleName = NULL, rmZeroCov = FALSE) {
404
-    numberQualsGreaterThan.old <- function(char) {
405
-        quals <- as.integer(charToRaw(char)) - 33L
406
-        sum(quals >= qualityCutoff)
407
-    }
408 404
     numberQualsGreaterThan <- function(cvec) {
409 405
         onestring <- paste(cvec, collapse = "")
410 406
         greater <- (as.integer(charToRaw(onestring)) - 33L >= qualityCutoff)
411
-        tapply(greater, rep(1:length(cvec), times = nchar(cvec)), sum)
407
+        out <- tapply(greater, rep(1:length(cvec), times = nchar(cvec)), sum)
408
+        out
412 409
     }
413 410
     strToCov <- function(vec) {
414 411
         Cov <- rep(0, length(vec))
415 412
         wh <- which(! vec %in% c("", "0"))
416
-        ## Cov[wh] <- sapply(vec[wh], numberQualsGreaterThan.old)
417
-        Cov[wh] <- numberQualsGreaterThan(vec[wh])
413
+        if(length(wh) > 0)
414
+            Cov[wh] <- numberQualsGreaterThan(vec[wh])
418 415
         Cov
419 416
     }
420 417
     M <- matrix(strToCov(elementMetadata(gr)[, "Mstr"]), ncol = 1)
... ...
@@ -456,8 +453,7 @@ read.bsmooth <- function(dirs, sampleNames = NULL, seqnames = NULL, returnRaw =
456 453
     if(!returnRaw) {
457 454
         if(verbose) cat(sprintf("Joining samples ... "))
458 455
         stime <- system.time({
459
-            ## FIXME: we can do much better than this ... need to write combineList
460
-            allOut <- Reduce("combine", allOut)
456
+            allOut <- combineList(allOut)
461 457
         })[3]
462 458
         if(verbose) cat(sprintf("in %.1f secs\n", stime))
463 459
     }
... ...
@@ -4,8 +4,15 @@
4 4
 
5 5
 \section{Version 0.7.x}{
6 6
   \itemize{
7
-    \item Exposed combineList as a faster alternative to Reduce(combine,
8
-    list).
7
+    \item Removed the returnRaw argument to read.bismark() as it was
8
+    unnecessary (Bismark output files does not have additional
9
+    information beyond M and Cov and genomic positions, unlike BSmooth).
10
+    \item Moved the Bismark example data from data to inst/extdata.
11
+    \item combineList() now deals with the case where the list of BSseq
12
+    objects have different genomic locations.  This speeds up
13
+    read.bismark() substantially.
14
+    \item Exposed combineList() as a faster alternative to
15
+    Reduce(combine, list).
9 16
     \item Updated the code for the plotting routines (plotRegion).  This
10 17
     should not have an impact on user-visible code.
11 18
     \item Added read.bismark() function to parse output from the Bismark
12 19
similarity index 100%
13 20
rename from data/CpG_context_test_data.fastq_bismark.bedGraph
14 21
rename to inst/extdata/CpG_context_test_data.fastq_bismark.bedGraph
... ...
@@ -7,51 +7,66 @@
7 7
   Parsing output from the Bismark alignment suite.
8 8
 }
9 9
 \usage{
10
-  read.bismark(files, sampleNames, returnRaw = FALSE, rmZeroCov = FALSE, verbose = TRUE)
10
+  read.bismark(files, sampleNames, rmZeroCov = FALSE, verbose = TRUE)
11 11
 }
12 12
 \arguments{
13
-  \item{files}{Input files. Each sample is in a different file. Input files are created by running Bismark's \code{methylation_extractor}; see Note for details.} 
13
+  \item{files}{Input files. Each sample is in a different file. Input
14
+    files are created by running Bismark's \code{methylation_extractor};
15
+    see Note for details.}  
14 16
   \item{sampleNames}{sample names, based on the order of \code{files}.}
15
-  \item{returnRaw}{Should the function return the complete information in the output files?}
16
-  \item{rmZeroCov}{Should methylation loci that have zero coverage in all samples be removed. This will result in a much smaller object if the data originates from (targeted) capture bisulfite sequencing.} 
17
+  \item{rmZeroCov}{Should methylation loci that have zero coverage in
18
+    all samples be removed. This will result in a much smaller object if
19
+    the data originates from (targeted) capture bisulfite sequencing.}  
17 20
   \item{verbose}{Make the function verbose.}
18 21
 }
19 22
 \note{
20 23
   Input files can either be gzipped or not.
21 24
   
22
-  Input files are created by running Bismark's \code{methylation_extractor} and \code{genome_methylation_bismark2bedGraph_v4.pl} scripts over the Bismark alignment file. For example (run from the command line):
25
+  Input files are created by running Bismark's
26
+  \code{methylation_extractor} and
27
+  \code{genome_methylation_bismark2bedGraph_v4.pl} scripts over the
28
+  Bismark alignment file. For example (run from the command line): 
23 29
   
24
-  \code{methylation_extractor -s --comprehensive test_data.fastq_bismark.sam}
30
+  \code{methylation_extractor -s --comprehensive
31
+  test_data.fastq_bismark.sam}
25 32
   
26
-  \code{genome_methylation_bismark2bedGraph_v4.pl --counts CpG_context_test_data.fastq_bismark.txt > CpG_context_test_data.fastq_bismark.bedGraph}
33
+  \code{genome_methylation_bismark2bedGraph_v4.pl --counts
34
+  CpG_context_test_data.fastq_bismark.txt >
35
+  CpG_context_test_data.fastq_bismark.bedGraph} 
27 36
   
28
-  The \code{--comprehensive} argument to \code{methylation_extractor} and the \code{--counts} argument to \code{genome_methylation_bismark2bedGraph_v4.pl} are required.
37
+  The \code{--comprehensive} argument to \code{methylation_extractor}
38
+  and the \code{--counts} argument to
39
+  \code{genome_methylation_bismark2bedGraph_v4.pl} are required. 
29 40
   
30
-  In this example, the file \code{CpG_context_test_data.fastq_bismark.bedGraph} is then the input file to \code{read.bismark}. 
41
+  In this example, the file
42
+  \code{CpG_context_test_data.fastq_bismark.bedGraph} is then the input
43
+  file to \code{read.bismark}.  
31 44
   
32
-  See \url{http://rpubs.com/PeteHaitch/readBismark} for a worked example using Bismark and \code{read.bismark}. Please consult the Bismark website for full details of these scripts and the latest versions (\url{http://www.bioinformatics.babraham.ac.uk/projects/download.html#bismark})
45
+  See \url{http://rpubs.com/PeteHaitch/readBismark} for a worked example
46
+  using Bismark and \code{read.bismark}. Please consult the Bismark
47
+  website for full details of these scripts and the latest versions
48
+  (\url{http://www.bioinformatics.babraham.ac.uk/projects/download.html#bismark}) 
33 49
 }
34 50
 \value{
35
-  Either an object of class \code{"BSseq"} (if \code{returnRaw = FALSE})
36
-  or a list of \code{"GRanges"} which each component coming from a
37
-  file. 
51
+  An object of class \code{"BSseq"}.
38 52
 }
39 53
 
40 54
 \seealso{
41
-  \code{\link{read.bsmooth}} for parsing output from the BSmooth alignment suite. \code{\link{read.umtab}} for parsing legacy (old) formats from the
42
-  BSmooth alignment suite.  \code{\link{collapseBSseq}} for collapse
43
-  (merging or summing) the data in two different directories.
55
+  \code{\link{read.bsmooth}} for parsing output from the BSmooth
56
+  alignment suite. \code{\link{read.umtab}} for parsing legacy (old)
57
+  formats from the  BSmooth alignment suite.
58
+  \code{\link{collapseBSseq}} for collapse  (merging or summing) the
59
+  data in two different directories. 
44 60
 }
45 61
 
46 62
 \examples{
47 63
   \dontrun{
48
-  bismarkBedGraph <- system.file('data/CpG_context_test_data.fastq_bismark.bedGraph', package = 'bsseq')
49
-  bismarkBSseq <- read.bismark(files = bismarkBedGraph, sampleNames = 'test_data', returnRaw = FALSE, rmZeroCov = FALSE, verbose = TRUE)
64
+  bismarkBedGraph <- system.file("extdata/CpG_context_test_data.fastq_bismark.bedGraph", package = 'bsseq')
65
+  bismarkBSseq <- read.bismark(files = bismarkBedGraph, sampleNames = "test_data", rmZeroCov = FALSE, verbose = TRUE)
50 66
   bismarkBSseq
51 67
   }
52 68
 }
53 69
 
54 70
 \author{
55 71
   Peter Hickey \email{peter.hickey@gmail.com}
56
-
57 72
 }