git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/bsseq@73977 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -251,15 +251,32 @@ combineList <- function(x, ...) { |
251 | 251 |
stopifnot(all(sapply(x, class) == "BSseq")) |
252 | 252 |
gr <- getBSseq(x[[1]], "gr") |
253 | 253 |
trans <- getBSseq(x[[1]], "trans") |
254 |
- ok <- sapply(x[-1], function(xx) { |
|
255 |
- identical(gr, getBSseq(xx, "gr")) && |
|
256 |
- identical(trans, getBSseq(xx, "trans")) |
|
254 |
+ sameTrans <- sapply(x[-1], function(xx) { |
|
255 |
+ identical(trans, getBSseq(xx, "trans")) |
|
257 | 256 |
}) |
258 |
- if(!all(ok)) |
|
259 |
- stop("all elements of '...' in combineList needs to have the same granges and trans") |
|
260 |
- M <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "M"))) |
|
261 |
- Cov <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "Cov"))) |
|
262 |
- if(any(!sapply(x, hasBeenSmoothed))) { |
|
257 |
+ if(!all(sameTrans)) |
|
258 |
+ stop("all elements of '...' in combineList needs to have the same trans") |
|
259 |
+ sameGr <- sapply(x[-1], function(xx) { |
|
260 |
+ identical(trans, getBSseq(xx, "gr")) |
|
261 |
+ }) |
|
262 |
+ if(all(sameGr)) { |
|
263 |
+ M <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "M"))) |
|
264 |
+ Cov <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "Cov"))) |
|
265 |
+ } else { |
|
266 |
+ gr <- sort(reduce(do.call(c, unname(lapply(x, granges))), min.gapwidth = 0L)) |
|
267 |
+ sampleNames <- do.call(c, lapply(x, sampleNames)) |
|
268 |
+ M <- matrix(0, ncol = length(sampleNames), nrow = length(gr)) |
|
269 |
+ colnames(M) <- sampleNames |
|
270 |
+ Cov <- M |
|
271 |
+ idxes <- split(seq(along = sampleNames), rep(seq(along = x), times = sapply(x, ncol))) |
|
272 |
+ for(ii in seq(along = idxes)) { |
|
273 |
+ ov <- findOverlaps(gr, granges(x[[ii]])) |
|
274 |
+ idx <- idxes[[ii]] |
|
275 |
+ M[queryHits(ov), idx] <- getBSseq(x[[ii]], "M")[subjectHits(ov),] |
|
276 |
+ Cov[queryHits(ov), idx] <- getBSseq(x[[ii]], "Cov")[subjectHits(ov),] |
|
277 |
+ } |
|
278 |
+ } |
|
279 |
+ if(any(!sapply(x, hasBeenSmoothed)) || !(all(sameGr))) { |
|
263 | 280 |
coef <- NULL |
264 | 281 |
se.coef <- NULL |
265 | 282 |
trans <- NULL |
... | ... |
@@ -56,7 +56,7 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg |
56 | 56 |
lines(xx, yy, col = col, lty = lty, lwd = lwd) |
57 | 57 |
} |
58 | 58 |
|
59 |
-.bsPlotPoints <- function(x, y, z, col) { |
|
59 |
+.bsPlotPoints <- function(x, y, z, col, pointsMinCov) { |
|
60 | 60 |
points(x[z>pointsMinCov], y[z>pointsMinCov], col = col, pch = 16, cex = 0.5) |
61 | 61 |
} |
62 | 62 |
|
... | ... |
@@ -147,7 +147,7 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg |
147 | 147 |
|
148 | 148 |
.plotSmoothData <- function(BSseq, region, extend, addRegions, col, lty, lwd, regionCol, |
149 | 149 |
addTicks, addPoints, pointsMinCov, highlightMain) { |
150 |
- gr <- .bsGetGr(Bsseq, region, extend) |
|
150 |
+ gr <- .bsGetGr(BSseq, region, extend) |
|
151 | 151 |
BSseq <- subsetByOverlaps(BSseq, gr) |
152 | 152 |
|
153 | 153 |
## Extract basic information |
... | ... |
@@ -186,7 +186,7 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg |
186 | 186 |
if(addPoints) { |
187 | 187 |
sapply(sampleNames(BSseq), function(samp) { |
188 | 188 |
.bsPlotPoints(positions, rawPs[, samp], coverage[, samp], |
189 |
- col = colEtc$col[samp]) |
|
189 |
+ col = colEtc$col[samp], pointsMinCov = pointsMinCov) |
|
190 | 190 |
}) |
191 | 191 |
} |
192 | 192 |
} |
... | ... |
@@ -1,69 +1,64 @@ |
1 |
-read.bismark <- function(files, sampleNames, returnRaw = FALSE, rmZeroCov = FALSE, verbose = TRUE){ |
|
2 |
- ## Argument checking |
|
3 |
- if (anyDuplicated(files)){ |
|
4 |
- stop("duplicate entries in 'files'") |
|
5 |
- } |
|
6 |
- if (length(sampleNames) != length(files) | anyDuplicated(sampleNames)){ |
|
7 |
- stop("argument 'sampleNames' has to have the same length as argument 'files', without duplicate entries") |
|
8 |
- } |
|
9 |
- ## Process each file |
|
10 |
- idxes <- seq_along(files) |
|
11 |
- names(idxes) <- sampleNames |
|
12 |
- allOut <- lapply(idxes, function(ii){ |
|
13 |
- if (verbose) { |
|
14 |
- cat(sprintf("Reading file '%s' ... ", files[ii])) |
|
1 |
+read.bismark <- function(files, sampleNames, rmZeroCov = FALSE, verbose = TRUE){ |
|
2 |
+ ## Argument checking |
|
3 |
+ if (anyDuplicated(files)){ |
|
4 |
+ stop("duplicate entries in 'files'") |
|
15 | 5 |
} |
16 |
- stime <- system.time({ |
|
17 |
- if (returnRaw) { |
|
18 |
- out <- read.bismarkFileRaw(thisfile = files[ii]) |
|
19 |
- } else{ |
|
20 |
- raw <- read.bismarkFileRaw(thisfile = files[ii]) |
|
21 |
- M <- matrix(elementMetadata(raw)[, "mCount"], ncol = 1) |
|
22 |
- Cov <- M + elementMetadata(raw)[, "uCount"] |
|
23 |
- elementMetadata(raw) <- NULL |
|
24 |
- out <- BSseq(gr = raw, M = M, Cov = Cov, sampleNames = sampleNames[ii], rmZeroCov = rmZeroCov) |
|
25 |
- } |
|
26 |
- })[3] |
|
27 |
- if (verbose) { |
|
28 |
- cat(sprintf("in %.1f secs\n", stime)) |
|
6 |
+ if (length(sampleNames) != length(files) | anyDuplicated(sampleNames)){ |
|
7 |
+ stop("argument 'sampleNames' has to have the same length as argument 'files', without duplicate entries") |
|
29 | 8 |
} |
30 |
- out |
|
31 |
- }) |
|
32 |
- if (!returnRaw) { |
|
9 |
+ ## Process each file |
|
10 |
+ idxes <- seq_along(files) |
|
11 |
+ names(idxes) <- sampleNames |
|
12 |
+ allOut <- lapply(idxes, function(ii){ |
|
13 |
+ if (verbose) { |
|
14 |
+ cat(sprintf("Reading file '%s' ... ", files[ii])) |
|
15 |
+ } |
|
16 |
+ stime <- system.time({ |
|
17 |
+ raw <- read.bismarkFileRaw(thisfile = files[ii]) |
|
18 |
+ M <- matrix(elementMetadata(raw)[, "mCount"], ncol = 1) |
|
19 |
+ Cov <- M + elementMetadata(raw)[, "uCount"] |
|
20 |
+ elementMetadata(raw) <- NULL |
|
21 |
+ out <- BSseq(gr = raw, M = M, Cov = Cov, |
|
22 |
+ sampleNames = sampleNames[ii], rmZeroCov = rmZeroCov) |
|
23 |
+ })[3] |
|
24 |
+ if (verbose) { |
|
25 |
+ cat(sprintf("in %.1f secs\n", stime)) |
|
26 |
+ } |
|
27 |
+ out |
|
28 |
+ }) |
|
33 | 29 |
if (verbose) { |
34 |
- cat(sprintf("Joining samples ... ")) |
|
30 |
+ cat(sprintf("Joining samples ... ")) |
|
35 | 31 |
} |
36 | 32 |
stime <- system.time({ |
37 |
- allOut <- Reduce("combine", allOut) |
|
33 |
+ allOut <- combineList(allOut) |
|
38 | 34 |
})[3] |
39 | 35 |
if (verbose) { |
40 |
- cat(sprintf("in %.1f secs\n", stime)) |
|
36 |
+ cat(sprintf("in %.1f secs\n", stime)) |
|
41 | 37 |
} |
42 |
- } |
|
43 |
- allOut |
|
38 |
+ allOut |
|
44 | 39 |
} |
45 | 40 |
|
46 | 41 |
read.bismarkFileRaw <- function(thisfile, verbose = TRUE){ |
47 |
- ## Set up the 'what' argument for scan() |
|
48 |
- columnHeaders <- c("chr", "start", "end", "mPerc", "mCount", "uCount") |
|
49 |
- what0 <- replicate(length(columnHeaders), character(0)) |
|
50 |
- names(what0) <- columnHeaders |
|
51 |
- int <- which(columnHeaders %in% c("start", "end", "mCount", "uCount")) |
|
52 |
- what0[int] <- replicate(length(int), integer(0)) |
|
53 |
- dbl <- which(columnHeaders %in% "mPerc") |
|
54 |
- what0[dbl] <- replicate(length(dbl), double(0)) |
|
55 |
- ## Read in the file |
|
56 |
- if (grepl("\\.gz$", thisfile)) |
|
57 |
- con <- gzfile(thisfile) |
|
58 |
- else |
|
59 |
- con <- file(thisfile, open = "r") |
|
60 |
- out <- scan(file = con, what = what0, sep = "\t", quote = "", na.strings = "NA", quiet = TRUE) |
|
61 |
- close(con) |
|
62 |
- ## Create GRanges instance from 'out' |
|
63 |
- gr <- GRanges(seqnames = out[["chr"]], ranges = IRanges(start = out[["start"]], width = 1)) |
|
64 |
- out[["chr"]] <- out[["start"]] <- out[["end"]] <- NULL |
|
65 |
- out <- out[!sapply(out, is.null)] |
|
66 |
- df <- DataFrame(out) |
|
67 |
- elementMetadata(gr) <- df |
|
68 |
- gr |
|
42 |
+ ## Set up the 'what' argument for scan() |
|
43 |
+ columnHeaders <- c("chr", "start", "end", "mPerc", "mCount", "uCount") |
|
44 |
+ what0 <- replicate(length(columnHeaders), character(0)) |
|
45 |
+ names(what0) <- columnHeaders |
|
46 |
+ int <- which(columnHeaders %in% c("start", "end", "mCount", "uCount")) |
|
47 |
+ what0[int] <- replicate(length(int), integer(0)) |
|
48 |
+ null <- which(columnHeaders %in% "mPerc") |
|
49 |
+ what0[null] <- replicate(length(null), NULL) |
|
50 |
+ ## Read in the file |
|
51 |
+ if (grepl("\\.gz$", thisfile)) |
|
52 |
+ con <- gzfile(thisfile) |
|
53 |
+ else |
|
54 |
+ con <- file(thisfile, open = "r") |
|
55 |
+ out <- scan(file = con, what = what0, sep = "\t", quote = "", na.strings = "NA", quiet = TRUE) |
|
56 |
+ close(con) |
|
57 |
+ ## Create GRanges instance from 'out' |
|
58 |
+ gr <- GRanges(seqnames = out[["chr"]], ranges = IRanges(start = out[["start"]], width = 1)) |
|
59 |
+ out[["chr"]] <- out[["start"]] <- out[["end"]] <- NULL |
|
60 |
+ out <- out[!sapply(out, is.null)] |
|
61 |
+ df <- DataFrame(out) |
|
62 |
+ elementMetadata(gr) <- df |
|
63 |
+ gr |
|
69 | 64 |
} |
... | ... |
@@ -401,20 +401,17 @@ read.bsmoothDirRaw <- function(dir, seqnames = NULL, keepCycle = FALSE, keepFilt |
401 | 401 |
|
402 | 402 |
|
403 | 403 |
sampleRawToBSseq <- function(gr, qualityCutoff = 20, sampleName = NULL, rmZeroCov = FALSE) { |
404 |
- numberQualsGreaterThan.old <- function(char) { |
|
405 |
- quals <- as.integer(charToRaw(char)) - 33L |
|
406 |
- sum(quals >= qualityCutoff) |
|
407 |
- } |
|
408 | 404 |
numberQualsGreaterThan <- function(cvec) { |
409 | 405 |
onestring <- paste(cvec, collapse = "") |
410 | 406 |
greater <- (as.integer(charToRaw(onestring)) - 33L >= qualityCutoff) |
411 |
- tapply(greater, rep(1:length(cvec), times = nchar(cvec)), sum) |
|
407 |
+ out <- tapply(greater, rep(1:length(cvec), times = nchar(cvec)), sum) |
|
408 |
+ out |
|
412 | 409 |
} |
413 | 410 |
strToCov <- function(vec) { |
414 | 411 |
Cov <- rep(0, length(vec)) |
415 | 412 |
wh <- which(! vec %in% c("", "0")) |
416 |
- ## Cov[wh] <- sapply(vec[wh], numberQualsGreaterThan.old) |
|
417 |
- Cov[wh] <- numberQualsGreaterThan(vec[wh]) |
|
413 |
+ if(length(wh) > 0) |
|
414 |
+ Cov[wh] <- numberQualsGreaterThan(vec[wh]) |
|
418 | 415 |
Cov |
419 | 416 |
} |
420 | 417 |
M <- matrix(strToCov(elementMetadata(gr)[, "Mstr"]), ncol = 1) |
... | ... |
@@ -456,8 +453,7 @@ read.bsmooth <- function(dirs, sampleNames = NULL, seqnames = NULL, returnRaw = |
456 | 453 |
if(!returnRaw) { |
457 | 454 |
if(verbose) cat(sprintf("Joining samples ... ")) |
458 | 455 |
stime <- system.time({ |
459 |
- ## FIXME: we can do much better than this ... need to write combineList |
|
460 |
- allOut <- Reduce("combine", allOut) |
|
456 |
+ allOut <- combineList(allOut) |
|
461 | 457 |
})[3] |
462 | 458 |
if(verbose) cat(sprintf("in %.1f secs\n", stime)) |
463 | 459 |
} |
... | ... |
@@ -4,8 +4,15 @@ |
4 | 4 |
|
5 | 5 |
\section{Version 0.7.x}{ |
6 | 6 |
\itemize{ |
7 |
- \item Exposed combineList as a faster alternative to Reduce(combine, |
|
8 |
- list). |
|
7 |
+ \item Removed the returnRaw argument to read.bismark() as it was |
|
8 |
+ unnecessary (Bismark output files does not have additional |
|
9 |
+ information beyond M and Cov and genomic positions, unlike BSmooth). |
|
10 |
+ \item Moved the Bismark example data from data to inst/extdata. |
|
11 |
+ \item combineList() now deals with the case where the list of BSseq |
|
12 |
+ objects have different genomic locations. This speeds up |
|
13 |
+ read.bismark() substantially. |
|
14 |
+ \item Exposed combineList() as a faster alternative to |
|
15 |
+ Reduce(combine, list). |
|
9 | 16 |
\item Updated the code for the plotting routines (plotRegion). This |
10 | 17 |
should not have an impact on user-visible code. |
11 | 18 |
\item Added read.bismark() function to parse output from the Bismark |
... | ... |
@@ -7,51 +7,66 @@ |
7 | 7 |
Parsing output from the Bismark alignment suite. |
8 | 8 |
} |
9 | 9 |
\usage{ |
10 |
- read.bismark(files, sampleNames, returnRaw = FALSE, rmZeroCov = FALSE, verbose = TRUE) |
|
10 |
+ read.bismark(files, sampleNames, rmZeroCov = FALSE, verbose = TRUE) |
|
11 | 11 |
} |
12 | 12 |
\arguments{ |
13 |
- \item{files}{Input files. Each sample is in a different file. Input files are created by running Bismark's \code{methylation_extractor}; see Note for details.} |
|
13 |
+ \item{files}{Input files. Each sample is in a different file. Input |
|
14 |
+ files are created by running Bismark's \code{methylation_extractor}; |
|
15 |
+ see Note for details.} |
|
14 | 16 |
\item{sampleNames}{sample names, based on the order of \code{files}.} |
15 |
- \item{returnRaw}{Should the function return the complete information in the output files?} |
|
16 |
- \item{rmZeroCov}{Should methylation loci that have zero coverage in all samples be removed. This will result in a much smaller object if the data originates from (targeted) capture bisulfite sequencing.} |
|
17 |
+ \item{rmZeroCov}{Should methylation loci that have zero coverage in |
|
18 |
+ all samples be removed. This will result in a much smaller object if |
|
19 |
+ the data originates from (targeted) capture bisulfite sequencing.} |
|
17 | 20 |
\item{verbose}{Make the function verbose.} |
18 | 21 |
} |
19 | 22 |
\note{ |
20 | 23 |
Input files can either be gzipped or not. |
21 | 24 |
|
22 |
- Input files are created by running Bismark's \code{methylation_extractor} and \code{genome_methylation_bismark2bedGraph_v4.pl} scripts over the Bismark alignment file. For example (run from the command line): |
|
25 |
+ Input files are created by running Bismark's |
|
26 |
+ \code{methylation_extractor} and |
|
27 |
+ \code{genome_methylation_bismark2bedGraph_v4.pl} scripts over the |
|
28 |
+ Bismark alignment file. For example (run from the command line): |
|
23 | 29 |
|
24 |
- \code{methylation_extractor -s --comprehensive test_data.fastq_bismark.sam} |
|
30 |
+ \code{methylation_extractor -s --comprehensive |
|
31 |
+ test_data.fastq_bismark.sam} |
|
25 | 32 |
|
26 |
- \code{genome_methylation_bismark2bedGraph_v4.pl --counts CpG_context_test_data.fastq_bismark.txt > CpG_context_test_data.fastq_bismark.bedGraph} |
|
33 |
+ \code{genome_methylation_bismark2bedGraph_v4.pl --counts |
|
34 |
+ CpG_context_test_data.fastq_bismark.txt > |
|
35 |
+ CpG_context_test_data.fastq_bismark.bedGraph} |
|
27 | 36 |
|
28 |
- The \code{--comprehensive} argument to \code{methylation_extractor} and the \code{--counts} argument to \code{genome_methylation_bismark2bedGraph_v4.pl} are required. |
|
37 |
+ The \code{--comprehensive} argument to \code{methylation_extractor} |
|
38 |
+ and the \code{--counts} argument to |
|
39 |
+ \code{genome_methylation_bismark2bedGraph_v4.pl} are required. |
|
29 | 40 |
|
30 |
- In this example, the file \code{CpG_context_test_data.fastq_bismark.bedGraph} is then the input file to \code{read.bismark}. |
|
41 |
+ In this example, the file |
|
42 |
+ \code{CpG_context_test_data.fastq_bismark.bedGraph} is then the input |
|
43 |
+ file to \code{read.bismark}. |
|
31 | 44 |
|
32 |
- See \url{http://rpubs.com/PeteHaitch/readBismark} for a worked example using Bismark and \code{read.bismark}. Please consult the Bismark website for full details of these scripts and the latest versions (\url{http://www.bioinformatics.babraham.ac.uk/projects/download.html#bismark}) |
|
45 |
+ See \url{http://rpubs.com/PeteHaitch/readBismark} for a worked example |
|
46 |
+ using Bismark and \code{read.bismark}. Please consult the Bismark |
|
47 |
+ website for full details of these scripts and the latest versions |
|
48 |
+ (\url{http://www.bioinformatics.babraham.ac.uk/projects/download.html#bismark}) |
|
33 | 49 |
} |
34 | 50 |
\value{ |
35 |
- Either an object of class \code{"BSseq"} (if \code{returnRaw = FALSE}) |
|
36 |
- or a list of \code{"GRanges"} which each component coming from a |
|
37 |
- file. |
|
51 |
+ An object of class \code{"BSseq"}. |
|
38 | 52 |
} |
39 | 53 |
|
40 | 54 |
\seealso{ |
41 |
- \code{\link{read.bsmooth}} for parsing output from the BSmooth alignment suite. \code{\link{read.umtab}} for parsing legacy (old) formats from the |
|
42 |
- BSmooth alignment suite. \code{\link{collapseBSseq}} for collapse |
|
43 |
- (merging or summing) the data in two different directories. |
|
55 |
+ \code{\link{read.bsmooth}} for parsing output from the BSmooth |
|
56 |
+ alignment suite. \code{\link{read.umtab}} for parsing legacy (old) |
|
57 |
+ formats from the BSmooth alignment suite. |
|
58 |
+ \code{\link{collapseBSseq}} for collapse (merging or summing) the |
|
59 |
+ data in two different directories. |
|
44 | 60 |
} |
45 | 61 |
|
46 | 62 |
\examples{ |
47 | 63 |
\dontrun{ |
48 |
- bismarkBedGraph <- system.file('data/CpG_context_test_data.fastq_bismark.bedGraph', package = 'bsseq') |
|
49 |
- bismarkBSseq <- read.bismark(files = bismarkBedGraph, sampleNames = 'test_data', returnRaw = FALSE, rmZeroCov = FALSE, verbose = TRUE) |
|
64 |
+ bismarkBedGraph <- system.file("extdata/CpG_context_test_data.fastq_bismark.bedGraph", package = 'bsseq') |
|
65 |
+ bismarkBSseq <- read.bismark(files = bismarkBedGraph, sampleNames = "test_data", rmZeroCov = FALSE, verbose = TRUE) |
|
50 | 66 |
bismarkBSseq |
51 | 67 |
} |
52 | 68 |
} |
53 | 69 |
|
54 | 70 |
\author{ |
55 | 71 |
Peter Hickey \email{peter.hickey@gmail.com} |
56 |
- |
|
57 | 72 |
} |