git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/bsseq@78387 bc3139a8-67e5-0310-9ffc-ced21a209358
...
@@ -1,5 +1,5 @@
 Package: bsseq
-Version: 0.9.7
+Version: 0.9.8
 Title: Analyze, manage and store bisulfite sequencing data
 Description: Tools for analyzing and visualizing bisulfite sequencing data
 Author: Kasper Daniel Hansen
...
@@ -9,7 +9,7 @@ Depends: R (>= 2.15), methods, BiocGenerics, IRanges, GenomicRanges,
 Imports: scales, stats, graphics, Biobase, locfit
 Suggests: RUnit, bsseqData
 Collate: hasGRanges.R BSseq_class.R BSseqTstat_class.R BSseq_utils.R combine.R
-        utils.R read.bsmooth.R read.bismark.R BSmooth.R dmrFinder.R
+        utils.R read.bsmooth.R read.bismark.R BSmooth.R BSmooth.tstat.R dmrFinder.R
         gof_stats.R plotting.R fisher.R fixes.R
 License: Artistic-2.0
 LazyData: yes
...
@@ -148,128 +148,3 @@ BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sam
 }
 
 
-BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paired", "group2"),
-                          local.correct = TRUE, maxGap = NULL, qSd = 0.75, k = 101, mc.cores = 1, verbose = TRUE){
-    smoothSd <- function(Sds, k) {
-        k0 <- floor(k/2)
-        if(all(is.na(Sds))) return(Sds)
-        thresSD <- pmax(Sds, quantile(Sds, qSd, na.rm = TRUE), na.rm = TRUE)
-        addSD <- rep(median(Sds, na.rm = TRUE), k0)
-        sSds <- as.vector(runmean(Rle(c(addSD, thresSD, addSD)), k = k))
-        sSds
-    }
-    compute.correction <- function(idx, qSd = 0.75) {
-        xx <- start(BSseq)[idx]
-        yy <- tstat[idx]
-        suppressWarnings({
-            drange <- diff(range(xx, na.rm = TRUE))
-        })
-        if(drange <= 25000)
-            return(yy)
-        tstat.function <- approxfun(xx, yy)
-        xx.reg <- seq(from = min(xx), to = max(xx), by = 2000)
-        yy.reg <- tstat.function(xx.reg)
-        fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0),
-                      family = "huber", maxk = 50000)
-        correction <- predict(fit, newdata = data.frame(xx.reg = xx))
-        yy - correction
-    }
-
-    estimate.var <- match.arg(estimate.var)
-    stopifnot(is(BSseq, "BSseq"))
-    stopifnot(hasBeenSmoothed(BSseq))
-    if(is.numeric(group1)) {
-        stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq))
-        group1 <- sampleNames(BSseq)[group1]
-    }
-    if(is.numeric(group2)) {
-        stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq))
-        group2 <- sampleNames(BSseq)[group2]
-    }
-    stopifnot(length(intersect(group1, group2)) == 0)
-    if(estimate.var == "paired")
-        stopifnot(length(group1) == length(group2))
-    stopifnot(all(c(group1, group2) %in% sampleNames(BSseq)))
-
-    if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0))
-        warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first")
-
-    if(is.null(maxGap))
-        maxGap <- BSseq@parameters$maxGap
-
-    if(verbose) cat("preprocessing ... ")
-    stime <- system.time({
-        clusterIdx <- makeClusters(BSseq, maxGap = maxGap, mc.cores = mc.cores)
-    })[3]
-    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
-
-    if(verbose) cat("computing stats within groups ... ")
-    stime <- system.time({
-        allPs <- getMeth(BSseq, type = "smooth", what = "perBase",
-                         confint = FALSE)[, c(group1, group2)]
-        group1.means <- rowMeans(allPs[, group1, drop = FALSE], na.rm = TRUE)
-        group2.means <- rowMeans(allPs[, group2, drop = FALSE], na.rm = TRUE)
-    })[3]
-    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
-
-    if(verbose) cat("computing stats across groups ... ")
-    stime <- system.time({
-        switch(estimate.var,
-               "group2" = {
-                   rawSds <- rowSds(allPs[, group2, drop = FALSE], na.rm = TRUE)
-                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
-                       smoothSd(rawSds[idx], k = k)
-                   }, mc.cores = mc.cores))
-                   scale <- sqrt(1/length(group1) + 1/length(group2))
-                   tstat.sd <- smoothSds * scale
-               },
-               "same" = {
-                   rawSds <- sqrt( ((length(group1) - 1) * rowVars(allPs[, group1, drop = FALSE]) +
-                                    (length(group2) - 1) * rowVars(allPs[, group2, drop = FALSE])) /
-                                   (length(group1) + length(group2) - 2))
-                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
-                       smoothSd(rawSds[idx], k = k)
-                   }, mc.cores = mc.cores))
-                   scale <- sqrt(1/length(group1) + 1/length(group2))
-                   tstat.sd <- smoothSds * scale
-               },
-               "paired" = {
-                   rawSds <- rowSds(allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE])
-                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
-                       smoothSd(rawSds[idx], k = k)
-                   }, mc.cores = mc.cores))
-                   scale <- sqrt(1/length(group1))
-                   tstat.sd <- smoothSds * scale
-               })
-        tstat <- (group1.means - group2.means) / tstat.sd
-        is.na(tstat)[tstat.sd == 0] <- TRUE
-        if(local.correct) {
-            tstat.corrected <- do.call(c, mclapply(clusterIdx,
-                                                   compute.correction, qSd = qSd,
-                                                   mc.cores = mc.cores))
-        }
-    })[3]
-    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
-
-    if(local.correct) {
-        stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
-                       tstat, tstat.corrected)
-        colnames(stats) <- c("rawSds", "tstat.sd",
-                             "group2.means", "group1.means", "tstat",
-                             "tstat.corrected")
-
-    } else {
-        stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
-                       tstat)
-        colnames(stats) <- c("rawSds", "tstat.sd",
-                             "group2.means", "group1.means", "tstat")
-    }
-
-    parameters <- c(BSseq@parameters,
-                    list(tstatText = sprintf("BSmooth.tstat (local.correct = %s, maxGap = %d)",
-                                             local.correct, maxGap),
-                         group1 = group1, group2 = group2, k = k, qSd = qSd,
-                         local.correct = local.correct, maxGap = maxGap))
-    out <- BSseqTstat(gr = granges(BSseq), stats = stats, parameters = parameters)
-    out
-}
new file mode 100644
...
@@ -0,0 +1,130 @@
+BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paired", "group2"),
+                          local.correct = TRUE, maxGap = NULL, qSd = 0.75, k = 101, mc.cores = 1, verbose = TRUE){
+    smoothSd <- function(Sds, k) {
+        k0 <- floor(k/2)
+        if(all(is.na(Sds))) return(Sds)
+        thresSD <- pmax(Sds, quantile(Sds, qSd, na.rm = TRUE), na.rm = TRUE)
+        addSD <- rep(median(Sds, na.rm = TRUE), k0)
+        sSds <- as.vector(runmean(Rle(c(addSD, thresSD, addSD)), k = k))
+        sSds
+    }
+    compute.correction <- function(idx, qSd = 0.75) {
+        xx <- start(BSseq)[idx]
+        yy <- tstat[idx]
+        suppressWarnings({
+            drange <- diff(range(xx, na.rm = TRUE))
+        })
+        if(drange <= 25000)
+            return(yy)
+        tstat.function <- approxfun(xx, yy)
+        xx.reg <- seq(from = min(xx), to = max(xx), by = 2000)
+        yy.reg <- tstat.function(xx.reg)
+        fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0),
+                      family = "huber", maxk = 50000)
+        correction <- predict(fit, newdata = data.frame(xx.reg = xx))
+        yy - correction
+    }
+
+    estimate.var <- match.arg(estimate.var)
+    stopifnot(is(BSseq, "BSseq"))
+    stopifnot(hasBeenSmoothed(BSseq))
+    if(is.character(group1)) {
+        stopifnot(all(group1 %in% sampleNames(BSseq)))
+        group1 <- match(group1, sampleNames(BSseq))
+    }
+    if(is.numeric(group1)) {
+        stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq))
+    } else stop("problems with argument 'group1'")
+    if(is.character(group2)) {
+        stopifnot(all(group2 %in% sampleNames(BSseq)))
+        group2 <- match(group2, sampleNames(BSseq))
+    }
+    if(is.numeric(group2)) {
+        stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq))
+    } else stop("problems with argument 'group2'")
+    stopifnot(length(intersect(group1, group2)) == 0)
+    if(estimate.var == "paired")
+        stopifnot(length(group1) == length(group2))
+
+    if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0))
+        warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first")
+
+    if(is.null(maxGap))
+        maxGap <- BSseq@parameters$maxGap
+
+    if(verbose) cat("[BSmooth.tstat] preprocessing ... ")
+    stime <- system.time({
+        clusterIdx <- makeClusters(BSseq, maxGap = maxGap, mc.cores = mc.cores)
+    })[3]
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
+
+    if(verbose) cat("[BSmooth.tstat] computing stats within groups ... ")
+    stime <- system.time({
+        allPs <- getMeth(BSseq, type = "smooth", what = "perBase",
+                         confint = FALSE)[, c(group1, group2)]
+        group1.means <- rowMeans(allPs[, group1, drop = FALSE], na.rm = TRUE)
+        group2.means <- rowMeans(allPs[, group2, drop = FALSE], na.rm = TRUE)
+    })[3]
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
+
+    if(verbose) cat("[BSmooth.tstat] computing stats across groups ... ")
+    stime <- system.time({
+        switch(estimate.var,
+               "group2" = {
+                   rawSds <- rowSds(allPs[, group2, drop = FALSE], na.rm = TRUE)
+                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
+                       smoothSd(rawSds[idx], k = k)
+                   }, mc.cores = mc.cores))
+                   scale <- sqrt(1/length(group1) + 1/length(group2))
+                   tstat.sd <- smoothSds * scale
+               },
+               "same" = {
+                   rawSds <- sqrt( ((length(group1) - 1) * rowVars(allPs[, group1, drop = FALSE]) +
+                                    (length(group2) - 1) * rowVars(allPs[, group2, drop = FALSE])) /
+                                   (length(group1) + length(group2) - 2))
+                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
+                       smoothSd(rawSds[idx], k = k)
+                   }, mc.cores = mc.cores))
+                   scale <- sqrt(1/length(group1) + 1/length(group2))
+                   tstat.sd <- smoothSds * scale
+               },
+               "paired" = {
+                   rawSds <- rowSds(allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE])
+                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
+                       smoothSd(rawSds[idx], k = k)
+                   }, mc.cores = mc.cores))
+                   scale <- sqrt(1/length(group1))
+                   tstat.sd <- smoothSds * scale
+               })
+        tstat <- (group1.means - group2.means) / tstat.sd
+        is.na(tstat)[tstat.sd == 0] <- TRUE
+        if(local.correct) {
+            tstat.corrected <- do.call(c, mclapply(clusterIdx,
+                                                   compute.correction, qSd = qSd,
+                                                   mc.cores = mc.cores))
+        }
+    })[3]
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
+
+    if(local.correct) {
+        stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
+                       tstat, tstat.corrected)
+        colnames(stats) <- c("rawSds", "tstat.sd",
+                             "group2.means", "group1.means", "tstat",
+                             "tstat.corrected")
+
+    } else {
+        stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
+                       tstat)
+        colnames(stats) <- c("rawSds", "tstat.sd",
+                             "group2.means", "group1.means", "tstat")
+    }
+
+    parameters <- c(BSseq@parameters,
+                    list(tstatText = sprintf("BSmooth.tstat (local.correct = %s, maxGap = %d)",
+                                             local.correct, maxGap),
+                         group1 = group1, group2 = group2, k = k, qSd = qSd,
+                         local.correct = local.correct, maxGap = maxGap))
+    out <- BSseqTstat(gr = granges(BSseq), stats = stats, parameters = parameters)
+    out
+}
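Editorial note (not part of the diff): a minimal usage sketch of the relocated BSmooth.tstat(). It assumes the bsseqData example object (BS.cancer.ex, with samples C1-C3 and N1-N3, as used in the vignette change below) and a prior smoothing step; any smoothed BSseq object with two sample groups would work the same way.

    ## Illustrative sketch only; object and sample names come from bsseqData.
    library(bsseq)
    library(bsseqData)
    data(BS.cancer.ex)
    BS.fit <- BSmooth(BS.cancer.ex, verbose = TRUE)        # smooth before computing t-statistics
    tstat <- BSmooth.tstat(BS.fit,
                           group1 = c("C1", "C2", "C3"),   # cancer samples
                           group2 = c("N1", "N2", "N3"),   # normal samples
                           estimate.var = "same",
                           local.correct = TRUE, verbose = TRUE)
    tstat                                                  # a BSseqTstat object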
...
@@ -37,3 +37,69 @@ BSseqTstat <- function(gr = NULL, stats = NULL, parameters = NULL) {
     out
 }
 
+summary.BSseqTstat <- function(object, ...) {
+    quant <- quantile(getStats(object)[, "tstat.corrected"],
+                      prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999))
+    quant <- t(t(quant))
+    colnames(quant) <- "quantiles"
+    out <- list(quantiles = quant)
+    class(out) <- "summary.BSseqTstat"
+    out
+}
+
+print.summary.BSseqTstat <- function(x, ...) {
+    print(as.matrix(x$quantiles))
+}
+
+plot.BSseqTstat <- function(x, y, ...) {
+    tstat <- getStats(x)[, "tstat"]
+    plot(density(tstat), xlim = c(-10,10), col = "blue", main = "")
+    if("tstat.corrected" %in% colnames(getStats(x))) {
+        tstat.cor <- getStats(x)[, "tstat.corrected"]
+        lines(density(tstat.cor), col = "black")
+        legend("topleft", legend = c("uncorrected", "corrected"), lty = c(1,1),
+               col = c("blue", "black"))
+    } else {
+        legend("topleft", legend = c("uncorrected"), lty = 1,
+               col = c("blue"))
+    }
+}
+
+getStats <- function(BSseqTstat, regions = NULL, column = "tstat.corrected") {
+    stopifnot(is(BSseqTstat, "BSseqTstat"))
+    stopifnot(column %in% colnames(BSseqTstat@stats))
+    if(class(regions) == "data.frame")
+        regions <- data.frame2GRanges(regions)
+    if(is.null(regions))
+        return(BSseqTstat@stats)
+    stopifnot(is(regions, "GenomicRanges"))
+    ov <- findOverlaps(BSseqTstat, regions)
+    ov.sp <- split(queryHits(ov), subjectHits(ov))
+    getRegionStats <- function(idx) {
+        mat <- BSseqTstat@stats[idx,, drop=FALSE]
+        areaStat <- sum(mat[, column])
+        maxStat <- max(mat[, column])
+        c(areaStat, maxStat)
+    }
+    stats <- matrix(NA, ncol = 2, nrow = length(regions))
+    colnames(stats) <- c("areaStat", "maxStat")
+    tmp <- lapply(ov.sp, getRegionStats)
+    stats[as.integer(names(tmp)),] <- do.call(rbind, tmp)
+    out <- as.data.frame(stats)
+    if(! column %in% c("tstat.corrected", "tstat"))
+        return(out)
+    getRegionStats_ttest <- function(idx) {
+        mat <- BSseqTstat@stats[idx,, drop=FALSE]
+        group1.mean <- mean(mat[, "group1.means"])
+        group2.mean <- mean(mat[, "group2.means"])
+        meanDiff <- mean(mat[, "group1.means"] - mat[, "group2.means"])
+        tstat.sd <- mean(mat[, "tstat.sd"])
+        c(meanDiff, group1.mean, group2.mean, tstat.sd)
+    }
+    stats_ttest <- matrix(NA, ncol = 4, nrow = length(regions))
+    colnames(stats_ttest) <- c("meanDiff", "group1.mean", "group2.mean", "tstat.sd")
+    tmp <- lapply(ov.sp, getRegionStats_ttest)
+    stats_ttest[as.integer(names(tmp)),] <- do.call(rbind, tmp)
+    out <- cbind(out, as.data.frame(stats_ttest))
+    out
+}
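Editorial note (not part of the diff): the added lines originally filled stats_ttest with lapply(ov.sp, getRegionStats); that fills a 4-column matrix with 2-column results, so the second lapply above has been corrected to use getRegionStats_ttest. A short, illustrative sketch of the new methods ('tstat' is a BSseqTstat as above; 'dmrs' is a data.frame or GRanges of regions, e.g. from dmrFinder further below):

    summary(tstat)          # quantiles of the corrected t-statistics
    plot(tstat)             # densities of uncorrected vs. corrected t-statistics
    head(getStats(tstat))   # without 'regions': the full per-locus stats matrix
    getStats(tstat, regions = dmrs, column = "tstat.corrected")
    ## per-region areaStat/maxStat, plus meanDiff, group means and tstat.sd
    ## when 'column' is one of the t-statistic columns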
...
@@ -1,23 +1,22 @@
 collapseBSseq <- function(BSseq, columns) {
     ## columns is a vector of new names, names(columns) is sampleNames or empty
+    stopifnot(is.character(columns))
     if(is.null(names(columns)) && length(columns) != ncol(BSseq))
         stop("if `columns' does not have names, it needs to be of the same length as `BSseq` has columns (samples)")
     if(!is.null(names(columns)) && !all(names(columns) %in% sampleNames(BSseq)))
         stop("if `columns` has names, they need to be sampleNames(BSseq)")
-    if(is.null(names(columns))) {
-        sp <- split(1:ncol(BSseq), columns)
-    } else {
-        sp <- split(names(columns), columns)
-    }
+    if(is.null(names(columns)))
+        columns.idx <- 1:ncol(BSseq)
+    else
+        columns.idx <- match(names(columns), sampleNames(BSseq))
+    sp <- split(columns.idx, columns)
     M <- do.call(cbind, lapply(sp, function(ss) {
         rowSums(getBSseq(BSseq, "M")[, ss, drop = FALSE])
     }))
-    colnames(M) <- names(sp)
     Cov <- do.call(cbind, lapply(sp, function(ss) {
         rowSums(getBSseq(BSseq, "M")[, ss, drop = FALSE])
     }))
-    colnames(Cov) <- names(sp)
-    BSseq(gr = getBSseq(BSseq, "gr"), M = M, Cov = Cov)
+    BSseq(gr = getBSseq(BSseq, "gr"), M = M, Cov = Cov, sampleNames = names(sp))
 }
 
 chrSelectBSseq <- function(BSseq, seqnames = NULL, order = FALSE) {
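Editorial note (not part of the diff): a hedged sketch of the changed collapseBSseq() semantics, using the illustrative bsseqData sample names. The values of 'columns' are the new, collapsed sample names; its names (if any) pick existing columns, and the result now carries names(sp) as its sampleNames.

    ## Illustrative only.
    columns <- c(C1 = "cancer", C2 = "cancer", C3 = "cancer",
                 N1 = "normal", N2 = "normal", N3 = "normal")
    collapsed <- collapseBSseq(BS.cancer.ex, columns)
    sampleNames(collapsed)   # "cancer" "normal"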
...
@@ -192,59 +191,4 @@ getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
     return(outMatrix)
 }
 
-getStats <- function(BSseqTstat, regions = NULL, column = c("tstat.corrected", "tstat")) {
-    stopifnot(is(BSseqTstat, "BSseqTstat"))
-    column <- match.arg(column)
-    if(class(regions) == "data.frame")
-        regions <- data.frame2GRanges(regions)
-    if(is.null(regions))
-        return(BSseqTstat@stats)
-    stopifnot(is(regions, "GenomicRanges"))
-    ov <- findOverlaps(BSseqTstat, regions)
-    ov.sp <- split(queryHits(ov), subjectHits(ov))
-    getRegionStats <- function(idx) {
-        mat <- BSseqTstat@stats[idx,, drop=FALSE]
-        meanDiff <- mean(mat[, "group1.means"] - mat[, "group2.means"])
-        group1.mean <- mean(mat[, "group1.means"])
-        group2.mean <- mean(mat[, "group2.means"])
-        tstat.sd <- mean(mat[, "tstat.sd"])
-        areaStat <- sum(mat[, column])
-        maxStat <- max(mat[, column])
-        c(meanDiff, group1.mean, group2.mean, tstat.sd, areaStat, maxStat)
-    }
-    stats <- lapply(ov.sp, getRegionStats)
-    out <- matrix(NA, ncol = 6, nrow = length(regions))
-    out[as.integer(names(stats)),] <- do.call(rbind, stats)
-    colnames(out) <- c("meanDiff", "group1.mean", "group2.mean", "tstat.sd", "areaStat", "maxStat")
-    out <- as.data.frame(out)
-    out
-}
-
-
-summary.BSseqTstat <- function(object, ...) {
-    quant <- quantile(getStats(object)[, "tstat.corrected"],
-                      prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999))
-    quant <- t(t(quant))
-    colnames(quant) <- "quantiles"
-    out <- list(quantiles = quant)
-    class(out) <- "summary.BSseqTstat"
-    out
-}
 
-print.summary.BSseqTstat <- function(x, ...) {
-    print(as.matrix(x$quantiles))
-}
-
-plot.BSseqTstat <- function(x, y, ...) {
-    tstat <- getStats(x)[, "tstat"]
-    plot(density(tstat), xlim = c(-10,10), col = "blue", main = "")
-    if("tstat.corrected" %in% colnames(getStats(x))) {
-        tstat.cor <- getStats(x)[, "tstat.corrected"]
-        lines(density(tstat.cor), col = "black")
-        legend("topleft", legend = c("uncorrected", "corrected"), lty = c(1,1),
-               col = c("blue", "black"))
-    } else {
-        legend("topleft", legend = c("uncorrected"), lty = 1,
-               col = c("blue"))
-    }
-}
...
@@ -1,24 +1,29 @@
 dmrFinder <- function(BSseqTstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
-                      maxGap = 300, column = c("tstat.corrected", "tstat"), verbose = TRUE) {
-    column <- match.arg(column)
-    if(! column %in% colnames(BSseqTstat@stats))
-        stop("'column' needs to be a column of 'BSseqTstat@stats'")
+                      maxGap = 300, stat = "tstat.corrected", verbose = TRUE) {
+    subverbose <- max(as.integer(verbose) - 1L, 0L)
+    if(! stat %in% colnames(BSseqTstat@stats))
+        stop("'stat' needs to be a column of 'BSseqTstat@stats'")
     if(is.null(cutoff))
-        cutoff <- quantile(BSseqTstat@stats[, column], qcutoff)
-    direction <- as.integer(BSseqTstat@stats[, column] >= cutoff[2])
-    direction[BSseqTstat@stats[, column] <= cutoff[1]] <- -1L
+        cutoff <- quantile(BSseqTstat@stats[, stat], qcutoff)
+    if(length(cutoff) == 1)
+        cutoff <- c(-cutoff, cutoff)
+    direction <- as.integer(BSseqTstat@stats[, stat] >= cutoff[2])
+    direction[BSseqTstat@stats[, stat] <= cutoff[1]] <- -1L
     direction[is.na(direction)] <- 0L
     chrs <- as.character(seqnames(BSseqTstat@gr))
     positions <- start(BSseqTstat)
-    regions <- bsseq:::regionFinder3(direction, chr = chrs, pos = positions, maxGap = maxGap, verbose = verbose)
-    if(verbose) cat("creating dmr data.frame\n")
+    regions <- bsseq:::regionFinder3(direction, chr = chrs, pos = positions,
+                                     maxGap = maxGap, verbose = subverbose)
+    if(verbose) cat("[dmrFinder] creating dmr data.frame\n")
     regions <- do.call(rbind, regions)
     rownames(regions) <- NULL
-    stats <- getStats(BSseqTstat, regions, column = column)
-    regions <- cbind(regions, stats)
-    regions$direction <- ifelse(regions$meanDiff > 0, "hyper", "hypo")
     regions$width <- regions$end - regions$start + 1
     regions$invdensity <- regions$width / regions$n
+    stats <- getStats(BSseqTstat, regions, column = stat)
+    regions <- cbind(regions, stats)
+    if(stat %in% c("tstat.corrected", "tstat")) {
+        regions$direction <- ifelse(regions$meanDiff > 0, "hyper", "hypo")
+    }
     regions <- regions[order(abs(regions$areaStat), decreasing = TRUE),]
     regions
 }
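Editorial note (not part of the diff): a hedged usage sketch of the renamed 'stat' argument ('tstat' as above; cutoff values and displayed column names are illustrative, the region columns coming from regionFinder3 plus the getStats() output). A single cutoff is now symmetrized, and quantile cutoffs remain the default.

    dmrs0 <- dmrFinder(tstat, cutoff = 4.6)               # expands to c(-4.6, 4.6)
    dmrs  <- dmrFinder(tstat, qcutoff = c(0.025, 0.975))  # quantile-based cutoff
    head(dmrs[, c("chr", "start", "end", "n", "width", "areaStat", "direction")])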
...
@@ -46,11 +51,11 @@ clusterMaker <- function(chr, pos, order.it=TRUE, maxGap=300){
 
 regionFinder3 <- function(x, chr, positions, keep, maxGap = 300, verbose = TRUE) {
     getSegments3 <- function(x, cid, verbose = TRUE){
-        if(verbose) cat("segmenting\n")
+        if(verbose) cat("[regionFinder3] segmenting\n")
         segments <- cumsum(c(1, diff(x) != 0)) +
             cumsum(c(1, diff(cid) != 0))
         names(segments) <- NULL
-        if(verbose) cat("splitting\n")
+        if(verbose) cat("[regionFinder3] splitting\n")
         out <- list(up = split(which(x > 0), segments[x > 0]),
                     down = split(which(x < 0), segments[x < 0]),
                     nochange = split(which(x == 0), segments[x == 0]))
...
@@ -2,16 +2,21 @@ fisherTests <- function(BSseq, group1, group2, lookup = NULL,
                         returnLookup = TRUE, mc.cores = 1,
                         verbose = TRUE) {
     stopifnot(is(BSseq, "BSseq"))
-    if(is.integer(group1)) {
-        stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq))
-        group1 <- sampleNames(BSseq)[group1]
+    if(is.character(group1)) {
+        stopifnot(all(group1 %in% sampleNames(BSseq)))
+        group1 <- match(group1, sampleNames(BSseq))
     }
-    if(is.integer(group2)) {
+    if(is.numeric(group1)) {
+        stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq))
+    } else stop("problems with argument 'group1'")
+    if(is.character(group2)) {
+        stopifnot(all(group2 %in% sampleNames(BSseq)))
+        group2 <- match(group2, sampleNames(BSseq))
+    }
+    if(is.numeric(group2)) {
         stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq))
-        group2 <- sampleNames(BSseq)[group2]
-    }
+    } else stop("problems with argument 'group2'")
     stopifnot(length(intersect(group1, group2)) == 0)
-    sampleNames <- c(group1, group2)
     if(is.null(lookup)) {
         oldkeys <- ""
     } else {
...
@@ -19,7 +24,7 @@ fisherTests <- function(BSseq, group1, group2, lookup = NULL,
                   setequal(colnames(lookup), c("p.value", "log2OR")))
         oldkeys <- rownames(lookup)
     }
-    if(verbose) cat(" setup ... ")
+    if(verbose) cat("[fisherTests] setup ... ")
     stime <- system.time({
         M1 <- rowSums(getCoverage(BSseq, type = "M")[, group1, drop = FALSE])
         M2 <- rowSums(getCoverage(BSseq, type = "M")[, group2, drop = FALSE])
...
@@ -34,16 +39,16 @@ fisherTests <- function(BSseq, group1, group2, lookup = NULL,
         expand2[,"U1"] <- expand[,"Cov1"] - expand[,"M1"]
         expand2[,"U2"] <- expand[,"Cov2"] - expand[,"M2"]
     })[3]
-    if(verbose) cat(round(stime,1), "secs\n")
-    if(verbose) cat(sprintf(" performing %d tests (using %d cores) ... ",
+    if(verbose) cat(round(stime,1), "done in secs\n")
+    if(verbose) cat(sprintf("[fisherTests] performing %d tests (using %d cores) ... ",
                             nrow(expand2), mc.cores))
     stime <- system.time({
         tests <- mclapply(1:nrow(expand2), function(ii) {
             unlist(fisher.test(matrix(expand2[ii,], ncol = 2))[c("p.value", "estimate")])
         }, mc.cores = mc.cores)
     })[3]
-    if(verbose) cat(round(stime,1), "secs\n")
-    if(verbose) cat(" finalizing ... ")
+    if(verbose) cat(round(stime,1), "done in secs\n")
+    if(verbose) cat("[fisherTests] finalizing ... ")
     stime <- system.time({
         tests <- do.call(rbind, tests)
         tests[,2] <- log2(tests[,2])
...
@@ -55,7 +60,7 @@ fisherTests <- function(BSseq, group1, group2, lookup = NULL,
 
 
     })[3]
-    if(verbose) cat(round(stime,1), "secs\n")
+    if(verbose) cat(round(stime,1), "done in secs\n")
     if(returnLookup)
         return(list(results = results, lookup = lookup))
     else
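Editorial note (not part of the diff): a minimal sketch of fisherTests() with the new group handling (groups may now be given as sample names or numeric indices). Objects and column names are assumptions based on the code above, not verified output.

    ft <- fisherTests(BS.cancer.ex, group1 = c("C1", "C2", "C3"),
                      group2 = c("N1", "N2", "N3"), mc.cores = 1)
    head(ft$results)   # per-locus p.value and log2OR
    ## ft$lookup caches already-tested count configurations and can be passed
    ## back via the 'lookup' argument to avoid recomputation.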
...
@@ -14,10 +14,12 @@ plotAnnoTrack <- function(gr, annoTrack) {
 }
 
 plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addRegions = NULL,
-                            annoTrack = NULL, col = NULL, lty = NULL, lwd = NULL, BSseqTstat = NULL,
+                            annoTrack = NULL, col = NULL, lty = NULL, lwd = NULL,
+                            BSseqTstat = NULL, stat = "tstat.corrected", stat.col = "black",
+                            stat.lwd = 1, stat.lty = 1, stat.ylim = c(-8,8),
                             mainWithWidth = TRUE, regionCol = alpha("red", 0.1), addTicks = TRUE,
                             addPoints = FALSE, pointsMinCov = 5, highlightMain = FALSE, verbose = TRUE) {
-    cat("preprocessing ...")
+    cat("[plotManyRegions] preprocessing ...")
     if(!is.null(regions)) {
         if(is(regions, "data.frame"))
             gr <- data.frame2GRanges(regions, keepColumns = FALSE)
...
@@ -39,9 +41,11 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg
     main <- rep(main, length = length(gr))
     cat("done\n")
     for(ii in seq(along = gr)) {
-        if(verbose) cat(sprintf("plotting region %d (out of %d)\n", ii, nrow(regions)))
+        if(verbose) cat(sprintf("[plotManyRegions] plotting region %d (out of %d)\n", ii, nrow(regions)))
        plotRegion(BSseq = BSseq, region = regions[ii,], extend = extend,
                   col = col, lty = lty, lwd = lwd, main = main[ii], BSseqTstat = BSseqTstat,
+                  stat = stat, stat.col = stat.col, stat.lwd = stat.lwd,
+                  stat.lty = stat.lty, stat.ylim = stat.ylim,
                   addRegions = addRegions, regionCol = regionCol, mainWithWidth = mainWithWidth,
                   annoTrack = annoTrack, addTicks = addTicks, addPoints = addPoints,
                   pointsMinCov = pointsMinCov, highlightMain = highlightMain)
...
@@ -173,30 +177,33 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg
                     regionCol = regionCol, highlightMain = highlightMain)
 
     if(addPoints) {
-        sapply(sampleNames(BSseq), function(samp) {
-            abline(v = positions[rawPs[, samp] > 0.1], col = "grey80", lty = 1)
+        sapply(1:ncol(BSseq), function(sampIdx) {
+            abline(v = positions[rawPs[, sampIdx] > 0.1], col = "grey80", lty = 1)
         })
     } # This adds vertical grey lines so we can see where points are plotted
 
-    sapply(sampleNames(BSseq), function(samp) {
-        .bsPlotLines(positions, smoothPs[, samp], col = colEtc$col[samp],
-                     lty = colEtc$lty[samp], lwd = colEtc$lwd[samp],
+    sapply(1:ncol(BSseq), function(sampIdx) {
+        .bsPlotLines(positions, smoothPs[, sampIdx], col = colEtc$col[sampIdx],
+                     lty = colEtc$lty[sampIdx], lwd = colEtc$lwd[sampIdx],
                      plotRange = c(start(gr), end(gr)))
     })
 
     if(addPoints) {
-        sapply(sampleNames(BSseq), function(samp) {
-            .bsPlotPoints(positions, rawPs[, samp], coverage[, samp],
-                          col = colEtc$col[samp], pointsMinCov = pointsMinCov)
+        sapply(1:ncol(BSseq), function(sampIdx) {
+            .bsPlotPoints(positions, rawPs[, sampIdx], coverage[, sampIdx],
+                          col = colEtc$col[sampIdx], pointsMinCov = pointsMinCov)
         })
     }
 }
 
 
 plotRegion <- function(BSseq, region = NULL, extend = 0, main = "", addRegions = NULL, annoTrack = NULL,
-                       col = NULL, lty = NULL, lwd = NULL, BSseqTstat = NULL, mainWithWidth = TRUE,
-                       regionCol = alpha("red", 0.1), addTicks = TRUE, addPoints = FALSE,
-                       pointsMinCov = 5, highlightMain = FALSE) {
+                       col = NULL, lty = NULL, lwd = NULL,
+                       BSseqTstat = NULL, stat = "tstat.corrected", stat.col = "black",
+                       stat.lwd = 1, stat.lty = 1, stat.ylim = c(-8,8),
+                       mainWithWidth = TRUE,
+                       regionCol = alpha("red", 0.1), addTicks = TRUE, addPoints = FALSE,
+                       pointsMinCov = 5, highlightMain = FALSE) {
 
     opar <- par(mar = c(0,4.1,0,0), oma = c(5,0,4,2), mfrow = c(1,1))
     on.exit(par(opar))
...
@@ -214,16 +221,14 @@ plotRegion <- function(BSseq, region = NULL, extend = 0, main = "", addRegions =
     if(!is.null(BSseqTstat)) {
         if(!is.null(BSseqTstat))
             BSseqTstat <- subsetByOverlaps(BSseqTstat, gr)
-        plot(positions[1], 0.5, type = "n", xaxt = "n", yaxt = "n",
-             ylim = c(-8,8), xlim = plotRange, xlab = "", ylab = "t-stat")
+        plot(start(gr), 0.5, type = "n", xaxt = "n", yaxt = "n",
+             ylim = stat.ylim, xlim = c(start(gr), end(gr)), xlab = "", ylab = "t-stat")
         axis(side = 2, at = c(-5,0,5))
         abline(h = 0, col = "grey60")
-        bsseq:::.bsPlotLines(start(BSseqTstat), BSseqTstat@stats[, "tstat"],
-                             lty = 1, plotRange = plotRange, col = "red", lwd = 1)
-        bsseq:::.bsPlotLines(start(BSseqTstat), BSseqTstat@stats[, "tstat.corrected"],
-                             lty = 2, plotRange = plotRange, col = "red", lwd = 1)
-        bsseq:::.bsPlotLines(start(BSseqTstat), 100*BSseqTstat@stats[, "tstat.sd"],
-                             lty = 2, plotRange = plotRange, col = "blue", lwd = 1)
+        mapply(function(stat, col, lty, lwd) {
+            bsseq:::.bsPlotLines(start(BSseqTstat), BSseqTstat@stats[, stat],
+                                 lty = lty, plotRange = c(start(gr), end(gr)), col = col, lwd = lwd)
+        }, stat = stat, col = stat.col, lty = stat.lty, lwd = stat.lwd)
     }
 
     if(!is.null(annoTrack))
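Editorial note (not part of the diff): a sketch of the new statistics-track arguments ('BS.fit', 'tstat' and 'dmrs' as in the earlier sketches; values are illustrative). Each element of 'stat' is drawn as its own line by the mapply() call above, so vectors of colours and line types can be supplied.

    plotRegion(BS.fit, region = dmrs[1, ], extend = 5000,
               addRegions = dmrs, BSseqTstat = tstat,
               stat = c("tstat", "tstat.corrected"),
               stat.col = c("red", "black"), stat.lty = c(1, 2),
               stat.ylim = c(-8, 8))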
...
@@ -11,7 +11,7 @@ read.bismark <- function(files, sampleNames, rmZeroCov = FALSE, verbose = TRUE){
     names(idxes) <- sampleNames
     allOut <- lapply(idxes, function(ii){
         if (verbose) {
-            cat(sprintf("Reading file '%s' ... ", files[ii]))
+            cat(sprintf("[read.bismark] Reading file '%s' ... ", files[ii]))
         }
         stime <- system.time({
             raw <- read.bismarkFileRaw(thisfile = files[ii])
...
@@ -22,18 +22,18 @@ read.bismark <- function(files, sampleNames, rmZeroCov = FALSE, verbose = TRUE){
                         sampleNames = sampleNames[ii], rmZeroCov = rmZeroCov)
         })[3]
         if (verbose) {
-            cat(sprintf("in %.1f secs\n", stime))
+            cat(sprintf("done in %.1f secs\n", stime))
         }
         out
     })
     if (verbose) {
-        cat(sprintf("Joining samples ... "))
+        cat(sprintf("[read.bismark] Joining samples ... "))
     }
     stime <- system.time({
         allOut <- combineList(allOut)
     })[3]
     if (verbose) {
-        cat(sprintf("in %.1f secs\n", stime))
+        cat(sprintf("done in %.1f secs\n", stime))
     }
     allOut
 }
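Editorial note (not part of the diff): a hedged sketch of read.bismark() with the re-prefixed progress messages; the file paths and sample names are hypothetical.

    bs <- read.bismark(files = c("chr21.sample1.cov", "chr21.sample2.cov"),
                       sampleNames = c("sample1", "sample2"),
                       rmZeroCov = TRUE, verbose = TRUE)
    bs   # a BSseq object combining both samples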
...
@@ -79,7 +79,7 @@ read.umtab2.chr <- function(files, sampleNames = NULL,
     if(!keepFilt) {
         what0 <- what0[!grepl("^filt", what0)]
     }
-    if(verbose) cat("reading", files[1], "\n")
+    if(verbose) cat("[read.umtab2.chr] reading", files[1], "\n")
     scanPars <- list(sep = "", quote = "", quiet = TRUE, skip = 1,
                      what = what0, na.strings = c("NA", "?"))
     intab <- do.call(scan, c(scanPars, file = files[1]))
...
@@ -158,7 +158,7 @@ read.umtab2.chr2 <- function(files, sampleNames = NULL,
     what0[["ref"]] <- character(0)
     if(stranded)
         what0[["strand"]] <- character(0)
-    if(verbose) cat("parsing all samples first\n")
+    if(verbose) cat("[read.umtab2.chr] parsing all samples first\n")
     scanPars <- list(sep = "", quote = "", quiet = TRUE, skip = 1,
                      what = what0, na.strings = c("NA", "?"))
     ## First we get the locations of everything in all files
...
@@ -202,7 +202,7 @@ read.umtab2.chr2 <- function(files, sampleNames = NULL,
     stopifnot(all(c(keepM, keepU) %in% c(allMnames, allUnames)))
 
     for(ii in seq_along(files)) {
-        if(verbose) cat("reading", files[ii], "\n")
+        if(verbose) cat("[read.umtab2.chr] reading", files[ii], "\n")
         intab <- do.call(scan, c(scanPars, file = files[ii]))
         if(length(intab[[1]]) == 0)
             next
...
@@ -290,7 +290,7 @@ read.umtab.chr <- function(files, sampleNames = NULL,
         return(list())
     line1 <- strsplit(line1, "\t")[[1]]
     stopifnot(all(line1 == columnHeaders))
-    if(verbose) cat("reading", files[1], "\n")
+    if(verbose) cat("[read.umtab.chr] reading", files[1], "\n")
     scanPars <- list(sep = "", quote = "", quiet = TRUE, skip = 1,
                      what = what0, na.strings = c("NA", "?"))
     intab <- do.call(scan, c(scanPars, file = files[1]))
...
@@ -318,7 +318,7 @@ read.umtab.chr <- function(files, sampleNames = NULL,
     Ucy[,1] <- intab[["Ucy"]]
     csums[,1] <- as.integer(sapply(intab[c(allMnames, allUnames)], sum))
     for(ii in seq_along(files[-1]) + 1) {
-        if(verbose) cat("reading", files[ii], "\n")
+        if(verbose) cat("[read.umtab2] reading", files[ii], "\n")
         intab <- do.call(scan, c(scanPars, file = files[ii]))
         if(length(intab[[1]]) == 0)
             next
...
@@ -373,7 +373,7 @@ read.bsmoothDirRaw <- function(dir, seqnames = NULL, keepCycle = FALSE, keepFilt
     what0[grep("^filt", names(what0))] <- replicate(length(grep("^filt", names(what0))), NULL)
     outList <- lapply(allChrFiles, function(thisfile) {
         if(verbose)
-            cat(sprintf("Reading '%s'\n", thisfile))
+            cat(sprintf("[read.bsmoothDirRaw] Reading '%s'\n", thisfile))
         if(grepl("\\.gz$", thisfile))
             con <- gzfile(thisfile)
         else
...
@@ -436,7 +436,7 @@ read.bsmooth <- function(dirs, sampleNames = NULL, seqnames = NULL, returnRaw =
     idxes <- seq_along(dirs)
     names(idxes) <- sampleNames
     allOut <- lapply(idxes, function(ii) {
-        if(verbose) cat(sprintf("Reading dir '%s' ... ", dirs[ii]))
+        if(verbose) cat(sprintf("[read.bsmooth] Reading dir '%s' ... ", dirs[ii]))
         stime <- system.time({
             if(returnRaw) {
                 out <- read.bsmoothDirRaw(dir = dirs[ii], seqnames = seqnames, keepCycle = TRUE,
...
@@ -447,15 +447,15 @@ read.bsmooth <- function(dirs, sampleNames = NULL, seqnames = NULL, returnRaw =
                 out <- sampleRawToBSseq(raw, qualityCutoff = qualityCutoff, rmZeroCov, sampleName = sampleNames[ii])
             }
         })[3]
-        if(verbose) cat(sprintf("in %.1f secs\n", stime))
+        if(verbose) cat(sprintf("done in %.1f secs\n", stime))
         out
     })
     if(!returnRaw) {
-        if(verbose) cat(sprintf("Joining samples ... "))
+        if(verbose) cat(sprintf("[read.bsmooth] Joining samples ... "))
         stime <- system.time({
             allOut <- combineList(allOut)
         })[3]
-        if(verbose) cat(sprintf("in %.1f secs\n", stime))
+        if(verbose) cat(sprintf("done in %.1f secs\n", stime))
     }
     allOut
 }
...
@@ -464,9 +464,9 @@ parsingPipeline <- function(dirs, qualityCutoff = 20, outDir, seqnames = NULL,
                             subdir = "ev_bt2_cpg_tab", timing = FALSE) {
     if(!all(file.exists(dirs)))
         stop("not all directories in 'dirs' exists.")
-    cat("Parsing all files.\n")
+    cat("[parsingPipeline] Parsing all files.\n")
     oneDir <- function(dir) {
-        cat(" dir", basename(dir), ": ")
+        cat("[parsingPipeline] dir", basename(dir), ": ")
         base <- basename(dir)
         cat("parsing, ")
         stime <- system.time({
...
@@ -476,7 +476,7 @@ parsingPipeline <- function(dirs, qualityCutoff = 20, outDir, seqnames = NULL,
             assign(paste0(base, ".raw"), raw)
         })[3]
         if(timing) {
-            cat(sprintf("\nIn %.1f secs\n", stime))
+            cat(sprintf("\ndone in %.1f secs\n", stime))
             print(gc())
         }
         cat("saving, ")
...
@@ -489,7 +489,7 @@ parsingPipeline <- function(dirs, qualityCutoff = 20, outDir, seqnames = NULL,
             seqlevels(bsseq)[seqlevels(bsseq) == "chrgi|9626243|ref|NC_001416.1|"] <- "chrLambda"
         })[3]
         if(timing) {
-            cat(sprintf("\nIn %.1f secs\n", stime))
+            cat(sprintf("\ndone in %.1f secs\n", stime))
             print(gc())
         }
         cat("ordering, ")
...
@@ -10,14 +10,17 @@
     \item validObject(BSseq) has been extended to also check for
     sampleNames consistency.
     \item Fixed a bug related to validity checking.
+    \item Increased maxk from 10,000 to 50,000 in calls to locfit, to
+    allow fitting the model on genomes with unusually long chromosomes
+    (Thanks to Brian Herb for reporting).
     \item The class representation for class 'BSseq' has changed
     completely. The new class is build on 'SummarizedExperiment' from
     GenomicRanges instead of 'hasGRanges'. Use 'x <- updateObject(x)' to
     update serialized (saved) objects.
     \item Fixing a problem in orderBSseq related to chromosome names.
-    \item Increased maxk from 10,000 to 50,000 in calls to locfit, to
-    allowing fitting the model on genomes with unusally long chromosomes
-    (Thanks to Brian Herb for reporting).
+    \item Allowed user specification of maxk, with a default of 10,000
+    in BSmooth.
+    \item Many bugfixes made necessary by the new class representation.
   }
 }
 
...
@@ -20,8 +20,8 @@ test_BSseq <- function() {
     checkEquals(dim(BStest), c(3,3))
     checkEquals(nrow(BStest), 3)
     checkEquals(ncol(BStest), 3)
-    checkEquals(getCoverage(BStest, type = "M"), M)
-    checkEquals(getCoverage(BStest, type = "Cov"), M + 2)
+    checkEquals(getCoverage(BStest, type = "M"), unname(M))
+    checkEquals(getCoverage(BStest, type = "Cov"), unname(M + 2))
     checkEquals(sampleNames(BStest), colnames(M))
 
     BStest2 <- BSseq(pos = 3:1, chr = rep("chr1", 3), M = M[3:1,],
...
@@ -204,8 +204,8 @@ we will only keep CpGs where at least 2 cancer samples and at least 2 normal sam
 line breaks in Sweave)
 <<keepLoci>>=
 BS.cov <- getCoverage(BS.cancer.ex.fit)
-keepLoci.ex <- which(rowSums(BS.cov[, c("C1", "C2", "C3")] >= 2) >= 2 &
-                     rowSums(BS.cov[, c("N1", "N2", "N3")] >= 2) >= 2)
+keepLoci.ex <- which(rowSums(BS.cov[, BS.cancer.ex$Type == "cancer"] >= 2) >= 2 &
+                     rowSums(BS.cov[, BS.cancer.ex$Type == "normal"] >= 2) >= 2)
 length(keepLoci.ex)
 BS.cancer.ex.fit <- BS.cancer.ex.fit[keepLoci.ex,]
 @