git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/bsseq@67929 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,16 @@ |
1 |
+Package: bsseq |
|
2 |
+Version: 0.4.2 |
|
3 |
+Title: Analyze, manage and store bisulfite sequencing data |
|
4 |
+Description: Tools for analyzing and visualizing bisulfite sequencing data |
|
5 |
+Author: Kasper Daniel Hansen |
|
6 |
+Maintainer: Kasper Daniel Hansen <khansen@jhsph.edu> |
|
7 |
+Depends: R (>= 2.15), methods, BiocGenerics, IRanges, GenomicRanges, |
|
8 |
+ parallel, matrixStats |
|
9 |
+Imports: scales, stats, graphics, Biobase, locfit |
|
10 |
+Suggests: RUnit, bsseqData |
|
11 |
+Collate: hasGRanges.R BSseq_class.R BSseq_utils.R utils.R read.R |
|
12 |
+ BSmooth.R dmrFinder.R gof_stats.R plotting.R fisher.R fixes.R |
|
13 |
+License: Artistic-2.0 |
|
14 |
+LazyData: yes |
|
15 |
+biocViews: DNAMethylation |
|
16 |
+Packaged: 2012-07-10 04:15:29 UTC; khansen |
0 | 17 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,70 @@ |
1 |
+## |
|
2 |
+## Importing |
|
3 |
+## |
|
4 |
+ |
|
5 |
+import(methods) |
|
6 |
+importFrom(BiocGenerics, "anyDuplicated", "cbind", "colnames", |
|
7 |
+ "combine", "density", "intersect", "lapply", "ncol", |
|
8 |
+ "nrow", "order", "paste", "pmax", "pmin", "rbind", |
|
9 |
+ "Reduce", "rep.int", "rownames", "sapply", "setdiff", |
|
10 |
+ "strand", "strand<-", "union", "unique") |
|
11 |
+ |
|
12 |
+importFrom(stats, "approxfun", "fisher.test", "ppoints", |
|
13 |
+ "predict", "preplot", "qchisq", |
|
14 |
+ "qnorm", "qqplot", "qunif") |
|
15 |
+ |
|
16 |
+importFrom(graphics, "abline", "axis", "layout", "legend", "lines", |
|
17 |
+ "mtext", "par", "plot", "points", "rect", "rug") |
|
18 |
+ |
|
19 |
+import(parallel) |
|
20 |
+importFrom(locfit, "locfit", "lp") |
|
21 |
+importFrom(matrixStats, "rowSds", "rowVars") |
|
22 |
+import(IRanges) |
|
23 |
+import(GenomicRanges) |
|
24 |
+importFrom(scales, "alpha") |
|
25 |
+importClassesFrom(Biobase, "AnnotatedDataFrame") |
|
26 |
+importMethodsFrom(Biobase, "annotatedDataFrameFrom", |
|
27 |
+ "pData", "pData<-", |
|
28 |
+ "phenoData", "phenoData<-", |
|
29 |
+ "sampleNames", "sampleNames<-") |
|
30 |
+ |
|
31 |
+ |
|
32 |
+## |
|
33 |
+## Exporting |
|
34 |
+## |
|
35 |
+ |
|
36 |
+ |
|
37 |
+exportClasses("hasGRanges", |
|
38 |
+ "BSseq", |
|
39 |
+ "BSseqTstat", |
|
40 |
+ "matrixOrNULL") |
|
41 |
+ |
|
42 |
+exportMethods("[", "show", |
|
43 |
+ "seqnames", "seqnames<-", |
|
44 |
+ "seqlevels", "seqlevels<-", |
|
45 |
+ "seqlengths", "seqlengths<-", |
|
46 |
+ "start", "start<-", |
|
47 |
+ "end", "end<-", |
|
48 |
+ "width", "width<-", |
|
49 |
+ "strand", "strand<-", |
|
50 |
+ "granges", |
|
51 |
+ "dim", "nrow", "ncol", |
|
52 |
+ "sampleNames", "sampleNames<-", |
|
53 |
+ "phenoData", "phenoData<-", |
|
54 |
+ "pData", "pData<-", |
|
55 |
+ "findOverlaps", "subsetByOverlaps", |
|
56 |
+ "combine") |
|
57 |
+ |
|
58 |
+export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats", |
|
59 |
+ "collapseBSseq", "orderBSseq", "hasBeenSmoothed", "chrSelectBSseq", |
|
60 |
+ "BSmooth", "BSmooth.tstat", "dmrFinder", "fisherTests", |
|
61 |
+ "plotRegion", "plotManyRegions", |
|
62 |
+ "read.umtab", "read.umtab2", "read.bsmooth", |
|
63 |
+ "poissonGoodnessOfFit", "binomialGoodnessOfFit", |
|
64 |
+ "data.frame2GRanges", "BSseqTstat") |
|
65 |
+ |
|
66 |
+S3method("print", "chisqGoodnessOfFit") |
|
67 |
+S3method("plot", "chisqGoodnessOfFit") |
|
68 |
+S3method("print", "summary.BSseqTstat") |
|
69 |
+S3method("summary", "BSseqTstat") |
|
70 |
+S3method("plot", "BSseqTstat") |
0 | 71 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,269 @@ |
1 |
+makeClusters <- function(hasGRanges, maxGap = 10^8, mc.cores = 1) { |
|
2 |
+ chrOrder <- as.character(runValue(seqnames(hasGRanges))) |
|
3 |
+ if(anyDuplicated(chrOrder)) |
|
4 |
+ stop("argument 'hasGRanges' is not properly order") |
|
5 |
+ grBase <- granges(hasGRanges) |
|
6 |
+ clusters <- reduce(resize(grBase, width = 2*maxGap + 1, fix = "center")) |
|
7 |
+ start(clusters) <- pmax(rep(1, length(clusters)), start(clusters)) |
|
8 |
+ clusters.sp <- split(clusters, seqnames(clusters)) |
|
9 |
+ stopifnot(all(sapply(clusters.sp, function(cluster.gr) { |
|
10 |
+ if(length(cluster.gr) <= 1) return(TRUE) |
|
11 |
+ all(start(cluster.gr)[-length(cluster.gr)] < end(cluster.gr)[-1]) |
|
12 |
+ }))) # are the clusters ordered within the chromosome? This is probably guranteed |
|
13 |
+ clusters <- Reduce(c, clusters.sp[chrOrder]) |
|
14 |
+ stopifnot(all(chrOrder == runValue(seqnames(clusters)))) |
|
15 |
+ ov <- findOverlaps_mclapply(grBase, clusters, mc.cores = mc.cores) |
|
16 |
+ clusterIdx <- split(as.matrix(ov)[,1], as.matrix(ov)[,2]) |
|
17 |
+ names(clusterIdx) <- NULL |
|
18 |
+ clusterIdx |
|
19 |
+} |
|
20 |
+ |
|
21 |
+ |
|
22 |
+BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sample", "chromosome"), |
|
23 |
+ mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE, verbose = TRUE) { |
|
24 |
+ smooth <- function(idxes, sname) { |
|
25 |
+ ## Assuming that idxes is a set of indexes into the BSseq object |
|
26 |
+ ## sname is a single character |
|
27 |
+ if(verbose >= 3) |
|
28 |
+ cat(sprintf(" beginning: sample:%s, chr:%s, nLoci:%s\n", |
|
29 |
+ sname, as.character(seqnames(BSseq)[idxes[1]]), |
|
30 |
+ length(idxes))) |
|
31 |
+ Cov <- getCoverage(BSseq, type = "Cov")[idxes, sname] |
|
32 |
+ M <- getCoverage(BSseq, type = "M")[idxes, sname] |
|
33 |
+ pos <- start(BSseq)[idxes] |
|
34 |
+ stopifnot(all(diff(pos) > 0)) |
|
35 |
+ wh <- which(Cov != 0) |
|
36 |
+ nn <- ns / length(wh) |
|
37 |
+ if(length(wh) <= ns) { |
|
38 |
+ if(keep.se) |
|
39 |
+ se.coef <- rep(NA_real_, length(Cov)) |
|
40 |
+ else |
|
41 |
+ se.coef <- NULL |
|
42 |
+ return(list(coef = rep(NA_real_, length(Cov)), |
|
43 |
+ se.coef = se.coef, |
|
44 |
+ trans = NULL, h = h, nn = nn)) |
|
45 |
+ } |
|
46 |
+ sdata <- data.frame(pos = pos[wh], |
|
47 |
+ M = pmin(pmax(M[wh], 0.01), Cov[wh] - 0.01), |
|
48 |
+ Cov = Cov[wh]) |
|
49 |
+ fit <- locfit(M ~ lp(pos, nn = nn, h = h), data = sdata, |
|
50 |
+ weights = Cov, family = "binomial", maxk = 10000) |
|
51 |
+ pp <- preplot(fit, where = "data", band = "local", |
|
52 |
+ newdata = data.frame(pos = pos)) |
|
53 |
+ if(keep.se) { |
|
54 |
+ se.coef <- pp$se.fit |
|
55 |
+ } else { |
|
56 |
+ se.coef <- NULL |
|
57 |
+ } |
|
58 |
+ if(verbose >= 2) |
|
59 |
+ cat(sprintf(" sample:%s, chr:%s, nLoci:%s, nCoveredLoci:%s, done\n", |
|
60 |
+ sname, as.character(seqnames(BSseq)[idxes[1]]), |
|
61 |
+ length(idxes), nrow(sdata))) |
|
62 |
+ return(list(coef = pp$fit, se.coef = se.coef, |
|
63 |
+ trans = pp$trans, h = h, nn = nn)) |
|
64 |
+ } |
|
65 |
+ stopifnot(class(BSseq) == "BSseq") |
|
66 |
+ parallelBy <- match.arg(parallelBy) |
|
67 |
+ if(verbose) cat("preprocessing ... ") |
|
68 |
+ stime <- system.time({ |
|
69 |
+ clusterIdx <- makeClusters(BSseq, maxGap = maxGap, mc.cores = mc.cores) |
|
70 |
+ })[3] |
|
71 |
+ if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
|
72 |
+ |
|
73 |
+ sampleNames <- sampleNames(BSseq) |
|
74 |
+ names(sampleNames) <- sampleNames |
|
75 |
+ |
|
76 |
+ stimeAll <- system.time({ |
|
77 |
+ switch(parallelBy, "sample" = { |
|
78 |
+ if(verbose) cat(sprintf("smoothing by 'sample' (mc.cores = %d, mc.preschedule = %s)\n", |
|
79 |
+ mc.cores, mc.preschedule)) |
|
80 |
+ out <- mclapply(sampleNames, function(nam) { |
|
81 |
+ stime <- system.time({ |
|
82 |
+ tmp <- lapply(clusterIdx, function(jj) { |
|
83 |
+ smooth(idxes = jj, sname = nam) |
|
84 |
+ }) |
|
85 |
+ coef <- do.call(c, lapply(tmp, function(xx) xx$coef)) |
|
86 |
+ se.coef <- do.call(c, lapply(tmp, function(xx) xx$se.coef)) |
|
87 |
+ })[3] |
|
88 |
+ if(verbose) { |
|
89 |
+ cat(sprintf(" sample %s (out of %d), done in %.1f sec\n", |
|
90 |
+ nam, length(sampleNames), stime)) |
|
91 |
+ } |
|
92 |
+ return(list(coef = coef, se.coef = se.coef)) |
|
93 |
+ }, mc.preschedule = mc.preschedule, mc.cores = mc.cores) |
|
94 |
+ coef <- do.call(cbind, lapply(out, function(xx) xx$coef)) |
|
95 |
+ se.coef <- do.call(cbind, lapply(out, function(xx) xx$se.coef)) |
|
96 |
+ }, "chromosome" = { |
|
97 |
+ if(verbose) cat(sprintf("smoothing by 'chromosome' (mc.cores = %d, mc.preschedule = %s)\n", |
|
98 |
+ mc.cores, mc.preschedule)) |
|
99 |
+ out <- mclapply(1:length(clusterIdx), function(ii) { |
|
100 |
+ stime <- system.time({ |
|
101 |
+ tmp <- lapply(sampleNames, function(nam) { |
|
102 |
+ smooth(idxes = clusterIdx[[ii]], sname = nam) |
|
103 |
+ }) |
|
104 |
+ coef <- do.call(cbind, lapply(tmp, function(xx) xx$coef)) |
|
105 |
+ se.coef <- do.call(cbind, lapply(tmp, function(xx) xx$se.coef)) |
|
106 |
+ })[3] |
|
107 |
+ if(verbose) |
|
108 |
+ cat(sprintf(" chr idx %d (out of %d), done in %.1f sec\n", |
|
109 |
+ ii, length(clusterIdx), stime)) |
|
110 |
+ return(list(coef = coef, se.coef = se.coef)) |
|
111 |
+ }, mc.preschedule = mc.preschedule, mc.cores = mc.cores) |
|
112 |
+ coef <- do.call(rbind, lapply(out, function(xx) xx$coef)) |
|
113 |
+ se.coef <- do.call(rbind, lapply(out, function(xx) xx$se.coef)) |
|
114 |
+ }) |
|
115 |
+ })[3] |
|
116 |
+ if(verbose) |
|
117 |
+ cat(sprintf("smoothing done in %.1f sec\n", stimeAll)) |
|
118 |
+ |
|
119 |
+ rownames(coef) <- NULL |
|
120 |
+ colnames(coef) <- sampleNames(BSseq) |
|
121 |
+ if(!is.null(se.coef)) { |
|
122 |
+ rownames(se.coef) <- NULL |
|
123 |
+ colnames(se.coef) <- sampleNames(BSseq) |
|
124 |
+ } |
|
125 |
+ |
|
126 |
+ mytrans <- function(x) { |
|
127 |
+ y <- x |
|
128 |
+ ix <- which(x < 0) |
|
129 |
+ ix2 <- which(x > 0) |
|
130 |
+ y[ix] <- exp(x[ix])/(1 + exp(x[ix])) |
|
131 |
+ y[ix2] <- 1/(1 + exp(-x[ix2])) |
|
132 |
+ y |
|
133 |
+ } |
|
134 |
+ environment(mytrans) <- baseenv() |
|
135 |
+ BSseq@trans <- mytrans |
|
136 |
+ BSseq@coef <- coef |
|
137 |
+ BSseq@se.coef <- se.coef |
|
138 |
+ parameters <- list(smoothText = sprintf("BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap), |
|
139 |
+ ns = ns, h = h, maxGap = maxGap) |
|
140 |
+ BSseq@parameters <- parameters |
|
141 |
+ BSseq |
|
142 |
+} |
|
143 |
+ |
|
144 |
+ |
|
145 |
+BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paired", "group2"), |
|
146 |
+ local.correct = TRUE, maxGap = NULL, qSd = 0.75, k = 101, mc.cores = 1, verbose = TRUE){ |
|
147 |
+ smoothSd <- function(Sds, k) { |
|
148 |
+ k0 <- floor(k/2) |
|
149 |
+ if(all(is.na(Sds))) return(Sds) |
|
150 |
+ thresSD <- pmax(Sds, quantile(Sds, qSd, na.rm = TRUE), na.rm = TRUE) |
|
151 |
+ addSD <- rep(median(Sds, na.rm = TRUE), k0) |
|
152 |
+ sSds <- as.vector(runmean(Rle(c(addSD, thresSD, addSD)), k = k)) |
|
153 |
+ sSds |
|
154 |
+ } |
|
155 |
+ compute.correction <- function(idx, qSd = 0.75) { |
|
156 |
+ xx <- start(BSseq)[idx] |
|
157 |
+ yy <- tstat[idx] |
|
158 |
+ suppressWarnings({ |
|
159 |
+ drange <- diff(range(xx, na.rm = TRUE)) |
|
160 |
+ }) |
|
161 |
+ if(drange <= 25000) |
|
162 |
+ return(yy) |
|
163 |
+ tstat.function <- approxfun(xx, yy) |
|
164 |
+ xx.reg <- seq(from = min(xx), to = max(xx), by = 2000) |
|
165 |
+ yy.reg <- tstat.function(xx.reg) |
|
166 |
+ fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0), |
|
167 |
+ family = "huber", maxk = 10000) |
|
168 |
+ correction <- predict(fit, newdata = data.frame(xx.reg = xx)) |
|
169 |
+ yy - correction |
|
170 |
+ } |
|
171 |
+ |
|
172 |
+ estimate.var <- match.arg(estimate.var) |
|
173 |
+ stopifnot(is(BSseq, "BSseq")) |
|
174 |
+ stopifnot(hasBeenSmoothed(BSseq)) |
|
175 |
+ if(is.numeric(group1)) { |
|
176 |
+ stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq)) |
|
177 |
+ group1 <- sampleNames(BSseq)[group1] |
|
178 |
+ } |
|
179 |
+ if(is.numeric(group2)) { |
|
180 |
+ stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq)) |
|
181 |
+ group2 <- sampleNames(BSseq)[group2] |
|
182 |
+ } |
|
183 |
+ stopifnot(length(intersect(group1, group2)) == 0) |
|
184 |
+ if(estimate.var == "paired") |
|
185 |
+ stopifnot(length(group1) == length(group2)) |
|
186 |
+ stopifnot(all(c(group1, group2) %in% sampleNames(BSseq))) |
|
187 |
+ |
|
188 |
+ if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0)) |
|
189 |
+ warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first") |
|
190 |
+ |
|
191 |
+ if(is.null(maxGap)) |
|
192 |
+ maxGap <- BSseq@parameters$maxGap |
|
193 |
+ |
|
194 |
+ if(verbose) cat("preprocessing ... ") |
|
195 |
+ stime <- system.time({ |
|
196 |
+ clusterIdx <- makeClusters(BSseq, maxGap = maxGap, mc.cores = mc.cores) |
|
197 |
+ })[3] |
|
198 |
+ if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
|
199 |
+ |
|
200 |
+ if(verbose) cat("computing stats within groups ... ") |
|
201 |
+ stime <- system.time({ |
|
202 |
+ allPs <- getMeth(BSseq, type = "smooth", what = "perBase", |
|
203 |
+ confint = FALSE)[, c(group1, group2)] |
|
204 |
+ group1.means <- rowMeans(allPs[, group1, drop = FALSE], na.rm = TRUE) |
|
205 |
+ group2.means <- rowMeans(allPs[, group2, drop = FALSE], na.rm = TRUE) |
|
206 |
+ })[3] |
|
207 |
+ if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
|
208 |
+ |
|
209 |
+ if(verbose) cat("computing stats across groups ... ") |
|
210 |
+ stime <- system.time({ |
|
211 |
+ switch(estimate.var, |
|
212 |
+ "group2" = { |
|
213 |
+ rawSds <- rowSds(allPs[, group2, drop = FALSE], na.rm = TRUE) |
|
214 |
+ smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) { |
|
215 |
+ smoothSd(rawSds[idx], k = k) |
|
216 |
+ }, mc.cores = mc.cores)) |
|
217 |
+ scale <- sqrt(1/length(group1) + 1/length(group2)) |
|
218 |
+ tstat.sd <- smoothSds * scale |
|
219 |
+ }, |
|
220 |
+ "same" = { |
|
221 |
+ rawSds <- sqrt( ((length(group1) - 1) * rowVars(allPs[, group1, drop = FALSE]) + |
|
222 |
+ (length(group2) - 1) * rowVars(allPs[, group2, drop = FALSE])) / |
|
223 |
+ (length(group1) + length(group2) - 2)) |
|
224 |
+ smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) { |
|
225 |
+ smoothSd(rawSds[idx], k = k) |
|
226 |
+ }, mc.cores = mc.cores)) |
|
227 |
+ scale <- sqrt(1/length(group1) + 1/length(group2)) |
|
228 |
+ tstat.sd <- smoothSds * scale |
|
229 |
+ }, |
|
230 |
+ "paired" = { |
|
231 |
+ rawSds <- rowSds(allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE]) |
|
232 |
+ smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) { |
|
233 |
+ smoothSd(rawSds[idx], k = k) |
|
234 |
+ }, mc.cores = mc.cores)) |
|
235 |
+ scale <- sqrt(1/length(group1)) |
|
236 |
+ tstat.sd <- smoothSds * scale |
|
237 |
+ }) |
|
238 |
+ tstat <- (group1.means - group2.means) / tstat.sd |
|
239 |
+ is.na(tstat)[tstat.sd == 0] <- TRUE |
|
240 |
+ if(local.correct) { |
|
241 |
+ tstat.corrected <- do.call(c, mclapply(clusterIdx, |
|
242 |
+ compute.correction, qSd = qSd, |
|
243 |
+ mc.cores = mc.cores)) |
|
244 |
+ } |
|
245 |
+ })[3] |
|
246 |
+ if(verbose) cat(sprintf("done in %.1f sec\n", stime)) |
|
247 |
+ |
|
248 |
+ if(local.correct) { |
|
249 |
+ stats <- cbind(rawSds, tstat.sd, group2.means, group1.means, |
|
250 |
+ tstat, tstat.corrected) |
|
251 |
+ colnames(stats) <- c("rawSds", "tstat.sd", |
|
252 |
+ "group2.means", "group1.means", "tstat", |
|
253 |
+ "tstat.corrected") |
|
254 |
+ |
|
255 |
+ } else { |
|
256 |
+ stats <- cbind(rawSds, tstat.sd, group2.means, group1.means, |
|
257 |
+ tstat) |
|
258 |
+ colnames(stats) <- c("rawSds", "tstat.sd", |
|
259 |
+ "group2.means", "group1.means", "tstat") |
|
260 |
+ } |
|
261 |
+ |
|
262 |
+ parameters <- c(BSseq@parameters, |
|
263 |
+ list(tstatText = sprintf("BSmooth.tstat (local.correct = %s, maxGap = %d)", |
|
264 |
+ local.correct, maxGap), |
|
265 |
+ group1 = group1, group2 = group2, k = k, qSd = qSd, |
|
266 |
+ local.correct = local.correct, maxGap = maxGap)) |
|
267 |
+ out <- BSseqTstat(gr = granges(BSseq), stats = stats, parameters = parameters) |
|
268 |
+ out |
|
269 |
+} |
0 | 270 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,434 @@ |
1 |
+setClassUnion("matrixOrNULL", c("matrix", "NULL")) |
|
2 |
+ |
|
3 |
+setClass("BSseq", contains = "hasGRanges", |
|
4 |
+ representation(M = "matrix", |
|
5 |
+ Cov = "matrix", |
|
6 |
+ coef = "matrixOrNULL", |
|
7 |
+ se.coef = "matrixOrNULL", |
|
8 |
+ trans = "function", |
|
9 |
+ parameters = "list", |
|
10 |
+ phenoData = "AnnotatedDataFrame")) |
|
11 |
+ |
|
12 |
+ |
|
13 |
+setClass("BSseqTstat", contains = "hasGRanges", |
|
14 |
+ representation(stats = "matrix", |
|
15 |
+ parameters = "list") |
|
16 |
+ ) |
|
17 |
+ |
|
18 |
+ |
|
19 |
+## |
|
20 |
+## Simple accessor and replacement functions |
|
21 |
+## |
|
22 |
+ |
|
23 |
+setValidity("BSseq", function(object) { |
|
24 |
+ msg <- NULL |
|
25 |
+ nCpGs <- length(object@gr) |
|
26 |
+ nSamples <- ncol(object@M) |
|
27 |
+ if(nCpGs != nrow(object@M) || |
|
28 |
+ nCpGs != nrow(object@Cov)) |
|
29 |
+ msg <- c(msg, "length of the 'gr' slot is not equal to the nrows the 'M' or 'Cov' slots") |
|
30 |
+ if(nSamples != ncol(object@Cov)) |
|
31 |
+ msg <- c(msg, "ncols of the 'M' slot is not equal to the ncols of the 'Cov' slot") |
|
32 |
+ if(nSamples != nrow(object@phenoData)) |
|
33 |
+ msg <- c(msg, "the 'phenoData' slot does not match the dimensions of the 'M' slot") |
|
34 |
+ if(any(object@M < 0)) |
|
35 |
+ msg <- c(msg, "the 'M' slot has negative entries") |
|
36 |
+ if(any(object@Cov < 0)) |
|
37 |
+ msg <- c(msg, "the 'Cov' slot has negative entries") |
|
38 |
+ if(any(object@M > object@Cov)) |
|
39 |
+ msg <- c(msg, "the 'M' slot has at least one entry bigger than the 'Cov' slot") |
|
40 |
+ if(!is.null(object@coef) && (nCpGs != nrow(object@coef) || |
|
41 |
+ nSamples != ncol(object@coef))) |
|
42 |
+ msg <- c(msg, "nrows and/or ncols of the 'coef' slot does not have the same dimensions as the 'M' slot") |
|
43 |
+ if(!is.null(object@se.coef) && (nCpGs != nrow(object@se.coef) || |
|
44 |
+ nSamples != ncol(object@se.coef))) |
|
45 |
+ msg <- c(msg, "nrows and/or ncols of the 'se.coef' slot does not have the same dimensions as the 'M' slot") |
|
46 |
+ if(!is.null(rownames(object@M)) || |
|
47 |
+ !is.null(rownames(object@Cov)) || |
|
48 |
+ !is.null(rownames(object@coef)) || |
|
49 |
+ !is.null(rownames(object@se.coef))) warning("unnecessary rownames in object") |
|
50 |
+ ## FIXME: check samplenames |
|
51 |
+ if(is.null(msg)) TRUE else msg |
|
52 |
+}) |
|
53 |
+ |
|
54 |
+ |
|
55 |
+ |
|
56 |
+setValidity("BSseqTstat", function(object) { |
|
57 |
+ msg <- NULL |
|
58 |
+ if(length(object@gr) != nrow(object@stats)) |
|
59 |
+ msg <- c(msg, "length of 'gr' is different from the number of rows of 'stats'") |
|
60 |
+ if(is.null(msg)) TRUE else msg |
|
61 |
+}) |
|
62 |
+ |
|
63 |
+ |
|
64 |
+setMethod("show", signature(object = "BSseq"), |
|
65 |
+ function(object) { |
|
66 |
+ cat("An object of type 'BSseq' with\n") |
|
67 |
+ cat(" ", nrow(object), "methylation loci\n") |
|
68 |
+ cat(" ", ncol(object), "samples\n") |
|
69 |
+ if(hasBeenSmoothed(object)) { |
|
70 |
+ cat("has been smoothed with\n") |
|
71 |
+ cat(" ", object@parameters$smoothText, "\n") |
|
72 |
+ } else { |
|
73 |
+ cat("has not been smoothed\n") |
|
74 |
+ } |
|
75 |
+ }) |
|
76 |
+ |
|
77 |
+setMethod("show", signature(object = "BSseqTstat"), |
|
78 |
+ function(object) { |
|
79 |
+ cat("An object of type 'BSseqTstat' with\n") |
|
80 |
+ cat(" ", length(object), "methylation loci\n") |
|
81 |
+ cat("based on smoothed data:\n") |
|
82 |
+ cat(" ", object@parameters$smoothText, "\n") |
|
83 |
+ cat("with parameters\n") |
|
84 |
+ cat(" ", object@parameters$tstatText, "\n") |
|
85 |
+ }) |
|
86 |
+ |
|
87 |
+ |
|
88 |
+## |
|
89 |
+## eSet stuff |
|
90 |
+## |
|
91 |
+ |
|
92 |
+setMethod("dim", "BSseq", function(x) { |
|
93 |
+ dim(x@M) |
|
94 |
+}) |
|
95 |
+setMethod("nrow", "BSseq", function(x) { |
|
96 |
+ nrow(x@M) |
|
97 |
+}) |
|
98 |
+setMethod("ncol", "BSseq", function(x) { |
|
99 |
+ ncol(x@M) |
|
100 |
+}) |
|
101 |
+ |
|
102 |
+setMethod("phenoData", "BSseq", function(object) { |
|
103 |
+ object@phenoData |
|
104 |
+}) |
|
105 |
+setReplaceMethod("phenoData", signature = signature( |
|
106 |
+ object = "BSseq", |
|
107 |
+ value = "AnnotatedDataFrame"), |
|
108 |
+ function(object, value) { |
|
109 |
+ object@phenoData <- value |
|
110 |
+ object |
|
111 |
+ }) |
|
112 |
+ |
|
113 |
+setMethod("pData", "BSseq", function(object) { |
|
114 |
+ pData(phenoData(object)) |
|
115 |
+}) |
|
116 |
+setReplaceMethod("pData", signature = signature( |
|
117 |
+ object = "BSseq", |
|
118 |
+ value = "data.frame"), |
|
119 |
+ function(object, value) { |
|
120 |
+ pd <- object@phenoData |
|
121 |
+ pData(pd) <- value |
|
122 |
+ object@phenoData <- pd |
|
123 |
+ object |
|
124 |
+ }) |
|
125 |
+ |
|
126 |
+setMethod("sampleNames", "BSseq", function(object) { |
|
127 |
+ sampleNames(phenoData(object)) |
|
128 |
+}) |
|
129 |
+ |
|
130 |
+setReplaceMethod("sampleNames", |
|
131 |
+ signature(object = "BSseq", value = "ANY"), |
|
132 |
+ function(object, value) { |
|
133 |
+ sampleNames(phenoData(object)) <- value |
|
134 |
+ colnames(object@M) <- value |
|
135 |
+ colnames(object@Cov) <- value |
|
136 |
+ if(!is.null(object@coef)) |
|
137 |
+ colnames(object@coef) <- value |
|
138 |
+ if(!is.null(object@se.coef)) |
|
139 |
+ colnames(object@se.coef) <- value |
|
140 |
+ object |
|
141 |
+ }) |
|
142 |
+ |
|
143 |
+hasBeenSmoothed <- function(BSseq) { |
|
144 |
+ !is.null(BSseq@coef) |
|
145 |
+} |
|
146 |
+ |
|
147 |
+## |
|
148 |
+## Subsetting and combining |
|
149 |
+## |
|
150 |
+ |
|
151 |
+setMethod("[", "BSseq", function(x, i, j, ...) { |
|
152 |
+ if(missing(drop)) |
|
153 |
+ drop <- FALSE |
|
154 |
+ if(missing(i) && missing(j)) |
|
155 |
+ stop("need [i,j] for subsetting") |
|
156 |
+ if(!missing(j)) |
|
157 |
+ x@phenoData <- phenoData(x)[j,, ..., drop = drop] |
|
158 |
+ else |
|
159 |
+ x@phenoData <- phenoData(x) |
|
160 |
+ if(missing(i)) { |
|
161 |
+ x@M <- getBSseq(x, "M")[, j, drop = FALSE] |
|
162 |
+ x@Cov <- getBSseq(x, "Cov")[, j, drop = FALSE] |
|
163 |
+ x@coef <- getBSseq(x, "coef")[, j, drop = FALSE] |
|
164 |
+ x@se.coef <- getBSseq(x, "se.coef")[, j, drop = FALSE] |
|
165 |
+ x@gr <- granges(x) |
|
166 |
+ } else { |
|
167 |
+ x@M <- getBSseq(x, "M")[i, j, drop = FALSE] |
|
168 |
+ x@Cov <- getBSseq(x, "Cov")[i, j, drop = FALSE] |
|
169 |
+ x@coef <- getBSseq(x, "coef")[i, j, drop = FALSE] |
|
170 |
+ x@se.coef <- getBSseq(x, "se.coef")[i, j, drop = FALSE] |
|
171 |
+ x@gr <- granges(x)[i] |
|
172 |
+ } |
|
173 |
+ x |
|
174 |
+}) |
|
175 |
+ |
|
176 |
+setMethod("[", "BSseqTstat", function(x, i, ...) { |
|
177 |
+ if(missing(i)) |
|
178 |
+ stop("need [i] for subsetting") |
|
179 |
+ if(missing(i)) |
|
180 |
+ return(x) |
|
181 |
+ x@gr <- x@gr[i] |
|
182 |
+ x@stats <- x@stats[i,, drop = FALSE] |
|
183 |
+ x |
|
184 |
+}) |
|
185 |
+ |
|
186 |
+setMethod("combine", signature(x = "BSseq", y = "BSseq"), function(x, y, ...) { |
|
187 |
+ ## All of this assumes that we are place x and y "next" to each other, |
|
188 |
+ ## ie. we are not combining the same set of samples sequenced at different times |
|
189 |
+ if (class(x) != class(y)) |
|
190 |
+ stop(paste("objects must be the same class, but are ", |
|
191 |
+ class(x), ", ", class(y), sep="")) |
|
192 |
+ if(hasBeenSmoothed(x) && hasBeenSmoothed(y) && !all.equal(x@trans, y@trans)) |
|
193 |
+ stop("'x' and 'y' need to be smoothed on the same scale") |
|
194 |
+ phenoData <- combine(phenoData(x), phenoData(y)) |
|
195 |
+ gr <- reduce(c(granges(x), granges(y)), min.gapwidth = 0L) |
|
196 |
+ mm.x <- as.matrix(findOverlaps(gr, granges(x))) |
|
197 |
+ mm.y <- as.matrix(findOverlaps(gr, granges(y))) |
|
198 |
+ sampleNames <- c(sampleNames(x), sampleNames(y)) |
|
199 |
+ ## FIXME: there is no check that the two sampleNames are disjoint. |
|
200 |
+ M <- Cov <- matrix(0, nrow = length(gr), ncol = length(sampleNames)) |
|
201 |
+ colnames(M) <- colnames(Cov) <- sampleNames |
|
202 |
+ M[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "M")[mm.x[,2],] |
|
203 |
+ M[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "M")[mm.y[,2],] |
|
204 |
+ Cov[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "Cov")[mm.x[,2],] |
|
205 |
+ Cov[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "Cov")[mm.y[,2],] |
|
206 |
+ if(!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) { |
|
207 |
+ coef <- NULL |
|
208 |
+ se.coef <- NULL |
|
209 |
+ trans <- NULL |
|
210 |
+ } else { |
|
211 |
+ trans <- x@trans |
|
212 |
+ coef <- matrix(0, nrow = length(gr), ncol = length(sampleNames)) |
|
213 |
+ colnames(coef) <- sampleNames(phenoData) |
|
214 |
+ if(hasBeenSmoothed(x)) |
|
215 |
+ coef[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "coef")[mm.x[,2],] |
|
216 |
+ if(hasBeenSmoothed(y)) |
|
217 |
+ coef[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "coef")[mm.y[,2],] |
|
218 |
+ if(is.null(getBSseq(x, "se.coef")) && is.null(getBSseq(x, "se.coef"))) |
|
219 |
+ se.coef <- NULL |
|
220 |
+ else { |
|
221 |
+ se.coef <- matrix(0, nrow = length(gr), ncol = length(sampleNames)) |
|
222 |
+ colnames(se.coef) <- sampleNames(phenoData) |
|
223 |
+ if(!is.null(getBSseq(x, "se.coef"))) |
|
224 |
+ se.coef[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "se.coef")[mm.x[,2],] |
|
225 |
+ if(!is.null(getBSseq(y, "se.coef"))) |
|
226 |
+ se.coef[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "se.coef")[mm.y[,2],] |
|
227 |
+ } |
|
228 |
+ } |
|
229 |
+ BSseq(gr = gr, M = M, Cov = Cov, coef = coef, se.coef = se.coef, trans = trans, rmZeroCov = FALSE) |
|
230 |
+}) |
|
231 |
+ |
|
232 |
+getBSseq <- function(BSseq, type = c("Cov", "M", "gr", "coef", "se.coef", "trans", "parameters")) { |
|
233 |
+ type <- match.arg(type) |
|
234 |
+ switch(type, |
|
235 |
+ "Cov" = { |
|
236 |
+ return(BSseq@Cov) |
|
237 |
+ }, |
|
238 |
+ "M" = { |
|
239 |
+ return(BSseq@M) |
|
240 |
+ }, |
|
241 |
+ "gr" = { |
|
242 |
+ return(BSseq@gr) |
|
243 |
+ }, |
|
244 |
+ "coef" = { |
|
245 |
+ return(BSseq@coef) |
|
246 |
+ }, |
|
247 |
+ "se.coef" = { |
|
248 |
+ return(BSseq@se.coef) |
|
249 |
+ }, |
|
250 |
+ "trans" = { |
|
251 |
+ return(BSseq@trans) |
|
252 |
+ }, |
|
253 |
+ "parameters" = { |
|
254 |
+ return(BSseq@parameters) |
|
255 |
+ }) |
|
256 |
+} |
|
257 |
+ |
|
258 |
+ |
|
259 |
+ |
|
260 |
+BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL, |
|
261 |
+ trans = NULL, parameters = NULL, phenoData = NULL, |
|
262 |
+ gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL, |
|
263 |
+ rmZeroCov = FALSE) { |
|
264 |
+ if(is.null(gr)) { |
|
265 |
+ if(is.null(pos) || is.null(chr)) |
|
266 |
+ stop("Need pos and chr") |
|
267 |
+ gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1)) |
|
268 |
+ } |
|
269 |
+ if(!is(gr, "GRanges")) |
|
270 |
+ stop("'gr' needs to be a GRanges") |
|
271 |
+ if(any(width(gr)) != 1) |
|
272 |
+ stop("'gr' needs to have widths of 1") |
|
273 |
+ if(is.null(M) || is.null(Cov)) |
|
274 |
+ stop("Need M and Cov") |
|
275 |
+ if(!is.matrix(M)) |
|
276 |
+ stop("'M' needs to be a matrix") |
|
277 |
+ if(!is.matrix(Cov)) |
|
278 |
+ stop("'Cov' needs to be a matrix") |
|
279 |
+ if(length(gr) != nrow(M) || |
|
280 |
+ length(gr) != nrow(Cov) || |
|
281 |
+ ncol(Cov) != ncol(M)) |
|
282 |
+ stop("'gr', 'M' and 'Cov' need to have similar dimensions") |
|
283 |
+ if(!is.null(rownames(M))) |
|
284 |
+ rownames(M) <- NULL |
|
285 |
+ if(!is.null(rownames(Cov))) |
|
286 |
+ rownames(Cov) <- NULL |
|
287 |
+ if(!is.null(names(gr))) |
|
288 |
+ names(gr) <- NULL |
|
289 |
+ ## deal with sampleNames |
|
290 |
+ if(is.null(sampleNames) && !is.null(phenoData) && !is.null(sampleNames(phenoData))) |
|
291 |
+ sampleNames <- sampleNames(phenoData) |
|
292 |
+ if(is.null(sampleNames) && !is.null(colnames(M))) |
|
293 |
+ sampleNames <- colnames(M) |
|
294 |
+ if(is.null(sampleNames) && !is.null(colnames(Cov))) |
|
295 |
+ sampleNames <- colnames(Cov) |
|
296 |
+ if(is.null(sampleNames)) |
|
297 |
+ sampleNames <- paste("V", 1:ncol(M), sep = "") |
|
298 |
+ if(length(unique(sampleNames)) != ncol(M)) |
|
299 |
+ stop("sampleNames need to be unique and of the right length.") |
|
300 |
+ ## check that 0 <= M <= Cov and remove positions with Cov = 0 |
|
301 |
+ if(any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) || |
|
302 |
+ any(is.infinite(Cov))) |
|
303 |
+ stop("'M' and 'Cov' may not contain NA or infinite values and 0 <= M <= Cov") |
|
304 |
+ if(rmZeroCov) { |
|
305 |
+ wh <- which(rowSums(Cov) == 0) |
|
306 |
+ if(length(wh) > 0) { |
|
307 |
+ gr <- gr[-wh] |
|
308 |
+ M <- M[-wh,,drop = FALSE] |
|
309 |
+ Cov <- Cov[-wh,,drop = FALSE] |
|
310 |
+ } |
|
311 |
+ } |
|
312 |
+ grR <- reduce(gr, min.gapwidth = 0L) |
|
313 |
+ if(!identical(grR, gr)) { |
|
314 |
+ ## Now we either need to re-order or collapse or both |
|
315 |
+ mm <- as.matrix(findOverlaps(grR, gr)) |
|
316 |
+ mm <- mm[order(mm[,1]),] |
|
317 |
+ if(length(grR) == length(gr)) { |
|
318 |
+ ## only re-ordering is necessary |
|
319 |
+ gr <- grR |
|
320 |
+ M <- M[mm[,2],,drop = FALSE] |
|
321 |
+ Cov <- Cov[mm[,2],,drop = FALSE] |
|
322 |
+ if(!is.null(coef)) |
|
323 |
+ coef <- coef[mm[,2],,drop = FALSE] |
|
324 |
+ if(!is.null(se.coef)) |
|
325 |
+ se.coef <- se.coef[mm[,2],, drop = FALSE] |
|
326 |
+ } else { |
|
327 |
+ warning("multiple positions, collapsing BSseq object\n") |
|
328 |
+ if(!is.null(coef) || !is.null(se.coef)) |
|
329 |
+ stop("Cannot collapse when 'coef' or 'se.coef' are present") |
|
330 |
+ gr <- grR |
|
331 |
+ sp <- split(mm[,2], mm[,1])[as.character(1:length(grR))] |
|
332 |
+ names(sp) <- NULL |
|
333 |
+ M <- do.call(rbind, lapply(sp, function(ii) colSums(M[ii,, drop = FALSE]))) |
|
334 |
+ Cov <- do.call(rbind, lapply(sp, function(ii) colSums(Cov[ii,, drop = FALSE]))) |
|
335 |
+ } |
|
336 |
+ } |
|
337 |
+ if(is.null(colnames(M)) || any(sampleNames != colnames(M))) |
|
338 |
+ colnames(M) <- sampleNames |
|
339 |
+ if(is.null(colnames(Cov)) || any(sampleNames != colnames(Cov))) |
|
340 |
+ colnames(Cov) <- sampleNames |
|
341 |
+ if(is.null(phenoData)) |
|
342 |
+ phenoData <- annotatedDataFrameFrom(M, byrow = FALSE) |
|
343 |
+ BSseq <- new("BSseq", gr = gr, M = M, Cov = Cov, phenoData = phenoData) |
|
344 |
+ if(!is.null(coef)) { |
|
345 |
+ if(!is.matrix(coef) || |
|
346 |
+ nrow(coef) != nrow(BSseq) || |
|
347 |
+ ncol(coef) != ncol(BSseq)) |
|
348 |
+ stop("'coef' does not have the right dimensions") |
|
349 |
+ if(is.null(colnames(coef)) || any(sampleNames != colnames(coef))) |
|
350 |
+ colnames(coef) <- sampleNames |
|
351 |
+ if(!is.null(rownames(coef))) |
|
352 |
+ rownames(coef) <- NULL |
|
353 |
+ BSseq@coef <- coef |
|
354 |
+ } |
|
355 |
+ if(!is.null(se.coef)) { |
|
356 |
+ if(!is.matrix(se.coef) || |
|
357 |
+ nrow(se.coef) != nrow(BSseq) || |
|
358 |
+ ncol(se.coef) != ncol(BSseq)) |
|
359 |
+ stop("'se.coef' does not have the right dimensions") |
|
360 |
+ if(is.null(colnames(se.coef)) || any(sampleNames != colnames(se.coef))) |
|
361 |
+ colnames(se.coef) <- sampleNames |
|
362 |
+ if(!is.null(rownames(se.coef))) |
|
363 |
+ rownames(se.coef) <- NULL |
|
364 |
+ BSseq@se.coef <- se.coef |
|
365 |
+ } |
|
366 |
+ if(is.function(trans)) |
|
367 |
+ BSseq@trans <- trans |
|
368 |
+ if(is.list(parameters)) |
|
369 |
+ BSseq@parameters <- parameters |
|
370 |
+ BSseq |
|
371 |
+} |
|
372 |
+ |
|
373 |
+ |
|
374 |
+BSseqTstat <- function(gr = NULL, stats = NULL, parameters = NULL) { |
|
375 |
+ out <- new("BSseqTstat") |
|
376 |
+ out@gr <- gr |
|
377 |
+ out@stats <- stats |
|
378 |
+ out@parameters <- parameters |
|
379 |
+ out |
|
380 |
+} |
|
381 |
+ |
|
382 |
+ |
|
383 |
+ |
|
384 |
+ |
|
385 |
+ |
|
386 |
+## getD <- function(data, sample1, sample2, type = c("raw", "fit"), |
|
387 |
+## addPositions = FALSE, alpha = 0.95) { |
|
388 |
+## d.conf <- function(p1, n1, p2, n2) { |
|
389 |
+## ## Method 10 from Newcombe (Stat in Med, 1998) |
|
390 |
+## getRoots <- function(p, n) { |
|
391 |
+## mat <- cbind(p^2, -(2*p + z^2/n), 1+z^2/n) |
|
392 |
+## roots <- t(Re(apply(mat, 1, polyroot))) |
|
393 |
+## roots |
|
394 |
+## } |
|
395 |
+## roots1 <- roots2 <- matrix(NA_real_, nrow = length(p1), ncol = 2) |
|
396 |
+## idx <- !is.na(p1) |
|
397 |
+## roots1[idx,] <- getRoots(p1[idx], n1[idx]) |
|
398 |
+## idx <- !is.na(p2) |
|
399 |
+## roots2[idx,] <- getRoots(p2[idx], n2[idx]) |
|
400 |
+## d <- p1 - p2 |
|
401 |
+## suppressWarnings({ |
|
402 |
+## lower <- d - z * sqrt(roots1[,1]*(1-roots1[,1])/n1 + roots2[,2]*(1-roots2[,2])/n2) |
|
403 |
+## upper <- d + z * sqrt(roots1[,2]*(1-roots1[,2])/n1 + roots2[,1]*(1-roots2[,1])/n2) |
|
404 |
+## }) |
|
405 |
+## return(data.frame(d = d, lower = lower, upper = upper)) |
|
406 |
+## } |
|
407 |
+## type <- match.arg(type) |
|
408 |
+## z <- abs(qnorm((1-alpha)/2, mean = 0, sd = 1)) |
|
409 |
+## switch(type, |
|
410 |
+## raw = { |
|
411 |
+## conf <- d.conf(p1 = data$M[, sample1] / data$Cov[, sample1], |
|
412 |
+## n1 = data$Cov[, sample1], |
|
413 |
+## p2 = data$M[, sample2] / data$Cov[, sample2], |
|
414 |
+## n2 = data$Cov[, sample2]) |
|
415 |
+## out <- conf |
|
416 |
+## }, |
|
417 |
+## fit = { |
|
418 |
+## p1 <- data$trans(data$coef[, sample1]) |
|
419 |
+## p2 <- data$trans(data$coef[, sample2]) |
|
420 |
+## d <- p1 - p2 |
|
421 |
+## ## Based on the delta method |
|
422 |
+## se.d <- sqrt((data$se.coef[, sample1] * p1 * (1-p1))^2 + |
|
423 |
+## (data$se.coef[, sample2] * p2 * (1-p2))^2) |
|
424 |
+ |
|
425 |
+## lower <- d - z * se.d |
|
426 |
+## upper <- d + z * se.d |
|
427 |
+## out <- data.frame(d = d, lower = lower, upper = upper) |
|
428 |
+## }) |
|
429 |
+## if(addPositions) { |
|
430 |
+## out$pos <- start(data$gr) |
|
431 |
+## out$chr <- seqnames(data$gr) |
|
432 |
+## } |
|
433 |
+## out |
|
434 |
+## } |
0 | 435 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,243 @@ |
1 |
+collapseBSseq <- function(BSseq, columns) { |
|
2 |
+ ## columns is a vector of new names, names(columns) is sampleNames or empty |
|
3 |
+ if(is.null(names(columns)) && length(columns) != ncol(BSseq)) |
|
4 |
+ stop("if `columns' does not have names, it needs to be of the same length as `BSseq` has columns (samples)") |
|
5 |
+ if(!is.null(names(columns)) && !all(names(columns) %in% sampleNames(BSseq))) |
|
6 |
+ stop("if `columns` has names, they need to be sampleNames(BSseq)") |
|
7 |
+ if(is.null(names(columns))) { |
|
8 |
+ sp <- split(1:ncol(BSseq), columns) |
|
9 |
+ } else { |
|
10 |
+ sp <- split(names(columns), columns) |
|
11 |
+ } |
|
12 |
+ M <- do.call(cbind, lapply(sp, function(ss) { |
|
13 |
+ rowSums(BSseq@M[, ss, drop = FALSE]) |
|
14 |
+ })) |
|
15 |
+ colnames(M) <- names(sp) |
|
16 |
+ Cov <- do.call(cbind, lapply(sp, function(ss) { |
|
17 |
+ rowSums(BSseq@Cov[, ss, drop = FALSE]) |
|
18 |
+ })) |
|
19 |
+ colnames(Cov) <- names(sp) |
|
20 |
+ BSseq(gr = BSseq@gr, M = M, Cov = Cov) |
|
21 |
+} |
|
22 |
+ |
|
23 |
+chrSelectBSseq <- function(BSseq, seqnames = NULL, order = FALSE) { |
|
24 |
+ gr <- GRanges(seqnames = seqnames, |
|
25 |
+ ranges = IRanges(start = rep(1, length(seqnames)), |
|
26 |
+ end = rep(10^9, length(seqnames)))) |
|
27 |
+ BSseq <- subsetByOverlaps(BSseq, gr) |
|
28 |
+ seqlevels(BSseq) <- seqlevels(BSseq)[seqlevels(BSseq) %in% seqnames] |
|
29 |
+ if(order) |
|
30 |
+ BSseq <- orderBSseq(BSseq, seqOrder = seqnames) |
|
31 |
+ BSseq |
|
32 |
+} |
|
33 |
+ |
|
34 |
+ |
|
35 |
+orderBSseq <- function(BSseq, seqOrder = NULL) { |
|
36 |
+ splitNames <- splitRanges(seqnames(BSseq)) |
|
37 |
+ if(is.null(seqOrder)) |
|
38 |
+ seqOrder <- names(splitNames) |
|
39 |
+ else |
|
40 |
+ stopifnot(all(seqOrder %in% names(splitNames))) |
|
41 |
+ splitOd <- lapply(seqOrder, function(nam) { |
|
42 |
+ seqRanges <- seqselect(ranges(granges(BSseq)), splitNames[[nam]]) |
|
43 |
+ as.integer(unlist(splitNames[[nam]])[order(start(seqRanges))]) |
|
44 |
+ }) |
|
45 |
+ BSseq <- BSseq[do.call(c, splitOd)] |
|
46 |
+ seqlevels(BSseq) <- seqOrder |
|
47 |
+ BSseq |
|
48 |
+} |
|
49 |
+ |
|
50 |
+ |
|
51 |
+getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"), |
|
52 |
+ what = c("perBase", "perRegion"), confint = FALSE, alpha = 0.95) { |
|
53 |
+ p.conf <- function(p, n, alpha) { |
|
54 |
+ z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1)) |
|
55 |
+ upper <- (p + z^2/(2*n) + z*sqrt( (p*(1-p) + z^2/(4*n)) / n)) / |
|
56 |
+ (1+z^2/n) |
|
57 |
+ lower <- (p + z^2/(2*n) - z*sqrt( (p*(1-p) + z^2/(4*n)) / n)) / |
|
58 |
+ (1+z^2/n) |
|
59 |
+ return(list(meth = p, lower = lower, upper = upper)) |
|
60 |
+ } |
|
61 |
+ stopifnot(is(BSseq, "BSseq")) |
|
62 |
+ type <- match.arg(type) |
|
63 |
+ what <- match.arg(what) |
|
64 |
+ z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1)) |
|
65 |
+ if(is.null(regions) && type == "smooth") { |
|
66 |
+ meth <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef")) |
|
67 |
+ if(confint) { |
|
68 |
+ upper <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef") + |
|
69 |
+ z * getBSseq(BSseq, type = "se.coef")) |
|
70 |
+ lower <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef") - |
|
71 |
+ z * getBSseq(BSseq, type = "se.coef")) |
|
72 |
+ return(list(meth = meth, lower = lower, upper = upper)) |
|
73 |
+ } else { |
|
74 |
+ return(meth) |
|
75 |
+ } |
|
76 |
+ } |
|
77 |
+ if(is.null(regions) && type == "raw") { |
|
78 |
+ meth <- getBSseq(BSseq, type = "M") / getBSseq(BSseq, type = "Cov") |
|
79 |
+ if(confint) { |
|
80 |
+ return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"), alpha = alpha)) |
|
81 |
+ } else { |
|
82 |
+ return(meth) |
|
83 |
+ } |
|
84 |
+ } |
|
85 |
+ ## At this point, regions have been specified |
|
86 |
+ if(class(regions) == "data.frame") |
|
87 |
+ regions <- data.frame2GRanges(regions) |
|
88 |
+ stopifnot(is(regions, "GenomicRanges")) |
|
89 |
+ if(confint) stop("'confint = TRUE' is not supported by 'getMeth' when regions is given") |
|
90 |
+ grBSseq <- granges(BSseq) |
|
91 |
+ mm <- as.matrix(findOverlaps(regions, grBSseq)) |
|
92 |
+ mmsplit <- split(mm[,2], mm[,1]) |
|
93 |
+ if(what == "perBase") { |
|
94 |
+ if(type == "smooth") { |
|
95 |
+ out <- lapply(mmsplit, function(xx) { |
|
96 |
+ getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef")[xx,,drop = FALSE]) |
|
97 |
+ }) |
|
98 |
+ } |
|
99 |
+ if(type == "raw") { |
|
100 |
+ out <- lapply(mmsplit, function(xx) { |
|
101 |
+ getBSseq(BSseq, "M")[xx,,drop = FALSE] / getBSseq(BSseq, "Cov")[xx,,drop = FALSE] |
|
102 |
+ }) |
|
103 |
+ } |
|
104 |
+ outList <- vector("list", length(regions)) |
|
105 |
+ outList[as.integer(names(mmsplit))] <- out |
|
106 |
+ return(outList) |
|
107 |
+ } |
|
108 |
+ if(what == "perRegion") { |
|
109 |
+ if(type == "smooth") { |
|
110 |
+ out <- lapply(mmsplit, function(xx) { |
|
111 |
+ colMeans(getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef")[xx,,drop = FALSE]), na.rm = TRUE) |
|
112 |
+ }) |
|
113 |
+ } |
|
114 |
+ if(type == "raw") { |
|
115 |
+ out <- lapply(mmsplit, function(xx) { |
|
116 |
+ colMeans(getBSseq(BSseq, "M")[xx,,drop = FALSE] / getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE) |
|
117 |
+ }) |
|
118 |
+ } |
|
119 |
+ out <- do.call(rbind, out) |
|
120 |
+ outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions)) |
|
121 |
+ colnames(outMatrix) <- sampleNames(BSseq) |
|
122 |
+ outMatrix[as.integer(rownames(out)),] <- out |
|
123 |
+ return(outMatrix) |
|
124 |
+ } |
|
125 |
+} |
|
126 |
+ |
|
127 |
+ |
|
128 |
+getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"), |
|
129 |
+ what = c("perBase", "perRegionAverage", "perRegionTotal")) { |
|
130 |
+ stopifnot(is(BSseq, "BSseq")) |
|
131 |
+ type <- match.arg(type) |
|
132 |
+ what <- match.arg(what) |
|
133 |
+ if(is.null(regions)) { |
|
134 |
+ if(what == "perBase") |
|
135 |
+ return(getBSseq(BSseq, type = type)) |
|
136 |
+ if(what == "perRegionTotal") |
|
137 |
+ return(colSums(getBSseq(BSseq, type = type))) |
|
138 |
+ if(what == "perRegionAverage") |
|
139 |
+ return(colMeans(getBSseq(BSseq, type = type))) |
|
140 |
+ } |
|
141 |
+ if(class(regions) == "data.frame") |
|
142 |
+ regions <- data.frame2GRanges(regions) |
|
143 |
+ stopifnot(is(regions, "GenomicRanges")) |
|
144 |
+ grBSseq <- granges(BSseq) |
|
145 |
+ mm <- as.matrix(findOverlaps(regions, grBSseq)) |
|
146 |
+ mmsplit <- split(mm[,2], mm[,1]) |
|
147 |
+ if(what == "perBase") { |
|
148 |
+ if(type == "Cov") { |
|
149 |
+ out <- lapply(mmsplit, function(xx) { |
|
150 |
+ getBSseq(BSseq, "Cov")[xx,,drop = FALSE] |
|
151 |
+ }) |
|
152 |
+ } |
|
153 |
+ if(type == "M") { |
|
154 |
+ out <- lapply(mmsplit, function(xx) { |
|
155 |
+ getBSseq(BSseq, "M")[xx,,drop = FALSE] |
|
156 |
+ }) |
|
157 |
+ } |
|
158 |
+ outList <- vector("list", length(regions)) |
|
159 |
+ outList[as.integer(names(mmsplit))] <- out |
|
160 |
+ return(outList) |
|
161 |
+ } |
|
162 |
+ if(what == "perRegionAverage") { |
|
163 |
+ if(type == "Cov") { |
|
164 |
+ out <- lapply(mmsplit, function(xx) { |
|
165 |
+ colMeans(getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE) |
|
166 |
+ }) |
|
167 |
+ } |
|
168 |
+ if(type == "M") { |
|
169 |
+ out <- lapply(mmsplit, function(xx) { |
|
170 |
+ colMeans(getBSseq(BSseq, "M")[xx,,drop = FALSE], na.rm = TRUE) |
|
171 |
+ }) |
|
172 |
+ } |
|
173 |
+ } |
|
174 |
+ if(what == "perRegionTotal") { |
|
175 |
+ if(type == "Cov") { |
|
176 |
+ out <- lapply(mmsplit, function(xx) { |
|
177 |
+ colSums(getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE) |
|
178 |
+ }) |
|
179 |
+ } |
|
180 |
+ if(type == "M") { |
|
181 |
+ out <- lapply(mmsplit, function(xx) { |
|
182 |
+ colSums(getBSseq(BSseq, "M")[xx,,drop = FALSE], na.rm = TRUE) |
|
183 |
+ }) |
|
184 |
+ } |
|
185 |
+ } |
|
186 |
+ out <- do.call(rbind, out) |
|
187 |
+ outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions)) |
|
188 |
+ colnames(outMatrix) <- sampleNames(BSseq) |
|
189 |
+ outMatrix[as.integer(rownames(out)),] <- out |
|
190 |
+ return(outMatrix) |
|
191 |
+} |
|
192 |
+ |
|
193 |
+getStats <- function(BSseqTstat, regions = NULL, column = c("tstat.corrected", "tstat")) { |
|
194 |
+ stopifnot(is(BSseqTstat, "BSseqTstat")) |
|
195 |
+ column <- match.arg(column) |
|
196 |
+ if(class(regions) == "data.frame") |
|
197 |
+ regions <- data.frame2GRanges(regions) |
|
198 |
+ if(is.null(regions)) |
|
199 |
+ return(BSseqTstat@stats) |
|
200 |
+ stopifnot(is(regions, "GenomicRanges")) |
|
201 |
+ ov <- findOverlaps(BSseqTstat, regions) |
|
202 |
+ ov.sp <- split(queryHits(ov), subjectHits(ov)) |
|
203 |
+ getRegionStats <- function(idx) { |
|
204 |
+ mat <- BSseqTstat@stats[idx,, drop=FALSE] |
|
205 |
+ meanDiff <- mean(mat[, "group1.means"] - mat[, "group2.means"]) |
|
206 |
+ group1.mean <- mean(mat[, "group1.means"]) |
|
207 |
+ group2.mean <- mean(mat[, "group2.means"]) |
|
208 |
+ tstat.sd <- mean(mat[, "tstat.sd"]) |
|
209 |
+ areaStat <- sum(mat[, column]) |
|
210 |
+ maxStat <- max(mat[, column]) |
|
211 |
+ c(meanDiff, group1.mean, group2.mean, tstat.sd, areaStat, maxStat) |
|
212 |
+ } |
|
213 |
+ stats <- lapply(ov.sp, getRegionStats) |
|
214 |
+ out <- matrix(NA, ncol = 6, nrow = length(regions)) |
|
215 |
+ out[as.integer(names(stats)),] <- do.call(rbind, stats) |
|
216 |
+ colnames(out) <- c("meanDiff", "group1.mean", "group2.mean", "tstat.sd", "areaStat", "maxStat") |
|
217 |
+ out <- as.data.frame(out) |
|
218 |
+ out |
|
219 |
+} |
|
220 |
+ |
|
221 |
+ |
|
222 |
+summary.BSseqTstat <- function(object, ...) { |
|
223 |
+ quant <- quantile(getStats(object)[, "tstat.corrected"], |
|
224 |
+ prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999)) |
|
225 |
+ quant <- t(t(quant)) |
|
226 |
+ colnames(quant) <- "quantiles" |
|
227 |
+ out <- list(quantiles = quant) |
|
228 |
+ class(out) <- "summary.BSseqTstat" |
|
229 |
+ out |
|
230 |
+} |
|
231 |
+ |
|
232 |
+print.summary.BSseqTstat <- function(x, ...) { |
|
233 |
+ print(as.matrix(x$quantiles)) |
|
234 |
+} |
|
235 |
+ |
|
236 |
+plot.BSseqTstat <- function(x, y, ...) { |
|
237 |
+ tstat <- getStats(x)[, "tstat"] |
|
238 |
+ tstat.cor <- getStats(x)[, "tstat.corrected"] |
|
239 |
+ plot(density(tstat), xlim = c(-10,10), col = "blue", main = "") |
|
240 |
+ lines(density(tstat.cor), col = "black") |
|
241 |
+ legend("topleft", legend = c("uncorrected", "corrected"), lty = c(1,1), |
|
242 |
+ col = c("blue", "black")) |
|
243 |
+} |
0 | 244 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,79 @@ |
1 |
+dmrFinder <- function(BSseqTstat, cutoff = NULL, qcutoff = c(0.025, 0.975), |
|
2 |
+ maxGap = 300, column = c("tstat.corrected", "tstat"), verbose = TRUE) { |
|
3 |
+ column <- match.arg(column) |
|
4 |
+ if(! column %in% colnames(BSseqTstat@stats)) |
|
5 |
+ stop("'column' needs to be a column of 'BSseqTstat@stats'") |
|
6 |
+ if(is.null(cutoff)) |
|
7 |
+ cutoff <- quantile(BSseqTstat@stats[, column], qcutoff) |
|
8 |
+ direction <- as.integer(BSseqTstat@stats[, column] >= cutoff[2]) |
|
9 |
+ direction[BSseqTstat@stats[, column] <= cutoff[1]] <- -1L |
|
10 |
+ direction[is.na(direction)] <- 0L |
|
11 |
+ chrs <- as.character(seqnames(BSseqTstat@gr)) |
|
12 |
+ positions <- start(BSseqTstat) |
|
13 |
+ regions <- bsseq:::regionFinder3(direction, chr = chrs, pos = positions, maxGap = maxGap, verbose = verbose) |
|
14 |
+ if(verbose) cat("creating dmr data.frame\n") |
|
15 |
+ regions <- do.call(rbind, regions) |
|
16 |
+ rownames(regions) <- NULL |
|
17 |
+ stats <- getStats(BSseqTstat, regions, column = column) |
|
18 |
+ regions <- cbind(regions, stats) |
|
19 |
+ regions$direction <- ifelse(regions$meanDiff > 0, "hyper", "hypo") |
|
20 |
+ regions$width <- regions$end - regions$start + 1 |
|
21 |
+ regions$invdensity <- regions$width / regions$n |
|
22 |
+ regions <- regions[order(abs(regions$areaStat), decreasing = TRUE),] |
|
23 |
+ regions |
|
24 |
+} |
|
25 |
+ |
|
26 |
+clusterMaker <- function(chr,pos,order.it=TRUE,maxGap=300){ |
|
27 |
+ nonaIndex=which(!is.na(chr) & !is.na(pos)) |
|
28 |
+ Indexes=split(nonaIndex,chr[nonaIndex]) |
|
29 |
+ clusterIDs=rep(NA,length(chr)) |
|
30 |
+ LAST=0 |
|
31 |
+ for(i in seq(along=Indexes)){ |
|
32 |
+ Index=Indexes[[i]] |
|
33 |
+ x=pos[Index] |
|
34 |
+ if(order.it){ Index=Index[order(x)];x=pos[Index] } |
|
35 |
+ y=as.numeric(diff(x)>maxGap) |
|
36 |
+ z=cumsum(c(1,y)) |
|
37 |
+ clusterIDs[Index]=z+LAST |
|
38 |
+ LAST=max(z)+LAST |
|
39 |
+ } |
|
40 |
+ clusterIDs |
|
41 |
+} |
|
42 |
+ |
|
43 |
+ |
|
44 |
+regionFinder3 <- function(x, chr, positions, keep, maxGap = 300, verbose = TRUE) { |
|
45 |
+ getSegments3 <- function(x, cid, verbose = TRUE){ |
|
46 |
+ if(verbose) cat("segmenting\n") |
|
47 |
+ segments <- cumsum(c(1, diff(x) != 0)) + |
|
48 |
+ cumsum(c(1, diff(cid) != 0)) |
|
49 |
+ names(segments) <- NULL |
|
50 |
+ if(verbose) cat("splitting\n") |
|
51 |
+ out <- list(up = split(which(x > 0), segments[x > 0]), |
|
52 |
+ down = split(which(x < 0), segments[x < 0]), |
|
53 |
+ nochange = split(which(x == 0), segments[x == 0])) |
|
54 |
+ names(out[[1]]) <- NULL |
|
55 |
+ names(out[[2]]) <- NULL |
|
56 |
+ names(out[[3]]) <- NULL |
|
57 |
+ out |
|
58 |
+ } |
|
59 |
+ cid0 <- clusterMaker(chr, positions, maxGap = maxGap) |
|
60 |
+ segments <- getSegments3(x = x, cid = cid0, verbose = verbose) |
|
61 |
+ out <- vector("list", 2) |
|
62 |
+ names(out) <- c("up", "down") |
|
63 |
+ for(ii in 1:2) { |
|
64 |
+ idx <- segments[[ii]] |
|
65 |
+ if(length(idx) == 0) { |
|
66 |
+ out[[ii]] <- NULL |
|
67 |
+ } else { |
|
68 |
+ out[[ii]] <- data.frame(chr = sapply(idx, function(jj) chr[jj[1]]), |
|
69 |
+ start = sapply(idx, function(jj) min(positions[jj])), |
|
70 |
+ end = sapply(idx, function(jj) max(positions[jj])), |
|
71 |
+ idxStart = sapply(idx, function(jj) min(jj)), |
|
72 |
+ idxEnd = sapply(idx, function(jj) max(jj)), |
|
73 |
+ cluster = sapply(idx, function(jj) cid0[jj[1]]), |
|
74 |
+ n = sapply(idx, length)) |
|
75 |
+ } |
|
76 |
+ } |
|
77 |
+ out |
|
78 |
+} |
|
79 |
+ |
0 | 80 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,65 @@ |
1 |
+fisherTests <- function(BSseq, group1, group2, lookup = NULL, |
|
2 |
+ returnLookup = TRUE, mc.cores = 1, |
|
3 |
+ verbose = TRUE) { |
|
4 |
+ stopifnot(is(BSseq, "BSseq")) |
|
5 |
+ if(is.integer(group1)) { |
|
6 |
+ stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq)) |
|
7 |
+ group1 <- sampleNames(BSseq)[group1] |
|
8 |
+ } |
|
9 |
+ if(is.integer(group2)) { |
|
10 |
+ stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq)) |
|
11 |
+ group2 <- sampleNames(BSseq)[group2] |
|
12 |
+ } |
|
13 |
+ stopifnot(length(intersect(group1, group2)) == 0) |
|
14 |
+ sampleNames <- c(group1, group2) |
|
15 |
+ if(is.null(lookup)) { |
|
16 |
+ oldkeys <- "" |
|
17 |
+ } else { |
|
18 |
+ stopifnot(is.matrix(lookup) && ncol(lookup) == 2 && |
|
19 |
+ setequal(colnames(lookup), c("p.value", "log2OR"))) |
|
20 |
+ oldkeys <- rownames(lookup) |
|
21 |
+ } |
|
22 |
+ if(verbose) cat(" setup ... ") |
|
23 |
+ stime <- system.time({ |
|
24 |
+ M1 <- rowSums(getCoverage(BSseq, type = "M")[, group1, drop = FALSE]) |
|
25 |
+ M2 <- rowSums(getCoverage(BSseq, type = "M")[, group2, drop = FALSE]) |
|
26 |
+ Cov1 <- rowSums(getCoverage(BSseq, type = "Cov")[, group1, drop = FALSE]) |
|
27 |
+ Cov2 <- rowSums(getCoverage(BSseq, type = "Cov")[, group2, drop = FALSE]) |
|
28 |
+ keys <- paste(Cov1, Cov2, M1, M2, sep = "_") |
|
29 |
+ newkeys <- setdiff(keys, oldkeys) |
|
30 |
+ expand <- matrix(as.integer(do.call(rbind, strsplit(newkeys, "_"))), ncol = 4) |
|
31 |
+ colnames(expand) <- c("Cov1", "Cov2", "M1", "M2") |
|
32 |
+ expand2 <- expand |
|
33 |
+ colnames(expand2) <- c("U1", "U2", "M1", "M2") |
|
34 |
+ expand2[,"U1"] <- expand[,"Cov1"] - expand[,"M1"] |
|
35 |
+ expand2[,"U2"] <- expand[,"Cov2"] - expand[,"M2"] |
|
36 |
+ })[3] |
|
37 |
+ if(verbose) cat(round(stime,1), "secs\n") |
|
38 |
+ if(verbose) cat(sprintf(" performing %d tests (using %d cores) ... ", |
|
39 |
+ nrow(expand2), mc.cores)) |
|
40 |
+ stime <- system.time({ |
|
41 |
+ tests <- mclapply(1:nrow(expand2), function(ii) { |
|
42 |
+ unlist(fisher.test(matrix(expand2[ii,], ncol = 2))[c("p.value", "estimate")]) |
|
43 |
+ }, mc.cores = mc.cores) |
|
44 |
+ })[3] |
|
45 |
+ if(verbose) cat(round(stime,1), "secs\n") |
|
46 |
+ if(verbose) cat(" finalizing ... ") |
|
47 |
+ stime <- system.time({ |
|
48 |
+ tests <- do.call(rbind, tests) |
|
49 |
+ tests[,2] <- log2(tests[,2]) |
|
50 |
+ colnames(tests) <- c("p.value", "log2OR") |
|
51 |
+ rownames(tests) <- newkeys |
|
52 |
+ lookup <- rbind(tests, lookup) |
|
53 |
+ results <- lookup[keys,] |
|
54 |
+ rownames(results) <- NULL |
|
55 |
+ |
|
56 |
+ |
|
57 |
+ })[3] |
|
58 |
+ if(verbose) cat(round(stime,1), "secs\n") |
|
59 |
+ if(returnLookup) |
|
60 |
+ return(list(results = results, lookup = lookup)) |
|
61 |
+ else |
|
62 |
+ return(results) |
|
63 |
+} |
|
64 |
+ |
|
65 |
+ |
0 | 66 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,78 @@ |
1 |
+findOverlaps_mclapply <- function (query, subject, maxgap = 0L, minoverlap = 1L, |
|
2 |
+ type = c("any", "start", "end", "within", "equal"), |
|
3 |
+ select = c("all", "first"), ignore.strand = FALSE, |
|
4 |
+ mc.cores = 1, mc.preschedule = TRUE, verbose = FALSE, ...) { |
|
5 |
+ if(!is(query, "GenomicRanges") || !is(subject, "GenomicRanges")) |
|
6 |
+ stop("findOverlaps_mclapply needs 'query' and 'subject' to be 'GenomicRanges'") |
|
7 |
+ if (!IRanges:::isSingleNumber(maxgap) || maxgap < 0) |
|
8 |
+ stop("'maxgap' must be a non-negative integer") |
|
9 |
+ type <- match.arg(type) |
|
10 |
+ select <- match.arg(select) |
|
11 |
+ seqinfo <- merge(seqinfo(query), seqinfo(subject)) |
|
12 |
+ DIM <- c(length(query), length(subject)) |
|
13 |
+ if (min(DIM) == 0L) { |
|
14 |
+ matchMatrix <- matrix(integer(0), nrow = 0L, ncol = 2L, |
|
15 |
+ dimnames = list(NULL, c("queryHits", "subjectHits"))) |
|
16 |
+ } |
|
17 |
+ else { |
|
18 |
+ querySeqnames <- seqnames(query) |
|
19 |
+ querySplitRanges <- splitRanges(querySeqnames) |
|
20 |
+ uniqueQuerySeqnames <- names(querySplitRanges)[sapply(querySplitRanges, length) > 0] # FIX: only keep seqnames with ranges |
|
21 |
+ subjectSeqnames <- seqnames(subject) |
|
22 |
+ subjectSplitRanges <- splitRanges(subjectSeqnames) |
|
23 |
+ uniqueSubjectSeqnames <- names(subjectSplitRanges)[sapply(subjectSplitRanges, length) > 0] # FIX: only keep seqnames with ranges |
|
24 |
+ commonSeqnames <- intersect(uniqueQuerySeqnames, |
|
25 |
+ uniqueSubjectSeqnames) |
|
26 |
+ if (ignore.strand) { |
|
27 |
+ queryStrand <- rep.int(1L, length(query)) |
|
28 |
+ subjectStrand <- rep.int(1L, length(subject)) |
|
29 |
+ } |
|
30 |
+ else { |
|
31 |
+ queryStrand <- strand(query) |
|
32 |
+ levels(queryStrand) <- c("1", "-1", "0") |
|
33 |
+ queryStrand@values <- as.integer(as.character(runValue(queryStrand))) |
|
34 |
+ queryStrand <- as.vector(queryStrand) |
|
35 |
+ subjectStrand <- strand(subject) |
|
36 |
+ levels(subjectStrand) <- c("1", "-1", "0") |
|
37 |
+ subjectStrand@values <- as.integer(as.character(runValue(subjectStrand))) |
|
38 |
+ subjectStrand <- as.vector(subjectStrand) |
|
39 |
+ } |
|
40 |
+ queryRanges <- unname(ranges(query)) |
|
41 |
+ subjectRanges <- unname(ranges(subject)) |
|
42 |
+ matchMatrix <- do.call(rbind, mclapply(commonSeqnames, |
|
43 |
+ function(seqnm) { |
|
44 |
+ if(verbose) cat(seqnm, "\n") # FIX : added verbosity |
|
45 |
+ if (isCircular(seqinfo)[seqnm] %in% TRUE) |
|
46 |
+ circle.length <- seqlengths(seqinfo)[seqnm] |
|
47 |
+ else circle.length <- NA |
|
48 |
+ qIdxs <- querySplitRanges[[seqnm]] |
|
49 |
+ sIdxs <- subjectSplitRanges[[seqnm]] |
|
50 |
+ ## FIX: added ::: tpo get .findOverlaps.circle |
|
51 |
+ overlaps <- GenomicRanges:::.findOverlaps.circle(circle.length, |
|
52 |
+ seqselect(queryRanges, qIdxs), seqselect(subjectRanges, |
|
53 |
+ sIdxs), maxgap, minoverlap, type) |
|
54 |
+ qHits <- queryHits(overlaps) |
|
55 |
+ sHits <- subjectHits(overlaps) |
|
56 |
+ matches <- cbind(queryHits = as.integer(qIdxs)[qHits], |
|
57 |
+ subjectHits = as.integer(sIdxs)[sHits]) |
|
58 |
+ matches[which(seqselect(queryStrand, qIdxs)[qHits] * |
|
59 |
+ seqselect(subjectStrand, sIdxs)[sHits] != |
|
60 |
+ -1L), , drop = FALSE] |
|
61 |
+ }, mc.cores = mc.cores, mc.preschedule = mc.preschedule)) |
|
62 |
+ if (is.null(matchMatrix)) { |
|
63 |
+ matchMatrix <- matrix(integer(0), nrow = 0L, |
|
64 |
+ ncol = 2L, dimnames = list(NULL, c("queryHits", |
|
65 |
+ "subjectHits"))) |
|
66 |
+ } |
|
67 |
+ matchMatrix <- matchMatrix[IRanges:::orderIntegerPairs(matchMatrix[, |
|
68 |
+ 1L], matchMatrix[, 2L]), , drop = FALSE] |
|
69 |
+ } |
|
70 |
+ if (select == "all") { |
|
71 |
+ new("Hits", queryHits = unname(matchMatrix[, 1L]), |
|
72 |
+ subjectHits = unname(matchMatrix[, 2L]), queryLength = DIM[1], |
|
73 |
+ subjectLength = DIM[2]) |
|
74 |
+ } |
|
75 |
+ else { |
|
76 |
+ IRanges:::.hitsMatrixToVector(matchMatrix, length(query)) |
|
77 |
+ } |
|
78 |
+} |
0 | 79 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,71 @@ |
1 |
+poissonGoodnessOfFit <- function(BSseq, nQuantiles = 10^5) { |
|
2 |
+ Cov <- getBSseq(BSseq, type = "Cov") |
|
3 |
+ lambda <- rowMeans(Cov) |
|
4 |
+ out <- list(chisq = NULL, df = ncol(Cov) - 1) |
|
5 |
+ out$chisq <- rowSums((Cov - lambda)^2 / lambda) |
|
6 |
+ if(nQuantiles > length(out$chisq)) |
|
7 |
+ nQuantiles <- length(out$chisq) |
|
8 |
+ probs <- seq(from = 0, to = 1, length.out = nQuantiles) |
|
9 |
+ out$quantiles <- quantile(out$chisq, prob = probs, na.rm = TRUE) |
|
10 |
+ class(out) <- "chisqGoodnessOfFit" |
|
11 |
+ return(out) |
|
12 |
+} |
|
13 |
+ |
|
14 |
+print.chisqGoodnessOfFit <- function(x, ...) { |
|
15 |
+ cat("Chisq Goodness of Fit object\n") |
|
16 |
+ cat(" ", length(x$chisq), "tests\n") |
|
17 |
+ cat(" ", x$df, "degrees of freedom\n") |
|
18 |
+} |
|
19 |
+ |
|
20 |
+binomialGoodnessOfFit <- function(BSseq, method = c("MLE"), nQuantiles = 10^5) { |
|
21 |
+ method <- match.arg(method) |
|
22 |
+ M <- getCoverage(BSseq, type = "M", what = "perBase") |
|
23 |
+ Cov <- getCoverage(BSseq, type = "Cov", what = "perBase") |
|
24 |
+ if(method == "MLE") # This is the MLE under iid |
|
25 |
+ p <- rowSums(M) / rowSums(Cov) |
|
26 |
+ ## p <- rowMeans(M / Cov, na.rm = TRUE) |
|
27 |
+ out <- list(chisq = NULL, quantiles = NULL, df = ncol(Cov) - 1) |
|
28 |
+ ## This gets tricky. For the binomial model, we need Cov > 0, but |
|
29 |
+ ## this implies that a given location may have less then nCol(BSseq) |
|
30 |
+ ## number of observations, and hence the degrees of freedom will |
|
31 |
+ ## vary. |
|
32 |
+ out$chisq <- rowSums((M - Cov*p)^2 / sqrt(Cov * p * (1-p))) |
|
33 |
+ probs <- seq(from = 0, to = 1, length.out = nQuantiles) |
|
34 |
+ ## Next line will remove all locations where not all samples have coverage |
|
35 |
+ out$quantiles <- quantile(out$chisq, prob = probs, na.rm = TRUE) |
|
36 |
+ class(out) <- "chisqGoodnessOfFit" |
|
37 |
+ return(out) |
|
38 |
+} |
|
39 |
+ |
|
40 |
+plot.chisqGoodnessOfFit <- function(x, type = c("chisq", "pvalue"), |
|
41 |
+ plotCol = TRUE, qqline = TRUE, pch = 16, cex = 0.75, ...) { |
|
42 |
+ type <- match.arg(type) |
|
43 |
+ switch(type, |
|
44 |
+ "chisq" = { |
|
45 |
+ yy <- x$quantiles |
|
46 |
+ xx <- qchisq(ppoints(yy), df = x$df) |
|
47 |
+ }, |
|
48 |
+ "pvalue" = { |
|
49 |
+ yy <- sort(1 - qchisq(x$quantiles, df = x$df)) |
|
50 |
+ xx <- qunif(ppoints(yy), min = 0, max = 1) |
|
51 |
+ }) |
|
52 |
+ if(plotCol) { |
|
53 |
+ colors <- rep("black", length(yy)) |
|
54 |
+ colors[floor(length(colors)*.95):length(colors)] <- "red" |
|
55 |
+ colors[floor(length(colors)*.99):length(colors)] <- "violet" |
|
56 |
+ colors[floor(length(colors)*.999):length(colors)] <- "orange" |
|
57 |
+ } else { |
|
58 |
+ colors <- "black" |
|
59 |
+ } |
|
60 |
+ qqplot(xx, yy, xlab = "Theoretical quantiles", |
|
61 |
+ ylab = "Observed quantiles", col = colors, pch = pch, cex = cex, ...) |
|
62 |
+ if(qqline) { |
|
63 |
+ yyqt <- quantile(yy, c(0.25, 0.75)) |
|
64 |
+ xxqt <- quantile(xx, c(0.25, 0.75)) |
|
65 |
+ slope <- diff(yyqt) / diff(xxqt) |
|
66 |
+ int <- yyqt[1L] - slope * xxqt[1L] |
|
67 |
+ abline(int, slope, col = "blue") |
|
68 |
+ } |
|
69 |
+ abline(0, 1, col = "blue", lty = 2) |
|
70 |
+} |
|
71 |
+ |
0 | 72 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,161 @@ |
1 |
+setClassUnion("matrixOrNULL", c("matrix", "NULL")) |
|
2 |
+ |
|
3 |
+setClass("hasGRanges", |
|
4 |
+ representation(gr = "GRanges")) |
|
5 |
+ |
|
6 |
+setMethod("seqnames", signature(x = "hasGRanges"), function(x) { |
|
7 |
+ seqnames(x@gr) |
|
8 |
+}) |
|
9 |
+setReplaceMethod("seqnames", "hasGRanges", function(x, value) { |
|
10 |
+ gr <- granges(x) |
|
11 |
+ seqnames(gr) <- value |
|
12 |
+ x@gr <- gr |
|
13 |
+ x |
|
14 |
+}) |
|
15 |
+ |
|
16 |
+setMethod("seqlevels", signature(x = "hasGRanges"), function(x) { |
|
17 |
+ seqlevels(x@gr) |
|
18 |
+}) |
|
19 |
+setReplaceMethod("seqlevels", "hasGRanges", function(x, value) { |
|
20 |
+ gr <- granges(x) |
|
21 |
+ seqlevels(gr) <- value |
|
22 |
+ x@gr <- gr |
|
23 |
+ x |
|
24 |
+}) |
|
25 |
+ |
|
26 |
+setMethod("seqlengths", signature(x = "hasGRanges"), function(x) { |
|
27 |
+ seqlengths(x@gr) |
|
28 |
+}) |
|