Browse code

Adds bsseq and DBCiP to the repos.

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/bsseq@67929 bc3139a8-67e5-0310-9ffc-ced21a209358

m.carlson authored on 20/07/2012 17:45:09
Showing 42 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,16 @@
1
+Package: bsseq
2
+Version: 0.4.2
3
+Title: Analyze, manage and store bisulfite sequencing data
4
+Description: Tools for analyzing and visualizing bisulfite sequencing data
5
+Author: Kasper Daniel Hansen
6
+Maintainer: Kasper Daniel Hansen <khansen@jhsph.edu>
7
+Depends: R (>= 2.15), methods, BiocGenerics, IRanges, GenomicRanges,
8
+        parallel, matrixStats
9
+Imports: scales, stats, graphics, Biobase, locfit
10
+Suggests: RUnit, bsseqData
11
+Collate: hasGRanges.R BSseq_class.R BSseq_utils.R utils.R read.R
12
+        BSmooth.R dmrFinder.R gof_stats.R plotting.R fisher.R fixes.R
13
+License: Artistic-2.0
14
+LazyData: yes
15
+biocViews: DNAMethylation
16
+Packaged: 2012-07-10 04:15:29 UTC; khansen
0 17
new file mode 100644
... ...
@@ -0,0 +1,70 @@
1
+##
2
+## Importing
3
+##
4
+
5
+import(methods)
6
+importFrom(BiocGenerics, "anyDuplicated", "cbind", "colnames",
7
+           "combine", "density", "intersect", "lapply", "ncol",
8
+           "nrow", "order", "paste", "pmax", "pmin", "rbind",
9
+           "Reduce", "rep.int", "rownames", "sapply", "setdiff",
10
+           "strand", "strand<-", "union", "unique")
11
+
12
+importFrom(stats, "approxfun", "fisher.test", "ppoints",
13
+           "predict", "preplot", "qchisq",
14
+           "qnorm", "qqplot", "qunif")
15
+
16
+importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
17
+           "mtext", "par", "plot", "points", "rect", "rug")
18
+
19
+import(parallel)
20
+importFrom(locfit, "locfit", "lp")
21
+importFrom(matrixStats, "rowSds", "rowVars")
22
+import(IRanges)
23
+import(GenomicRanges)
24
+importFrom(scales, "alpha")
25
+importClassesFrom(Biobase, "AnnotatedDataFrame")
26
+importMethodsFrom(Biobase, "annotatedDataFrameFrom",
27
+                  "pData", "pData<-",
28
+                  "phenoData", "phenoData<-",
29
+                  "sampleNames", "sampleNames<-")
30
+
31
+
32
+##
33
+## Exporting
34
+##
35
+
36
+
37
+exportClasses("hasGRanges",
38
+              "BSseq",
39
+              "BSseqTstat",
40
+              "matrixOrNULL")
41
+
42
+exportMethods("[", "show",
43
+              "seqnames", "seqnames<-",
44
+              "seqlevels", "seqlevels<-",
45
+              "seqlengths", "seqlengths<-",
46
+              "start", "start<-",
47
+              "end", "end<-",
48
+              "width", "width<-",
49
+              "strand", "strand<-",
50
+              "granges",
51
+              "dim", "nrow", "ncol",
52
+              "sampleNames", "sampleNames<-",
53
+              "phenoData", "phenoData<-",
54
+              "pData", "pData<-",
55
+              "findOverlaps", "subsetByOverlaps",
56
+              "combine")
57
+
58
+export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
59
+       "collapseBSseq", "orderBSseq", "hasBeenSmoothed", "chrSelectBSseq",
60
+       "BSmooth", "BSmooth.tstat", "dmrFinder", "fisherTests",
61
+       "plotRegion", "plotManyRegions", 
62
+       "read.umtab", "read.umtab2", "read.bsmooth",
63
+       "poissonGoodnessOfFit", "binomialGoodnessOfFit",
64
+       "data.frame2GRanges", "BSseqTstat")
65
+
66
+S3method("print", "chisqGoodnessOfFit")
67
+S3method("plot", "chisqGoodnessOfFit")
68
+S3method("print", "summary.BSseqTstat")
69
+S3method("summary", "BSseqTstat")
70
+S3method("plot", "BSseqTstat")
0 71
new file mode 100644
... ...
@@ -0,0 +1,269 @@
1
+makeClusters <- function(hasGRanges, maxGap = 10^8, mc.cores = 1) {
2
+    chrOrder <- as.character(runValue(seqnames(hasGRanges)))
3
+    if(anyDuplicated(chrOrder))
4
+        stop("argument 'hasGRanges' is not properly order")
5
+    grBase <- granges(hasGRanges)
6
+    clusters <- reduce(resize(grBase, width = 2*maxGap + 1, fix = "center"))
7
+    start(clusters) <- pmax(rep(1, length(clusters)), start(clusters))
8
+    clusters.sp <- split(clusters, seqnames(clusters))
9
+    stopifnot(all(sapply(clusters.sp, function(cluster.gr) {
10
+        if(length(cluster.gr) <= 1) return(TRUE)
11
+        all(start(cluster.gr)[-length(cluster.gr)] < end(cluster.gr)[-1])
12
+    }))) # are the clusters ordered within the chromosome? This is probably guranteed
13
+    clusters <- Reduce(c, clusters.sp[chrOrder])
14
+    stopifnot(all(chrOrder == runValue(seqnames(clusters))))
15
+    ov <- findOverlaps_mclapply(grBase, clusters, mc.cores = mc.cores)
16
+    clusterIdx <- split(as.matrix(ov)[,1], as.matrix(ov)[,2])
17
+    names(clusterIdx) <- NULL
18
+    clusterIdx
19
+}
20
+    
21
+
22
+BSmooth <- function(BSseq, ns = 70, h = 1000, maxGap = 10^8, parallelBy = c("sample", "chromosome"),
23
+                    mc.preschedule = FALSE, mc.cores = 1, keep.se = FALSE, verbose = TRUE) {
24
+    smooth <- function(idxes, sname) {
25
+        ## Assuming that idxes is a set of indexes into the BSseq object
26
+        ## sname is a single character
27
+        if(verbose >= 3)
28
+            cat(sprintf("    beginning: sample:%s, chr:%s, nLoci:%s\n",
29
+                        sname, as.character(seqnames(BSseq)[idxes[1]]),
30
+                        length(idxes)))
31
+        Cov <- getCoverage(BSseq, type = "Cov")[idxes, sname]
32
+        M <- getCoverage(BSseq, type = "M")[idxes, sname]
33
+        pos <- start(BSseq)[idxes]
34
+        stopifnot(all(diff(pos) > 0))
35
+        wh <- which(Cov != 0)
36
+        nn <- ns / length(wh)
37
+        if(length(wh) <= ns) {
38
+            if(keep.se)
39
+                se.coef <- rep(NA_real_, length(Cov))
40
+            else
41
+                se.coef <- NULL
42
+            return(list(coef = rep(NA_real_, length(Cov)),
43
+                        se.coef = se.coef,
44
+                        trans = NULL, h = h, nn = nn))
45
+        }
46
+        sdata <- data.frame(pos = pos[wh],
47
+                            M = pmin(pmax(M[wh], 0.01), Cov[wh] - 0.01),
48
+                            Cov = Cov[wh])
49
+        fit <- locfit(M ~ lp(pos, nn = nn, h = h), data = sdata,
50
+                      weights = Cov, family = "binomial", maxk = 10000)
51
+        pp <- preplot(fit, where = "data", band = "local",
52
+                      newdata = data.frame(pos = pos))
53
+        if(keep.se) {
54
+            se.coef <- pp$se.fit
55
+        } else {
56
+            se.coef <- NULL
57
+        }
58
+        if(verbose >= 2)
59
+            cat(sprintf("    sample:%s, chr:%s, nLoci:%s, nCoveredLoci:%s, done\n",
60
+                        sname, as.character(seqnames(BSseq)[idxes[1]]),
61
+                        length(idxes), nrow(sdata)))
62
+        return(list(coef = pp$fit, se.coef = se.coef,
63
+                    trans = pp$trans, h = h, nn = nn))
64
+    }
65
+    stopifnot(class(BSseq) == "BSseq")
66
+    parallelBy <- match.arg(parallelBy)
67
+    if(verbose) cat("preprocessing ... ")
68
+    stime <- system.time({
69
+        clusterIdx <- makeClusters(BSseq, maxGap = maxGap, mc.cores = mc.cores)
70
+    })[3]
71
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
72
+
73
+    sampleNames <- sampleNames(BSseq)
74
+    names(sampleNames) <- sampleNames
75
+
76
+    stimeAll <- system.time({
77
+        switch(parallelBy, "sample" = {
78
+            if(verbose) cat(sprintf("smoothing by 'sample' (mc.cores = %d, mc.preschedule = %s)\n",
79
+                                    mc.cores, mc.preschedule))
80
+            out <- mclapply(sampleNames, function(nam) {
81
+                stime <- system.time({
82
+                    tmp <- lapply(clusterIdx, function(jj) {
83
+                        smooth(idxes = jj, sname = nam)
84
+                    })
85
+                    coef <- do.call(c, lapply(tmp, function(xx) xx$coef))
86
+                    se.coef <- do.call(c, lapply(tmp, function(xx) xx$se.coef))
87
+                })[3]
88
+                if(verbose) {
89
+                    cat(sprintf("  sample %s (out of %d), done in %.1f sec\n",
90
+                                nam, length(sampleNames), stime))
91
+                }
92
+                return(list(coef = coef, se.coef = se.coef))
93
+            }, mc.preschedule = mc.preschedule, mc.cores = mc.cores)
94
+            coef <- do.call(cbind, lapply(out, function(xx) xx$coef))
95
+            se.coef <- do.call(cbind, lapply(out, function(xx) xx$se.coef))
96
+        }, "chromosome" = {
97
+            if(verbose) cat(sprintf("smoothing by 'chromosome' (mc.cores = %d, mc.preschedule = %s)\n",
98
+                                    mc.cores, mc.preschedule))
99
+            out <- mclapply(1:length(clusterIdx), function(ii) {
100
+                stime <- system.time({
101
+                    tmp <- lapply(sampleNames, function(nam) {
102
+                        smooth(idxes = clusterIdx[[ii]], sname = nam)
103
+                    })
104
+                    coef <- do.call(cbind, lapply(tmp, function(xx) xx$coef))
105
+                    se.coef <- do.call(cbind, lapply(tmp, function(xx) xx$se.coef))
106
+                })[3]
107
+                if(verbose)
108
+                    cat(sprintf("  chr idx %d (out of %d), done in %.1f sec\n",
109
+                                ii, length(clusterIdx), stime))
110
+                return(list(coef = coef, se.coef = se.coef))
111
+            }, mc.preschedule = mc.preschedule, mc.cores = mc.cores)
112
+            coef <- do.call(rbind, lapply(out, function(xx) xx$coef))
113
+            se.coef <- do.call(rbind, lapply(out, function(xx) xx$se.coef))
114
+        })
115
+    })[3]
116
+    if(verbose)
117
+        cat(sprintf("smoothing done in %.1f sec\n", stimeAll))
118
+
119
+    rownames(coef) <- NULL
120
+    colnames(coef) <- sampleNames(BSseq)
121
+    if(!is.null(se.coef)) {
122
+        rownames(se.coef) <- NULL
123
+        colnames(se.coef) <- sampleNames(BSseq)
124
+    }
125
+    
126
+    mytrans <- function(x) {
127
+        y <- x
128
+        ix <- which(x < 0)
129
+        ix2 <- which(x > 0)
130
+        y[ix] <- exp(x[ix])/(1 + exp(x[ix]))
131
+        y[ix2] <- 1/(1 + exp(-x[ix2]))
132
+        y
133
+    }
134
+    environment(mytrans) <- baseenv()
135
+    BSseq@trans <- mytrans
136
+    BSseq@coef <- coef
137
+    BSseq@se.coef <- se.coef
138
+    parameters <- list(smoothText = sprintf("BSmooth (ns = %d, h = %d, maxGap = %d)", ns, h, maxGap),
139
+                       ns = ns, h = h, maxGap = maxGap)
140
+    BSseq@parameters <- parameters
141
+    BSseq
142
+}
143
+
144
+
145
+BSmooth.tstat <- function(BSseq, group1, group2, estimate.var = c("same", "paired", "group2"),
146
+                          local.correct = TRUE, maxGap = NULL, qSd = 0.75, k = 101, mc.cores = 1, verbose = TRUE){
147
+    smoothSd <- function(Sds, k) {
148
+        k0 <- floor(k/2)
149
+        if(all(is.na(Sds))) return(Sds)
150
+        thresSD <- pmax(Sds, quantile(Sds, qSd, na.rm = TRUE), na.rm = TRUE)
151
+        addSD <- rep(median(Sds, na.rm = TRUE), k0)
152
+        sSds <- as.vector(runmean(Rle(c(addSD, thresSD, addSD)), k = k))
153
+        sSds
154
+    }
155
+    compute.correction <- function(idx, qSd = 0.75) {
156
+        xx <- start(BSseq)[idx]
157
+        yy <- tstat[idx]
158
+        suppressWarnings({
159
+            drange <- diff(range(xx, na.rm = TRUE))
160
+        })
161
+        if(drange <= 25000)
162
+            return(yy)
163
+        tstat.function <- approxfun(xx, yy)
164
+        xx.reg <- seq(from = min(xx), to = max(xx), by = 2000)
165
+        yy.reg <- tstat.function(xx.reg)
166
+        fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0),
167
+                      family = "huber", maxk = 10000) 
168
+        correction <- predict(fit, newdata = data.frame(xx.reg = xx))
169
+        yy - correction 
170
+    }
171
+
172
+    estimate.var <- match.arg(estimate.var)
173
+    stopifnot(is(BSseq, "BSseq"))
174
+    stopifnot(hasBeenSmoothed(BSseq))
175
+    if(is.numeric(group1)) {
176
+        stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq))
177
+        group1 <- sampleNames(BSseq)[group1]
178
+    }
179
+    if(is.numeric(group2)) {
180
+        stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq))
181
+        group2 <- sampleNames(BSseq)[group2]
182
+    }
183
+    stopifnot(length(intersect(group1, group2)) == 0)
184
+    if(estimate.var == "paired")
185
+        stopifnot(length(group1) == length(group2))
186
+    stopifnot(all(c(group1, group2) %in% sampleNames(BSseq)))
187
+
188
+    if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0))
189
+        warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first")
190
+    
191
+    if(is.null(maxGap))
192
+        maxGap <- BSseq@parameters$maxGap
193
+    
194
+    if(verbose) cat("preprocessing ... ")
195
+    stime <- system.time({
196
+        clusterIdx <- makeClusters(BSseq, maxGap = maxGap, mc.cores = mc.cores)
197
+    })[3]
198
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
199
+        
200
+    if(verbose) cat("computing stats within groups ... ")
201
+    stime <- system.time({
202
+        allPs <- getMeth(BSseq, type = "smooth", what = "perBase",
203
+                         confint = FALSE)[, c(group1, group2)]
204
+        group1.means <- rowMeans(allPs[, group1, drop = FALSE], na.rm = TRUE)
205
+        group2.means <- rowMeans(allPs[, group2, drop = FALSE], na.rm = TRUE)
206
+        })[3]                               
207
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
208
+    
209
+    if(verbose) cat("computing stats across groups ... ")
210
+    stime <- system.time({
211
+        switch(estimate.var,
212
+               "group2" = {
213
+                   rawSds <- rowSds(allPs[, group2, drop = FALSE], na.rm = TRUE)
214
+                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
215
+                       smoothSd(rawSds[idx], k = k)
216
+                   }, mc.cores = mc.cores))
217
+                   scale <- sqrt(1/length(group1) + 1/length(group2))
218
+                   tstat.sd <- smoothSds * scale
219
+               },
220
+               "same" = {
221
+                   rawSds <- sqrt( ((length(group1) - 1) * rowVars(allPs[, group1, drop = FALSE]) +
222
+                                    (length(group2) - 1) * rowVars(allPs[, group2, drop = FALSE])) /
223
+                                  (length(group1) + length(group2) - 2))
224
+                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
225
+                       smoothSd(rawSds[idx], k = k)
226
+                   }, mc.cores = mc.cores))
227
+                   scale <- sqrt(1/length(group1) + 1/length(group2))
228
+                   tstat.sd <- smoothSds * scale
229
+               },
230
+               "paired" = {
231
+                   rawSds <- rowSds(allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE])
232
+                   smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
233
+                       smoothSd(rawSds[idx], k = k)
234
+                   }, mc.cores = mc.cores))
235
+                   scale <- sqrt(1/length(group1))
236
+                   tstat.sd <- smoothSds * scale
237
+               })
238
+        tstat <- (group1.means - group2.means) / tstat.sd
239
+        is.na(tstat)[tstat.sd == 0] <- TRUE
240
+        if(local.correct) {
241
+            tstat.corrected <- do.call(c, mclapply(clusterIdx,
242
+                                                   compute.correction, qSd = qSd,
243
+                                                   mc.cores = mc.cores))
244
+        }
245
+    })[3]
246
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
247
+    
248
+    if(local.correct) {
249
+        stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
250
+                       tstat, tstat.corrected)
251
+        colnames(stats) <- c("rawSds", "tstat.sd",
252
+                             "group2.means", "group1.means", "tstat",
253
+                             "tstat.corrected")
254
+ 
255
+    } else {
256
+        stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
257
+                       tstat)
258
+        colnames(stats) <- c("rawSds", "tstat.sd",
259
+                             "group2.means", "group1.means", "tstat")
260
+    }
261
+    
262
+    parameters <- c(BSseq@parameters,
263
+                    list(tstatText = sprintf("BSmooth.tstat (local.correct = %s, maxGap = %d)",
264
+                         local.correct, maxGap),
265
+                         group1 = group1, group2 = group2, k = k, qSd = qSd,
266
+                         local.correct = local.correct, maxGap = maxGap))
267
+    out <- BSseqTstat(gr = granges(BSseq), stats = stats, parameters = parameters)
268
+    out
269
+}
0 270
new file mode 100644
... ...
@@ -0,0 +1,434 @@
1
+setClassUnion("matrixOrNULL", c("matrix", "NULL"))
2
+
3
+setClass("BSseq", contains = "hasGRanges", 
4
+         representation(M = "matrix",
5
+                        Cov = "matrix",
6
+                        coef = "matrixOrNULL",
7
+                        se.coef = "matrixOrNULL",
8
+                        trans = "function",
9
+                        parameters = "list",
10
+                        phenoData = "AnnotatedDataFrame"))
11
+
12
+
13
+setClass("BSseqTstat", contains = "hasGRanges", 
14
+         representation(stats = "matrix",
15
+                        parameters = "list")
16
+         )
17
+         
18
+
19
+##
20
+## Simple accessor and replacement functions
21
+##
22
+
23
+setValidity("BSseq", function(object) {
24
+    msg <- NULL
25
+    nCpGs <- length(object@gr)
26
+    nSamples <- ncol(object@M)
27
+    if(nCpGs != nrow(object@M) ||
28
+       nCpGs != nrow(object@Cov))
29
+        msg <- c(msg, "length of the 'gr' slot is not equal to the nrows the 'M' or 'Cov' slots")
30
+    if(nSamples != ncol(object@Cov))
31
+        msg <- c(msg, "ncols of the 'M' slot is not equal to the ncols of the 'Cov' slot")
32
+    if(nSamples != nrow(object@phenoData))
33
+        msg <- c(msg, "the 'phenoData' slot does not match the dimensions of the 'M' slot")
34
+    if(any(object@M < 0))
35
+        msg <- c(msg, "the 'M' slot has negative entries")
36
+    if(any(object@Cov < 0))
37
+        msg <- c(msg, "the 'Cov' slot has negative entries")
38
+    if(any(object@M > object@Cov))
39
+        msg <- c(msg, "the 'M' slot has at least one entry bigger than the 'Cov' slot")
40
+    if(!is.null(object@coef) && (nCpGs != nrow(object@coef) ||
41
+                                 nSamples != ncol(object@coef)))
42
+        msg <- c(msg, "nrows and/or ncols of the 'coef' slot does not have the same dimensions as the 'M' slot")
43
+    if(!is.null(object@se.coef) && (nCpGs != nrow(object@se.coef) ||
44
+                                    nSamples != ncol(object@se.coef)))
45
+        msg <- c(msg, "nrows and/or ncols of the 'se.coef' slot does not have the same dimensions as the 'M' slot")
46
+    if(!is.null(rownames(object@M)) ||
47
+       !is.null(rownames(object@Cov)) ||
48
+       !is.null(rownames(object@coef)) ||
49
+       !is.null(rownames(object@se.coef))) warning("unnecessary rownames in object")
50
+    ## FIXME: check samplenames
51
+    if(is.null(msg)) TRUE else msg
52
+})
53
+
54
+
55
+
56
+setValidity("BSseqTstat", function(object) {
57
+    msg <- NULL
58
+    if(length(object@gr) != nrow(object@stats))
59
+        msg <- c(msg, "length of 'gr' is different from the number of rows of 'stats'")
60
+    if(is.null(msg)) TRUE else msg
61
+})
62
+
63
+
64
+setMethod("show", signature(object = "BSseq"),
65
+          function(object) {
66
+              cat("An object of type 'BSseq' with\n")
67
+              cat(" ", nrow(object), "methylation loci\n")
68
+              cat(" ", ncol(object), "samples\n")
69
+              if(hasBeenSmoothed(object)) {
70
+                  cat("has been smoothed with\n")
71
+                  cat(" ", object@parameters$smoothText, "\n")
72
+              } else {
73
+                  cat("has not been smoothed\n")
74
+              }
75
+          })
76
+
77
+setMethod("show", signature(object = "BSseqTstat"),
78
+          function(object) {
79
+              cat("An object of type 'BSseqTstat' with\n")
80
+              cat(" ", length(object), "methylation loci\n")
81
+              cat("based on smoothed data:\n")
82
+              cat(" ", object@parameters$smoothText, "\n")
83
+              cat("with parameters\n")
84
+              cat(" ", object@parameters$tstatText, "\n")
85
+          })
86
+
87
+
88
+##
89
+## eSet stuff
90
+##
91
+
92
+setMethod("dim", "BSseq", function(x) {
93
+    dim(x@M)
94
+})
95
+setMethod("nrow", "BSseq", function(x) {
96
+    nrow(x@M)
97
+})
98
+setMethod("ncol", "BSseq", function(x) {
99
+    ncol(x@M)
100
+})
101
+
102
+setMethod("phenoData", "BSseq", function(object) {
103
+    object@phenoData
104
+})
105
+setReplaceMethod("phenoData", signature = signature(
106
+                              object = "BSseq",
107
+                              value = "AnnotatedDataFrame"),
108
+                 function(object, value) {
109
+                     object@phenoData <- value
110
+                     object
111
+                 })
112
+
113
+setMethod("pData", "BSseq", function(object) {
114
+    pData(phenoData(object))
115
+})
116
+setReplaceMethod("pData", signature = signature(
117
+                              object = "BSseq",
118
+                              value = "data.frame"),
119
+                 function(object, value) {
120
+                     pd <- object@phenoData
121
+                     pData(pd) <- value
122
+                     object@phenoData <- pd
123
+                     object
124
+                 })
125
+
126
+setMethod("sampleNames", "BSseq", function(object) {
127
+    sampleNames(phenoData(object))
128
+})
129
+
130
+setReplaceMethod("sampleNames",
131
+                 signature(object = "BSseq", value = "ANY"),
132
+                 function(object, value) {
133
+                     sampleNames(phenoData(object)) <- value
134
+                     colnames(object@M) <- value
135
+                     colnames(object@Cov) <- value
136
+                     if(!is.null(object@coef))
137
+                         colnames(object@coef) <- value
138
+                     if(!is.null(object@se.coef))
139
+                         colnames(object@se.coef) <- value
140
+                     object
141
+                 })
142
+
143
+hasBeenSmoothed <- function(BSseq) {
144
+    !is.null(BSseq@coef)
145
+}
146
+
147
+##
148
+## Subsetting and combining
149
+##
150
+
151
+setMethod("[", "BSseq", function(x, i, j, ...) {
152
+    if(missing(drop))
153
+        drop <- FALSE
154
+    if(missing(i) && missing(j))
155
+        stop("need [i,j] for subsetting")
156
+    if(!missing(j))
157
+        x@phenoData <- phenoData(x)[j,, ..., drop = drop]
158
+    else
159
+        x@phenoData <- phenoData(x)
160
+    if(missing(i)) {
161
+        x@M <- getBSseq(x, "M")[, j, drop = FALSE]
162
+        x@Cov <- getBSseq(x, "Cov")[, j, drop = FALSE]
163
+        x@coef <- getBSseq(x, "coef")[, j, drop = FALSE]
164
+        x@se.coef <- getBSseq(x, "se.coef")[, j, drop = FALSE]
165
+        x@gr <- granges(x)
166
+    } else {
167
+        x@M <- getBSseq(x, "M")[i, j, drop = FALSE]
168
+        x@Cov <- getBSseq(x, "Cov")[i, j, drop = FALSE]
169
+        x@coef <- getBSseq(x, "coef")[i, j, drop = FALSE]
170
+        x@se.coef <- getBSseq(x, "se.coef")[i, j, drop = FALSE]
171
+        x@gr <- granges(x)[i]
172
+    }
173
+    x
174
+})
175
+
176
+setMethod("[", "BSseqTstat", function(x, i, ...) {
177
+    if(missing(i))
178
+        stop("need [i] for subsetting")
179
+    if(missing(i))
180
+        return(x)
181
+    x@gr <- x@gr[i]
182
+    x@stats <- x@stats[i,, drop = FALSE]
183
+    x
184
+})
185
+
186
+setMethod("combine", signature(x = "BSseq", y = "BSseq"), function(x, y, ...) {
187
+    ## All of this assumes that we are place x and y "next" to each other,
188
+    ##  ie. we are not combining the same set of samples sequenced at different times
189
+    if (class(x) != class(y))
190
+        stop(paste("objects must be the same class, but are ",
191
+                   class(x), ", ", class(y), sep=""))
192
+    if(hasBeenSmoothed(x) && hasBeenSmoothed(y) && !all.equal(x@trans, y@trans))
193
+        stop("'x' and 'y' need to be smoothed on the same scale")
194
+    phenoData <- combine(phenoData(x), phenoData(y))
195
+    gr <- reduce(c(granges(x), granges(y)), min.gapwidth = 0L)
196
+    mm.x <- as.matrix(findOverlaps(gr, granges(x)))
197
+    mm.y <- as.matrix(findOverlaps(gr, granges(y)))
198
+    sampleNames <- c(sampleNames(x), sampleNames(y))
199
+    ## FIXME: there is no check that the two sampleNames are disjoint.
200
+    M <- Cov <- matrix(0, nrow = length(gr), ncol = length(sampleNames))
201
+    colnames(M) <- colnames(Cov) <- sampleNames
202
+    M[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "M")[mm.x[,2],]
203
+    M[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "M")[mm.y[,2],]
204
+    Cov[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "Cov")[mm.x[,2],]
205
+    Cov[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "Cov")[mm.y[,2],]
206
+    if(!hasBeenSmoothed(x) || !hasBeenSmoothed(y)) {
207
+        coef <- NULL
208
+        se.coef <- NULL
209
+        trans <- NULL
210
+    } else {
211
+        trans <- x@trans
212
+        coef <- matrix(0, nrow = length(gr), ncol = length(sampleNames))
213
+        colnames(coef) <- sampleNames(phenoData)
214
+        if(hasBeenSmoothed(x))
215
+            coef[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "coef")[mm.x[,2],]
216
+        if(hasBeenSmoothed(y))
217
+            coef[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "coef")[mm.y[,2],]
218
+        if(is.null(getBSseq(x, "se.coef")) && is.null(getBSseq(x, "se.coef")))
219
+            se.coef <- NULL
220
+        else {
221
+            se.coef <- matrix(0, nrow = length(gr), ncol = length(sampleNames))
222
+            colnames(se.coef) <- sampleNames(phenoData)
223
+            if(!is.null(getBSseq(x, "se.coef")))
224
+                se.coef[mm.x[,1], 1:ncol(x)] <- getBSseq(x, "se.coef")[mm.x[,2],]
225
+            if(!is.null(getBSseq(y, "se.coef")))
226
+                se.coef[mm.y[,1], ncol(x) + 1:ncol(y)] <- getBSseq(y, "se.coef")[mm.y[,2],]
227
+        }
228
+    }
229
+    BSseq(gr = gr, M = M, Cov = Cov, coef = coef, se.coef = se.coef, trans = trans, rmZeroCov = FALSE)
230
+})
231
+
232
+getBSseq <- function(BSseq, type = c("Cov", "M", "gr", "coef", "se.coef", "trans", "parameters")) {
233
+    type <- match.arg(type)
234
+    switch(type,
235
+           "Cov" = {
236
+               return(BSseq@Cov)
237
+           },
238
+           "M" = {
239
+               return(BSseq@M)
240
+           },
241
+           "gr" = {
242
+               return(BSseq@gr)
243
+           },
244
+           "coef" = {
245
+               return(BSseq@coef)
246
+           },
247
+           "se.coef" = {
248
+               return(BSseq@se.coef)
249
+           },
250
+           "trans" = {
251
+               return(BSseq@trans)
252
+           },
253
+           "parameters" = {
254
+               return(BSseq@parameters)
255
+           })
256
+}
257
+
258
+
259
+
260
+BSseq <- function(M = NULL, Cov = NULL, coef = NULL, se.coef = NULL,
261
+                  trans = NULL, parameters = NULL, phenoData = NULL,
262
+                  gr = NULL, pos = NULL, chr = NULL, sampleNames = NULL,
263
+                  rmZeroCov = FALSE) {
264
+    if(is.null(gr)) {
265
+        if(is.null(pos) || is.null(chr))
266
+            stop("Need pos and chr")
267
+        gr <- GRanges(seqnames = chr, ranges = IRanges(start = pos, width = 1))
268
+    }
269
+    if(!is(gr, "GRanges"))
270
+        stop("'gr' needs to be a GRanges")
271
+    if(any(width(gr)) != 1)
272
+        stop("'gr' needs to have widths of 1")
273
+    if(is.null(M) || is.null(Cov))
274
+        stop("Need M and Cov")
275
+    if(!is.matrix(M))
276
+        stop("'M' needs to be a matrix")
277
+    if(!is.matrix(Cov))
278
+        stop("'Cov' needs to be a matrix")
279
+    if(length(gr) != nrow(M) ||
280
+       length(gr) != nrow(Cov) ||
281
+       ncol(Cov) != ncol(M))
282
+        stop("'gr', 'M' and 'Cov' need to have similar dimensions")
283
+    if(!is.null(rownames(M)))
284
+        rownames(M) <- NULL
285
+    if(!is.null(rownames(Cov)))
286
+        rownames(Cov) <- NULL
287
+    if(!is.null(names(gr)))
288
+        names(gr) <- NULL
289
+    ## deal with sampleNames
290
+    if(is.null(sampleNames) && !is.null(phenoData) && !is.null(sampleNames(phenoData)))
291
+        sampleNames <- sampleNames(phenoData)
292
+    if(is.null(sampleNames) && !is.null(colnames(M)))
293
+        sampleNames <- colnames(M)
294
+    if(is.null(sampleNames) && !is.null(colnames(Cov)))
295
+        sampleNames <- colnames(Cov)
296
+    if(is.null(sampleNames))
297
+        sampleNames <- paste("V", 1:ncol(M), sep = "")
298
+    if(length(unique(sampleNames)) != ncol(M))
299
+        stop("sampleNames need to be unique and of the right length.")
300
+    ## check that 0 <= M <= Cov and remove positions with Cov = 0
301
+    if(any(M < 0) || any(M > Cov) || any(is.na(M)) || any(is.na(Cov)) ||
302
+       any(is.infinite(Cov)))
303
+        stop("'M' and 'Cov' may not contain NA or infinite values and 0 <= M <= Cov")
304
+    if(rmZeroCov) {
305
+        wh <- which(rowSums(Cov) == 0)
306
+        if(length(wh) > 0) {
307
+            gr <- gr[-wh]
308
+            M <- M[-wh,,drop = FALSE]
309
+            Cov <- Cov[-wh,,drop = FALSE]
310
+        }
311
+    }
312
+    grR <- reduce(gr, min.gapwidth = 0L)
313
+    if(!identical(grR, gr)) {
314
+        ## Now we either need to re-order or collapse or both
315
+        mm <- as.matrix(findOverlaps(grR, gr))
316
+        mm <- mm[order(mm[,1]),]
317
+        if(length(grR) == length(gr)) {
318
+            ## only re-ordering is necessary 
319
+            gr <- grR
320
+            M <- M[mm[,2],,drop = FALSE]
321
+            Cov <- Cov[mm[,2],,drop = FALSE]
322
+            if(!is.null(coef))
323
+                coef <- coef[mm[,2],,drop = FALSE]
324
+            if(!is.null(se.coef))
325
+                se.coef <- se.coef[mm[,2],, drop = FALSE]
326
+        } else {
327
+            warning("multiple positions, collapsing BSseq object\n")
328
+            if(!is.null(coef) || !is.null(se.coef))
329
+                stop("Cannot collapse when 'coef' or 'se.coef' are present")
330
+            gr <- grR
331
+            sp <- split(mm[,2], mm[,1])[as.character(1:length(grR))]
332
+            names(sp) <- NULL
333
+            M <- do.call(rbind, lapply(sp, function(ii) colSums(M[ii,, drop = FALSE])))
334
+            Cov <- do.call(rbind, lapply(sp, function(ii) colSums(Cov[ii,, drop = FALSE])))
335
+        }
336
+    }
337
+    if(is.null(colnames(M)) || any(sampleNames != colnames(M)))
338
+        colnames(M) <- sampleNames
339
+    if(is.null(colnames(Cov)) || any(sampleNames != colnames(Cov)))
340
+        colnames(Cov) <- sampleNames
341
+    if(is.null(phenoData))
342
+        phenoData <- annotatedDataFrameFrom(M, byrow = FALSE)
343
+    BSseq <- new("BSseq", gr = gr, M = M, Cov = Cov, phenoData = phenoData)
344
+    if(!is.null(coef)) {
345
+        if(!is.matrix(coef) ||
346
+           nrow(coef) != nrow(BSseq) ||
347
+           ncol(coef) != ncol(BSseq))
348
+            stop("'coef' does not have the right dimensions")
349
+        if(is.null(colnames(coef)) || any(sampleNames != colnames(coef)))
350
+            colnames(coef) <- sampleNames
351
+        if(!is.null(rownames(coef)))
352
+            rownames(coef) <- NULL
353
+        BSseq@coef <- coef
354
+    }
355
+    if(!is.null(se.coef)) {
356
+        if(!is.matrix(se.coef) ||
357
+           nrow(se.coef) != nrow(BSseq) ||
358
+           ncol(se.coef) != ncol(BSseq))
359
+            stop("'se.coef' does not have the right dimensions")
360
+        if(is.null(colnames(se.coef)) || any(sampleNames != colnames(se.coef)))
361
+            colnames(se.coef) <- sampleNames
362
+        if(!is.null(rownames(se.coef)))
363
+            rownames(se.coef) <- NULL
364
+        BSseq@se.coef <- se.coef
365
+    }
366
+    if(is.function(trans))
367
+        BSseq@trans <- trans
368
+    if(is.list(parameters))
369
+        BSseq@parameters <- parameters
370
+    BSseq
371
+}
372
+
373
+
374
+BSseqTstat <- function(gr = NULL, stats = NULL, parameters = NULL) {
375
+    out <- new("BSseqTstat")
376
+    out@gr <- gr
377
+    out@stats <- stats
378
+    out@parameters <- parameters
379
+    out
380
+}
381
+
382
+
383
+
384
+                  
385
+
386
+## getD <- function(data, sample1, sample2, type = c("raw", "fit"),
387
+##                  addPositions = FALSE, alpha = 0.95) {
388
+##     d.conf <- function(p1, n1, p2, n2) {
389
+##         ## Method 10 from Newcombe (Stat in Med, 1998)
390
+##         getRoots <- function(p, n) {
391
+##             mat <- cbind(p^2, -(2*p + z^2/n), 1+z^2/n)
392
+##             roots <- t(Re(apply(mat, 1, polyroot)))
393
+##             roots
394
+##         }
395
+##         roots1 <- roots2 <- matrix(NA_real_, nrow = length(p1), ncol = 2)
396
+##         idx <- !is.na(p1)
397
+##         roots1[idx,] <- getRoots(p1[idx], n1[idx])
398
+##         idx <- !is.na(p2)
399
+##         roots2[idx,] <- getRoots(p2[idx], n2[idx])
400
+##         d <- p1 - p2
401
+##         suppressWarnings({
402
+##             lower <- d - z * sqrt(roots1[,1]*(1-roots1[,1])/n1 + roots2[,2]*(1-roots2[,2])/n2)
403
+##             upper <- d + z * sqrt(roots1[,2]*(1-roots1[,2])/n1 + roots2[,1]*(1-roots2[,1])/n2)
404
+##         })
405
+##         return(data.frame(d = d, lower = lower, upper = upper))
406
+##     }
407
+##     type <- match.arg(type)
408
+##     z <- abs(qnorm((1-alpha)/2, mean = 0, sd = 1))
409
+##     switch(type,
410
+##            raw = {
411
+##                conf <- d.conf(p1 = data$M[, sample1] / data$Cov[, sample1],
412
+##                               n1 = data$Cov[, sample1],
413
+##                               p2 = data$M[, sample2] / data$Cov[, sample2],
414
+##                               n2 = data$Cov[, sample2])
415
+##                out <- conf
416
+##            },
417
+##            fit = {
418
+##                p1 <- data$trans(data$coef[, sample1])
419
+##                p2 <- data$trans(data$coef[, sample2])
420
+##                d <- p1 - p2
421
+##                ## Based on the delta method
422
+##                se.d <- sqrt((data$se.coef[, sample1] * p1 * (1-p1))^2 +
423
+##                             (data$se.coef[, sample2] * p2 * (1-p2))^2)
424
+               
425
+##                lower <- d - z * se.d
426
+##                upper <- d + z * se.d
427
+##                out <- data.frame(d = d, lower = lower, upper = upper)
428
+##            })
429
+##     if(addPositions) {
430
+##         out$pos <- start(data$gr)
431
+##         out$chr <- seqnames(data$gr)
432
+##     }
433
+##     out
434
+## }
0 435
new file mode 100644
... ...
@@ -0,0 +1,243 @@
1
+collapseBSseq <- function(BSseq, columns) {
2
+    ## columns is a vector of new names, names(columns) is sampleNames or empty
3
+    if(is.null(names(columns)) && length(columns) != ncol(BSseq))
4
+        stop("if `columns' does not have names, it needs to be of the same length as `BSseq` has columns (samples)")
5
+    if(!is.null(names(columns)) && !all(names(columns) %in% sampleNames(BSseq)))
6
+        stop("if `columns` has names, they need to be sampleNames(BSseq)")
7
+    if(is.null(names(columns))) {
8
+        sp <- split(1:ncol(BSseq), columns)
9
+    } else {
10
+        sp <- split(names(columns), columns)
11
+    }
12
+    M <- do.call(cbind, lapply(sp, function(ss) {
13
+        rowSums(BSseq@M[, ss, drop = FALSE])
14
+    }))
15
+    colnames(M) <- names(sp)
16
+    Cov <- do.call(cbind, lapply(sp, function(ss) {
17
+        rowSums(BSseq@Cov[, ss, drop = FALSE])
18
+    }))
19
+    colnames(Cov) <- names(sp)
20
+    BSseq(gr = BSseq@gr, M = M, Cov = Cov)
21
+}
22
+
23
+chrSelectBSseq <- function(BSseq, seqnames = NULL, order = FALSE) {
24
+    gr <- GRanges(seqnames = seqnames,
25
+                  ranges = IRanges(start = rep(1, length(seqnames)),
26
+                  end = rep(10^9, length(seqnames))))
27
+    BSseq <- subsetByOverlaps(BSseq, gr)
28
+    seqlevels(BSseq) <- seqlevels(BSseq)[seqlevels(BSseq) %in% seqnames]
29
+    if(order)
30
+        BSseq <- orderBSseq(BSseq, seqOrder = seqnames)
31
+    BSseq
32
+}
33
+
34
+
35
+orderBSseq <- function(BSseq, seqOrder = NULL) {
36
+    splitNames <- splitRanges(seqnames(BSseq))
37
+    if(is.null(seqOrder))
38
+        seqOrder <- names(splitNames)
39
+    else
40
+        stopifnot(all(seqOrder %in% names(splitNames)))
41
+    splitOd <- lapply(seqOrder, function(nam) {
42
+        seqRanges <- seqselect(ranges(granges(BSseq)), splitNames[[nam]]) 
43
+        as.integer(unlist(splitNames[[nam]])[order(start(seqRanges))])
44
+    })
45
+    BSseq <- BSseq[do.call(c, splitOd)]
46
+    seqlevels(BSseq) <- seqOrder
47
+    BSseq
48
+}
49
+
50
+
51
+getMeth <- function(BSseq, regions = NULL, type = c("smooth", "raw"),
52
+                    what = c("perBase", "perRegion"), confint = FALSE, alpha = 0.95) {
53
+    p.conf <- function(p, n, alpha) {
54
+        z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1))
55
+        upper <- (p + z^2/(2*n) + z*sqrt(  (p*(1-p) + z^2/(4*n)) / n)) /
56
+            (1+z^2/n)
57
+        lower <- (p + z^2/(2*n) - z*sqrt(  (p*(1-p) + z^2/(4*n)) / n)) /
58
+            (1+z^2/n)
59
+        return(list(meth = p, lower = lower, upper = upper))
60
+    }
61
+    stopifnot(is(BSseq, "BSseq"))
62
+    type <- match.arg(type)
63
+    what <- match.arg(what)
64
+    z <- abs(qnorm((1 - alpha)/2, mean = 0, sd = 1))
65
+    if(is.null(regions) && type == "smooth") {
66
+        meth <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef"))
67
+        if(confint) {
68
+            upper <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef") +
69
+                                      z * getBSseq(BSseq, type = "se.coef"))
70
+            lower <- getBSseq(BSseq, type = "trans")(getBSseq(BSseq, type = "coef") -
71
+                                      z * getBSseq(BSseq, type = "se.coef"))
72
+            return(list(meth = meth, lower = lower, upper = upper))
73
+        } else {
74
+            return(meth)
75
+        }
76
+    }
77
+    if(is.null(regions) && type == "raw") {
78
+        meth <- getBSseq(BSseq, type = "M") / getBSseq(BSseq, type = "Cov")
79
+        if(confint) {
80
+            return(p.conf(meth, n = getBSseq(BSseq, type = "Cov"), alpha = alpha))
81
+        } else {
82
+            return(meth)
83
+        }
84
+    }
85
+    ## At this point, regions have been specified
86
+    if(class(regions) == "data.frame")
87
+        regions <- data.frame2GRanges(regions)
88
+    stopifnot(is(regions, "GenomicRanges"))
89
+    if(confint) stop("'confint = TRUE' is not supported by 'getMeth' when regions is given")
90
+    grBSseq <- granges(BSseq)
91
+    mm <- as.matrix(findOverlaps(regions, grBSseq))
92
+    mmsplit <- split(mm[,2], mm[,1])
93
+    if(what == "perBase") {
94
+        if(type == "smooth") {
95
+            out <- lapply(mmsplit, function(xx) {
96
+                getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef")[xx,,drop = FALSE])
97
+            })
98
+        }
99
+        if(type == "raw") {
100
+            out <- lapply(mmsplit, function(xx) {
101
+                getBSseq(BSseq, "M")[xx,,drop = FALSE] / getBSseq(BSseq, "Cov")[xx,,drop = FALSE]
102
+            })
103
+        }
104
+        outList <- vector("list", length(regions))
105
+        outList[as.integer(names(mmsplit))] <- out
106
+        return(outList)
107
+    }
108
+    if(what == "perRegion") {
109
+        if(type == "smooth") {
110
+            out <- lapply(mmsplit, function(xx) {
111
+                colMeans(getBSseq(BSseq, "trans")(getBSseq(BSseq, "coef")[xx,,drop = FALSE]), na.rm = TRUE)
112
+            })
113
+        }
114
+        if(type == "raw") {
115
+            out <- lapply(mmsplit, function(xx) {
116
+                colMeans(getBSseq(BSseq, "M")[xx,,drop = FALSE] / getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE)
117
+            })
118
+        }
119
+        out <- do.call(rbind, out)
120
+        outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
121
+        colnames(outMatrix) <- sampleNames(BSseq)
122
+        outMatrix[as.integer(rownames(out)),] <- out
123
+        return(outMatrix)
124
+    }
125
+}
126
+
127
+
128
+getCoverage <- function(BSseq, regions = NULL, type = c("Cov", "M"),
129
+                    what = c("perBase", "perRegionAverage", "perRegionTotal")) {
130
+    stopifnot(is(BSseq, "BSseq"))
131
+    type <- match.arg(type)
132
+    what <- match.arg(what)
133
+    if(is.null(regions)) {
134
+        if(what == "perBase")
135
+            return(getBSseq(BSseq, type = type))
136
+        if(what == "perRegionTotal")
137
+            return(colSums(getBSseq(BSseq, type = type)))
138
+        if(what == "perRegionAverage")
139
+            return(colMeans(getBSseq(BSseq, type = type)))
140
+    }
141
+    if(class(regions) == "data.frame")
142
+        regions <- data.frame2GRanges(regions)
143
+    stopifnot(is(regions, "GenomicRanges"))
144
+    grBSseq <- granges(BSseq)
145
+    mm <- as.matrix(findOverlaps(regions, grBSseq))
146
+    mmsplit <- split(mm[,2], mm[,1])
147
+    if(what == "perBase") {
148
+        if(type == "Cov") {
149
+            out <- lapply(mmsplit, function(xx) {
150
+                getBSseq(BSseq, "Cov")[xx,,drop = FALSE]
151
+            })
152
+        }
153
+        if(type == "M") {
154
+            out <- lapply(mmsplit, function(xx) {
155
+                getBSseq(BSseq, "M")[xx,,drop = FALSE]
156
+            })
157
+        }
158
+        outList <- vector("list", length(regions))
159
+        outList[as.integer(names(mmsplit))] <- out
160
+        return(outList)
161
+    }
162
+    if(what == "perRegionAverage") {
163
+        if(type == "Cov") {
164
+            out <- lapply(mmsplit, function(xx) {
165
+                colMeans(getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE)
166
+            })
167
+        }
168
+        if(type == "M") {
169
+            out <- lapply(mmsplit, function(xx) {
170
+                colMeans(getBSseq(BSseq, "M")[xx,,drop = FALSE], na.rm = TRUE)
171
+            })
172
+        }
173
+    }
174
+    if(what == "perRegionTotal") {
175
+        if(type == "Cov") {
176
+            out <- lapply(mmsplit, function(xx) {
177
+                colSums(getBSseq(BSseq, "Cov")[xx,,drop = FALSE], na.rm = TRUE)
178
+            })
179
+        }
180
+        if(type == "M") {
181
+            out <- lapply(mmsplit, function(xx) {
182
+                colSums(getBSseq(BSseq, "M")[xx,,drop = FALSE], na.rm = TRUE)
183
+            })
184
+        }
185
+    }
186
+    out <- do.call(rbind, out)
187
+    outMatrix <- matrix(NA, ncol = ncol(BSseq), nrow = length(regions))
188
+    colnames(outMatrix) <- sampleNames(BSseq)
189
+    outMatrix[as.integer(rownames(out)),] <- out
190
+    return(outMatrix)
191
+}
192
+
193
+getStats <- function(BSseqTstat, regions = NULL, column = c("tstat.corrected", "tstat")) {
194
+    stopifnot(is(BSseqTstat, "BSseqTstat"))
195
+    column <- match.arg(column)
196
+    if(class(regions) == "data.frame")
197
+        regions <- data.frame2GRanges(regions)
198
+    if(is.null(regions))
199
+        return(BSseqTstat@stats)
200
+    stopifnot(is(regions, "GenomicRanges"))
201
+    ov <- findOverlaps(BSseqTstat, regions)
202
+    ov.sp <- split(queryHits(ov), subjectHits(ov))
203
+    getRegionStats <- function(idx) {
204
+        mat <- BSseqTstat@stats[idx,, drop=FALSE]
205
+        meanDiff <- mean(mat[, "group1.means"] - mat[, "group2.means"])
206
+        group1.mean <- mean(mat[, "group1.means"])
207
+        group2.mean <- mean(mat[, "group2.means"])
208
+        tstat.sd <- mean(mat[, "tstat.sd"])
209
+        areaStat <- sum(mat[, column])
210
+        maxStat <- max(mat[, column])
211
+        c(meanDiff, group1.mean, group2.mean, tstat.sd, areaStat, maxStat)
212
+    }
213
+    stats <- lapply(ov.sp, getRegionStats)
214
+    out <- matrix(NA, ncol = 6, nrow = length(regions))
215
+    out[as.integer(names(stats)),] <- do.call(rbind, stats)
216
+    colnames(out) <- c("meanDiff", "group1.mean", "group2.mean", "tstat.sd", "areaStat", "maxStat")
217
+    out <- as.data.frame(out)
218
+    out
219
+}
220
+
221
+
222
+summary.BSseqTstat <- function(object, ...) {
223
+    quant <- quantile(getStats(object)[, "tstat.corrected"],
224
+                      prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999))
225
+    quant <- t(t(quant))
226
+    colnames(quant) <- "quantiles"
227
+    out <- list(quantiles = quant)
228
+    class(out) <- "summary.BSseqTstat"
229
+    out
230
+}
231
+
232
+print.summary.BSseqTstat <- function(x, ...) {
233
+    print(as.matrix(x$quantiles))
234
+}
235
+
236
+plot.BSseqTstat <- function(x, y, ...) {
237
+    tstat <- getStats(x)[, "tstat"]
238
+    tstat.cor <- getStats(x)[, "tstat.corrected"]
239
+    plot(density(tstat), xlim = c(-10,10), col = "blue", main = "")
240
+    lines(density(tstat.cor), col = "black")
241
+    legend("topleft", legend = c("uncorrected", "corrected"), lty = c(1,1),
242
+           col = c("blue", "black"))
243
+}
0 244
new file mode 100644
... ...
@@ -0,0 +1,79 @@
1
+dmrFinder <- function(BSseqTstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
2
+                             maxGap = 300, column = c("tstat.corrected", "tstat"), verbose = TRUE) {
3
+    column <- match.arg(column)
4
+    if(! column %in% colnames(BSseqTstat@stats))
5
+        stop("'column' needs to be a column of 'BSseqTstat@stats'")
6
+    if(is.null(cutoff))
7
+        cutoff <- quantile(BSseqTstat@stats[, column], qcutoff)
8
+    direction <- as.integer(BSseqTstat@stats[, column] >= cutoff[2])
9
+    direction[BSseqTstat@stats[, column] <= cutoff[1]] <- -1L
10
+    direction[is.na(direction)] <- 0L
11
+    chrs <- as.character(seqnames(BSseqTstat@gr))
12
+    positions <- start(BSseqTstat)
13
+    regions <- bsseq:::regionFinder3(direction, chr = chrs, pos = positions, maxGap = maxGap, verbose = verbose)
14
+    if(verbose) cat("creating dmr data.frame\n")
15
+    regions <- do.call(rbind, regions)
16
+    rownames(regions) <- NULL
17
+    stats <- getStats(BSseqTstat, regions, column = column)
18
+    regions <- cbind(regions, stats)
19
+    regions$direction <- ifelse(regions$meanDiff > 0, "hyper", "hypo")
20
+    regions$width <- regions$end - regions$start + 1
21
+    regions$invdensity <- regions$width / regions$n
22
+    regions <- regions[order(abs(regions$areaStat), decreasing = TRUE),]
23
+    regions
24
+}
25
+
26
+clusterMaker <- function(chr,pos,order.it=TRUE,maxGap=300){
27
+  nonaIndex=which(!is.na(chr) & !is.na(pos))
28
+  Indexes=split(nonaIndex,chr[nonaIndex])
29
+  clusterIDs=rep(NA,length(chr))
30
+  LAST=0
31
+  for(i in seq(along=Indexes)){
32
+    Index=Indexes[[i]]
33
+    x=pos[Index]
34
+    if(order.it){ Index=Index[order(x)];x=pos[Index] }
35
+    y=as.numeric(diff(x)>maxGap)
36
+    z=cumsum(c(1,y))
37
+    clusterIDs[Index]=z+LAST
38
+    LAST=max(z)+LAST
39
+  }
40
+  clusterIDs
41
+}  
42
+
43
+
44
+regionFinder3 <- function(x, chr, positions, keep, maxGap = 300, verbose = TRUE) {
45
+    getSegments3 <- function(x, cid, verbose = TRUE){
46
+        if(verbose) cat("segmenting\n")
47
+        segments <- cumsum(c(1, diff(x) != 0)) +
48
+            cumsum(c(1, diff(cid) != 0))
49
+        names(segments) <- NULL
50
+        if(verbose) cat("splitting\n")
51
+        out <- list(up = split(which(x > 0), segments[x > 0]),
52
+                    down = split(which(x < 0), segments[x < 0]),
53
+                    nochange = split(which(x == 0), segments[x == 0]))
54
+        names(out[[1]]) <- NULL
55
+        names(out[[2]]) <- NULL
56
+        names(out[[3]]) <- NULL
57
+        out
58
+    }
59
+    cid0 <- clusterMaker(chr, positions, maxGap = maxGap)
60
+    segments <- getSegments3(x = x, cid = cid0, verbose = verbose)
61
+    out <- vector("list", 2)
62
+    names(out) <- c("up", "down")
63
+    for(ii in 1:2) {
64
+        idx <- segments[[ii]]
65
+        if(length(idx) == 0) {
66
+            out[[ii]] <- NULL
67
+        } else {
68
+            out[[ii]] <- data.frame(chr = sapply(idx, function(jj) chr[jj[1]]),
69
+                                    start = sapply(idx, function(jj) min(positions[jj])),
70
+                                    end = sapply(idx, function(jj) max(positions[jj])),
71
+                                    idxStart = sapply(idx, function(jj) min(jj)),
72
+                                    idxEnd = sapply(idx, function(jj) max(jj)),
73
+                                    cluster = sapply(idx, function(jj) cid0[jj[1]]),
74
+                                    n = sapply(idx, length))
75
+        }
76
+    }
77
+    out
78
+}
79
+
0 80
new file mode 100644
... ...
@@ -0,0 +1,65 @@
1
+fisherTests <- function(BSseq, group1, group2, lookup = NULL,
2
+                        returnLookup = TRUE, mc.cores = 1,
3
+                        verbose = TRUE) {
4
+    stopifnot(is(BSseq, "BSseq"))
5
+    if(is.integer(group1)) {
6
+        stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq))
7
+        group1 <- sampleNames(BSseq)[group1]
8
+    }
9
+    if(is.integer(group2)) {
10
+        stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq))
11
+        group2 <- sampleNames(BSseq)[group2]
12
+    }
13
+    stopifnot(length(intersect(group1, group2)) == 0)
14
+    sampleNames <- c(group1, group2)
15
+    if(is.null(lookup)) {
16
+        oldkeys <- ""
17
+    } else {
18
+        stopifnot(is.matrix(lookup) && ncol(lookup) == 2 &&
19
+                  setequal(colnames(lookup), c("p.value", "log2OR")))
20
+        oldkeys <- rownames(lookup)
21
+    }
22
+    if(verbose) cat(" setup ... ")
23
+    stime <- system.time({
24
+        M1 <- rowSums(getCoverage(BSseq, type = "M")[, group1, drop = FALSE])
25
+        M2 <- rowSums(getCoverage(BSseq, type = "M")[, group2, drop = FALSE])
26
+        Cov1 <- rowSums(getCoverage(BSseq, type = "Cov")[, group1, drop = FALSE])
27
+        Cov2 <- rowSums(getCoverage(BSseq, type = "Cov")[, group2, drop = FALSE])
28
+        keys <- paste(Cov1, Cov2, M1, M2, sep = "_")
29
+        newkeys <- setdiff(keys, oldkeys)
30
+        expand <- matrix(as.integer(do.call(rbind, strsplit(newkeys, "_"))), ncol = 4)
31
+        colnames(expand) <- c("Cov1", "Cov2", "M1", "M2")
32
+        expand2 <- expand
33
+        colnames(expand2) <- c("U1", "U2", "M1", "M2")
34
+        expand2[,"U1"] <- expand[,"Cov1"] - expand[,"M1"]
35
+        expand2[,"U2"] <- expand[,"Cov2"] - expand[,"M2"]
36
+    })[3]
37
+    if(verbose) cat(round(stime,1), "secs\n")
38
+    if(verbose) cat(sprintf(" performing %d tests (using %d cores) ... ",
39
+                            nrow(expand2), mc.cores))
40
+    stime <- system.time({
41
+        tests <- mclapply(1:nrow(expand2), function(ii) {
42
+            unlist(fisher.test(matrix(expand2[ii,], ncol = 2))[c("p.value", "estimate")])
43
+        }, mc.cores = mc.cores)
44
+    })[3]
45
+    if(verbose) cat(round(stime,1), "secs\n")
46
+    if(verbose) cat(" finalizing ... ")
47
+    stime <- system.time({
48
+        tests <- do.call(rbind, tests)
49
+        tests[,2] <- log2(tests[,2])
50
+        colnames(tests) <- c("p.value", "log2OR")
51
+        rownames(tests) <- newkeys
52
+        lookup <- rbind(tests, lookup)
53
+        results <- lookup[keys,]
54
+        rownames(results) <- NULL
55
+            
56
+        
57
+    })[3]
58
+    if(verbose) cat(round(stime,1), "secs\n")
59
+    if(returnLookup)
60
+        return(list(results = results, lookup = lookup))
61
+    else
62
+        return(results)
63
+}
64
+
65
+
0 66
new file mode 100644
... ...
@@ -0,0 +1,78 @@
1
+findOverlaps_mclapply <- function (query, subject, maxgap = 0L, minoverlap = 1L, 
2
+                                   type = c("any", "start", "end", "within", "equal"),
3
+                                   select = c("all", "first"), ignore.strand = FALSE,
4
+                                   mc.cores = 1, mc.preschedule = TRUE, verbose = FALSE, ...) {
5
+    if(!is(query, "GenomicRanges") || !is(subject, "GenomicRanges"))
6
+        stop("findOverlaps_mclapply needs 'query' and 'subject' to be 'GenomicRanges'")
7
+    if (!IRanges:::isSingleNumber(maxgap) || maxgap < 0) 
8
+        stop("'maxgap' must be a non-negative integer")
9
+    type <- match.arg(type)
10
+    select <- match.arg(select)
11
+    seqinfo <- merge(seqinfo(query), seqinfo(subject))
12
+    DIM <- c(length(query), length(subject))
13
+    if (min(DIM) == 0L) {
14
+        matchMatrix <- matrix(integer(0), nrow = 0L, ncol = 2L, 
15
+                              dimnames = list(NULL, c("queryHits", "subjectHits")))
16
+    }
17
+    else {
18
+        querySeqnames <- seqnames(query)
19
+        querySplitRanges <- splitRanges(querySeqnames)
20
+        uniqueQuerySeqnames <- names(querySplitRanges)[sapply(querySplitRanges, length) > 0] # FIX: only keep seqnames with ranges
21
+        subjectSeqnames <- seqnames(subject)
22
+        subjectSplitRanges <- splitRanges(subjectSeqnames)
23
+        uniqueSubjectSeqnames <- names(subjectSplitRanges)[sapply(subjectSplitRanges, length) > 0] # FIX: only keep seqnames with ranges
24
+        commonSeqnames <- intersect(uniqueQuerySeqnames, 
25
+                                    uniqueSubjectSeqnames)
26
+        if (ignore.strand) {
27
+            queryStrand <- rep.int(1L, length(query))
28
+            subjectStrand <- rep.int(1L, length(subject))
29
+        }
30
+        else {
31
+            queryStrand <- strand(query)
32
+            levels(queryStrand) <- c("1", "-1", "0")
33
+            queryStrand@values <- as.integer(as.character(runValue(queryStrand)))
34
+            queryStrand <- as.vector(queryStrand)
35
+            subjectStrand <- strand(subject)
36
+            levels(subjectStrand) <- c("1", "-1", "0")
37
+            subjectStrand@values <- as.integer(as.character(runValue(subjectStrand)))
38
+            subjectStrand <- as.vector(subjectStrand)
39
+        }
40
+        queryRanges <- unname(ranges(query))
41
+        subjectRanges <- unname(ranges(subject))
42
+        matchMatrix <- do.call(rbind, mclapply(commonSeqnames, 
43
+                                               function(seqnm) {
44
+                                                   if(verbose) cat(seqnm, "\n") # FIX : added verbosity
45
+                                                   if (isCircular(seqinfo)[seqnm] %in% TRUE) 
46
+                                                       circle.length <- seqlengths(seqinfo)[seqnm]
47
+                                                   else circle.length <- NA
48
+                                                   qIdxs <- querySplitRanges[[seqnm]]
49
+                                                   sIdxs <- subjectSplitRanges[[seqnm]]
50
+                                                   ## FIX: added ::: tpo get .findOverlaps.circle
51
+                                                   overlaps <- GenomicRanges:::.findOverlaps.circle(circle.length, 
52
+                                                                                                    seqselect(queryRanges, qIdxs), seqselect(subjectRanges, 
53
+                                                                                                                                             sIdxs), maxgap, minoverlap, type)
54
+                                                   qHits <- queryHits(overlaps)
55
+                                                   sHits <- subjectHits(overlaps)
56
+                                                   matches <- cbind(queryHits = as.integer(qIdxs)[qHits], 
57
+                                                                    subjectHits = as.integer(sIdxs)[sHits])
58
+                                                   matches[which(seqselect(queryStrand, qIdxs)[qHits] * 
59
+                                                                 seqselect(subjectStrand, sIdxs)[sHits] != 
60
+                                                                 -1L), , drop = FALSE]
61
+                                               }, mc.cores = mc.cores, mc.preschedule = mc.preschedule))
62
+        if (is.null(matchMatrix)) {
63
+                matchMatrix <- matrix(integer(0), nrow = 0L, 
64
+                                      ncol = 2L, dimnames = list(NULL, c("queryHits", 
65
+                                                 "subjectHits")))
66
+            }
67
+        matchMatrix <- matchMatrix[IRanges:::orderIntegerPairs(matchMatrix[, 
68
+                                                                           1L], matchMatrix[, 2L]), , drop = FALSE]
69
+    }
70
+    if (select == "all") {
71
+        new("Hits", queryHits = unname(matchMatrix[, 1L]), 
72
+            subjectHits = unname(matchMatrix[, 2L]), queryLength = DIM[1], 
73
+            subjectLength = DIM[2])
74
+    }
75
+    else {
76
+        IRanges:::.hitsMatrixToVector(matchMatrix, length(query))
77
+    }
78
+}
0 79
new file mode 100644
... ...
@@ -0,0 +1,71 @@
1
+poissonGoodnessOfFit <- function(BSseq, nQuantiles = 10^5) {
2
+    Cov <- getBSseq(BSseq, type = "Cov")
3
+    lambda <- rowMeans(Cov)
4
+    out <- list(chisq = NULL, df = ncol(Cov) - 1)
5
+    out$chisq <- rowSums((Cov - lambda)^2 / lambda)
6
+    if(nQuantiles > length(out$chisq))
7
+        nQuantiles <- length(out$chisq)
8
+    probs <- seq(from = 0, to = 1, length.out = nQuantiles)
9
+    out$quantiles <- quantile(out$chisq, prob = probs, na.rm = TRUE)
10
+    class(out) <- "chisqGoodnessOfFit"
11
+    return(out)
12
+}
13
+
14
+print.chisqGoodnessOfFit <- function(x, ...) {
15
+    cat("Chisq Goodness of Fit object\n")
16
+    cat(" ", length(x$chisq), "tests\n")
17
+    cat(" ", x$df, "degrees of freedom\n")
18
+}
19
+
20
+binomialGoodnessOfFit <- function(BSseq, method = c("MLE"), nQuantiles = 10^5) {
21
+    method <- match.arg(method)
22
+    M <- getCoverage(BSseq, type = "M", what = "perBase")
23
+    Cov <- getCoverage(BSseq, type = "Cov", what = "perBase")
24
+    if(method == "MLE") # This is the MLE under iid
25
+        p <- rowSums(M) / rowSums(Cov) 
26
+    ## p <- rowMeans(M / Cov, na.rm = TRUE)
27
+    out <- list(chisq = NULL, quantiles = NULL, df = ncol(Cov) - 1)
28
+    ## This gets tricky.  For the binomial model, we need Cov > 0, but
29
+    ## this implies that a given location may have less then nCol(BSseq)
30
+    ## number of observations, and hence the degrees of freedom will
31
+    ## vary.  
32
+    out$chisq <- rowSums((M - Cov*p)^2 / sqrt(Cov * p * (1-p)))
33
+    probs <- seq(from = 0, to = 1, length.out = nQuantiles)
34
+    ## Next line will remove all locations where not all samples have coverage
35
+    out$quantiles <- quantile(out$chisq, prob = probs, na.rm = TRUE)
36
+    class(out) <- "chisqGoodnessOfFit"
37
+    return(out)
38
+}
39
+
40
+plot.chisqGoodnessOfFit <- function(x, type = c("chisq", "pvalue"),
41
+                                    plotCol = TRUE, qqline = TRUE, pch = 16, cex = 0.75, ...) {
42
+    type <- match.arg(type)
43
+    switch(type,
44
+           "chisq" = {
45
+               yy <- x$quantiles
46
+               xx <- qchisq(ppoints(yy), df = x$df)
47
+           },
48
+           "pvalue" = {
49
+               yy <- sort(1 - qchisq(x$quantiles, df = x$df))
50
+               xx <- qunif(ppoints(yy), min = 0, max = 1)
51
+           })
52
+    if(plotCol) {
53
+        colors <- rep("black", length(yy))
54
+        colors[floor(length(colors)*.95):length(colors)] <- "red"
55
+        colors[floor(length(colors)*.99):length(colors)] <- "violet"
56
+        colors[floor(length(colors)*.999):length(colors)] <- "orange"
57
+    } else {
58
+        colors <- "black"
59
+    }
60
+    qqplot(xx, yy, xlab = "Theoretical quantiles",
61
+           ylab = "Observed quantiles", col = colors, pch = pch, cex = cex, ...)
62
+    if(qqline) {
63
+        yyqt <- quantile(yy, c(0.25, 0.75))
64
+        xxqt <- quantile(xx, c(0.25, 0.75))
65
+        slope <- diff(yyqt) / diff(xxqt)
66
+        int <- yyqt[1L] - slope * xxqt[1L]
67
+        abline(int, slope, col = "blue")
68
+    }
69
+    abline(0, 1, col = "blue", lty = 2)
70
+}
71
+
0 72
new file mode 100644
... ...
@@ -0,0 +1,161 @@
1
+setClassUnion("matrixOrNULL", c("matrix", "NULL"))
2
+
3
+setClass("hasGRanges",
4
+         representation(gr = "GRanges"))
5
+
6
+setMethod("seqnames", signature(x = "hasGRanges"), function(x) {
7
+    seqnames(x@gr)
8
+})
9
+setReplaceMethod("seqnames", "hasGRanges", function(x, value) {
10
+    gr <- granges(x)
11
+    seqnames(gr) <- value
12
+    x@gr <- gr
13
+    x
14
+})
15
+
16
+setMethod("seqlevels", signature(x = "hasGRanges"), function(x) {
17
+    seqlevels(x@gr)
18
+})
19
+setReplaceMethod("seqlevels", "hasGRanges", function(x, value) {
20
+    gr <- granges(x)
21
+    seqlevels(gr) <- value
22
+    x@gr <- gr
23
+    x
24
+})
25
+
26
+setMethod("seqlengths", signature(x = "hasGRanges"), function(x) {
27
+    seqlengths(x@gr)
28
+})