Browse code

merge conflicts

From: Kasper Daniel Hansen <kasperdanielhansen@gmail.com>

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/bsseq@114581 bc3139a8-67e5-0310-9ffc-ced21a209358

khansen authored on 10/03/2016 02:44:17
Showing 10 changed files

... ...
@@ -1,5 +1,5 @@
1 1
 Package: bsseq
2
-Version: 1.7.2
2
+Version: 1.7.6
3 3
 Title: Analyze, manage and store bisulfite sequencing data
4 4
 Description: A collection of tools for analyzing and visualizing bisulfite
5 5
     sequencing data.
... ...
@@ -26,7 +26,8 @@ Imports:
26 26
     data.table,
27 27
     S4Vectors,
28 28
     R.utils (>= 2.0.0),
29
-    matrixStats
29
+    matrixStats,
30
+    permute
30 31
 Suggests:
31 32
     RUnit,
32 33
     bsseqData,
... ...
@@ -15,7 +15,7 @@ importFrom(graphics, "abline", "axis", "layout", "legend", "lines",
15 15
            "mtext", "par", "plot", "points", "rect", "rug")
16 16
 import(parallel)
17 17
 importFrom(locfit, "locfit", "lp")
18
-importFrom(matrixStats, "rowSds", "rowVars")
18
+importFrom(matrixStats, "rowSds", "rowVars", "rowMaxs", "rowMins")
19 19
 import(IRanges)
20 20
 import(GenomicRanges)
21 21
 import(SummarizedExperiment)
... ...
@@ -32,6 +32,7 @@ importFrom(gtools, "combinations")
32 32
 importFrom(data.table, "fread", "setnames")
33 33
 importFrom(R.utils, "isGzipped", "gunzip")
34 34
 import(limma)
35
+importFrom(permute, "shuffleSet", "how")
35 36
 
36 37
 ##
37 38
 ## Exporting
... ...
@@ -63,7 +63,7 @@ smoothSds <- function(BSseqStat, k = 101, qSd = 0.75, mc.cores = 1,
63 63
                              smoothSd(getStats(BSseqStat, what = "rawSds")[idx], k = k, qSd = qSd)
64 64
                          }, mc.cores = mc.cores))
65 65
     if("smoothSds" %in% names(getStats(BSseqStat)))
66
-        BSseqStat@stats[["smoothSds"]] <- stat
66
+        BSseqStat@stats[["smoothSds"]] <- smoothSds
67 67
     else
68 68
         BSseqStat@stats <- c(getStats(BSseqStat), list(smoothSds = smoothSds))
69 69
     BSseqStat
... ...
@@ -73,7 +73,7 @@ smoothSds <- function(BSseqStat, k = 101, qSd = 0.75, mc.cores = 1,
73 73
 computeStat <- function(BSseqStat, coef = NULL) {
74 74
     stopifnot(is(BSseqStat, "BSseqStat"))
75 75
     if(is.null(coef)) {
76
-        coef <- 1:ncol(BSseqStat$rawTstats)
76
+        coef <- 1:ncol(getStats(BSseqStat, what = "rawTstats"))
77 77
     }
78 78
     tstats <- getStats(BSseqStat, what = "rawTstats")[, coef, drop = FALSE]
79 79
     tstats <- tstats * getStats(BSseqStat, what = "rawSds") /
... ...
@@ -136,3 +136,31 @@ localCorrectStat <- function(BSseqStat, threshold = c(-15,15), mc.cores = 1, ver
136 136
     BSseqTstat
137 137
 }
138 138
 
139
+fstat.pipeline <- function(BSseq, design, contrasts, cutoff, fac, nperm = 1000,
140
+                           coef = NULL, maxGap.sd = 10 ^ 8, maxGap.dmr = 300,
141
+                           mc.cores = 1) {
142
+    bstat <- BSmooth.fstat(BSseq = BSseq, design = design,
143
+                           contrasts = contrasts,
144
+                           returnModelCoefficients = TRUE)
145
+    bstat <- smoothSds(bstat)
146
+    bstat <- computeStat(bstat, coef = coef)
147
+    dmrs <- dmrFinder(bstat, cutoff = cutoff)
148
+    idxMatrix <- permuteAll(nperm, design)
149
+    nullDist <- getNullDistribution_BSmooth.fstat(BSseq = BSseq,
150
+                                                  idxMatrix = idxMatrix,
151
+                                                  design = design,
152
+                                                  contrasts = contrasts,
153
+                                                  coef = coef,
154
+                                                  cutoff = cutoff,
155
+                                                  maxGap.sd = maxGap.sd,
156
+                                                  maxGap.dmr = maxGap.dmr,
157
+                                                  mc.cores = mc.cores)
158
+    fwer <- getFWER.fstat(null = c(list(dmrs), nullDist), type = "dmrs")
159
+    dmrs$fwer <- fwer
160
+    meth <- getMeth(BSseq, dmrs, what = "perRegion")
161
+    meth <- t(apply(meth, 1, function(xx) tapply(xx, fac, mean)))
162
+    dmrs <- cbind(dmrs, meth)
163
+    dmrs$maxDiff <- rowMaxs(meth) - rowMins(meth)
164
+    list(bstat = bstat, dmrs = dmrs, idxMatrix = idxMatrix, nullDist = nullDist)
165
+}
166
+
... ...
@@ -39,8 +39,10 @@ setMethod("[", "BSseqStat", function(x, i, ...) {
39 39
     if(missing(i))
40 40
         return(x)
41 41
     x@gr <- x@gr[i]
42
-    x@stats <- lapply(names(x@stats), function(nam) {
43
-        if(nam %in% c("rawTstats")) {
42
+    statnames <- names(x@stats)
43
+    names(statnames) <- statnames
44
+    x@stats <- lapply(statnames, function(nam) {
45
+        if(nam %in% c("rawTstats", "modelCoefficients")) {
44 46
             stopifnot(is.matrix(x@stats[[nam]]))
45 47
             return(x@stats[[nam]][i,,drop=FALSE])
46 48
         }
... ...
@@ -79,7 +79,7 @@ combineList <- function(x, ...) {
79 79
         Cov <- do.call(cbind, lapply(x, function(xx) getBSseq(xx, "Cov")))
80 80
     } else {
81 81
         gr <- sort(reduce(do.call(c, unname(lapply(x, granges))), min.gapwidth = 0L))
82
-        sampleNames <- do.call(c, lapply(x, sampleNames))
82
+        sampleNames <- do.call(c, unname(lapply(x, sampleNames)))
83 83
         M <- matrix(0, ncol = length(sampleNames), nrow = length(gr))
84 84
         colnames(M) <- sampleNames
85 85
         Cov <- M
... ...
@@ -121,7 +121,7 @@ setMethod("findOverlaps",
121 121
           signature(query = "hasGRanges", subject = "GenomicRanges"),
122 122
           function (query, subject, maxgap = 0L, minoverlap = 1L,
123 123
                     type = c("any", "start", "end", "within", "equal"),
124
-                    select = c("all", "first"),
124
+                    select = c("all", "first", "last", "arbitrary"),
125 125
                     ignore.strand = FALSE, ...) {
126 126
               findOverlaps(query = granges(query), subject = subject,
127 127
                            maxgap = maxgap, minoverlap = minoverlap,
... ...
@@ -133,7 +133,7 @@ setMethod("findOverlaps",
133 133
           signature(query = "hasGRanges", subject = "hasGRanges"),
134 134
           function (query, subject, maxgap = 0L, minoverlap = 1L,
135 135
                     type = c("any", "start", "end", "within", "equal"),
136
-                    select = c("all", "first"),
136
+                    select = c("all", "first", "last", "arbitrary"),
137 137
                     ignore.strand = FALSE, ...) {
138 138
               findOverlaps(query = granges(query), subject = granges(subject),
139 139
                            maxgap = maxgap, minoverlap = minoverlap,
... ...
@@ -145,7 +145,7 @@ setMethod("findOverlaps",
145 145
           signature(query = "GenomicRanges", subject = "hasGRanges"),
146 146
           function (query, subject, maxgap = 0L, minoverlap = 1L,
147 147
                     type = c("any", "start", "end", "within", "equal"),
148
-                    select = c("all", "first"),
148
+                    select = c("all", "first", "last", "arbitrary"),
149 149
                     ignore.strand = FALSE, ...) {
150 150
               findOverlaps(query = query, subject = granges(subject),
151 151
                            maxgap = maxgap, minoverlap = minoverlap,
... ...
@@ -19,12 +19,22 @@ getNullDistribution_BSmooth.tstat <- function(BSseq, idxMatrix1, idxMatrix2,
19 19
     nullDist
20 20
 }
21 21
 
22
-getNullDistribution_BSmooth.fstat <- function(BSseq, idxMatrix, design, contrasts,
23
-                                              coef = NULL, cutoff, maxGap.sd, maxGap.dmr, mc.cores = 1) {
24
-    message(sprintf("[getNullDistribution_BSmooth.fstat] performing %d permutations\n", nrow(idxMatrix)))
25
-    nullDist <- mclapply(1:nrow(idxMatrix), function(ii) {
22
+getNullDistribution_BSmooth.fstat <- function(BSseq,
23
+                                              idxMatrix,
24
+                                              design, contrasts,
25
+                                              coef = NULL, cutoff,
26
+                                              maxGap.sd, maxGap.dmr,
27
+                                              mc.cores = 1) {
28
+
29
+    message(sprintf("[getNullDistribution_BSmooth.fstat] performing %d permutations\n",
30
+                    nrow(idxMatrix)))
31
+    # NOTE: Using mc.preschedule = TRUE
32
+    nullDist <- mclapply(seq_len(nrow(idxMatrix)), function(ii) {
26 33
         ptime1 <- proc.time()
27
-        bstat <- BSmooth.fstat(BSseq[, idxMatrix[ii,]], design = design, contrasts = contrasts)
34
+        # NOTE: More efficient to permute design matrix using idxMatrix[ii, ] than
35
+        #       permute the raw data with BSseq[, idxMatrix[ii, ]]
36
+        bstat <- BSmooth.fstat(BSseq, design = design[idxMatrix[ii, ], ],
37
+                               contrasts = contrasts)
28 38
         bstat <- smoothSds(bstat, maxGap = maxGap.sd)
29 39
         bstat <- computeStat(bstat, coef = coef)
30 40
         dmrs0 <- dmrFinder(bstat, cutoff = cutoff, maxGap = maxGap.dmr)
... ...
@@ -32,10 +42,18 @@ getNullDistribution_BSmooth.fstat <- function(BSseq, idxMatrix, design, contrast
32 42
         stime <- (ptime2 - ptime1)[3]
33 43
         message(sprintf("[getNullDistribution_BSmooth.fstat] completing permutation %d in %.1f sec\n", ii, stime))
34 44
         dmrs0
35
-    }, mc.cores = min(nrow(idxMatrix), mc.cores), mc.preschedule = FALSE)
45
+    }, mc.cores = min(nrow(idxMatrix), mc.cores), mc.preschedule = TRUE)
36 46
     nullDist
37 47
 }
38 48
 
49
+permuteAll <- function(nperm, design) {
50
+    message(sprintf("[permuteAll] performing %d unrestricted permutations of the design matrix\n", nperm))
51
+
52
+    CTRL <- how(nperm = nperm)
53
+    # NOTE: shuffleSet() returns a nperm * nrow(design) matrix of permutations
54
+    idxMatrix <- shuffleSet(n = nrow(design), control = CTRL)
55
+}
56
+
39 57
 getNullBlocks_BSmooth.tstat <- function(BSseq, idxMatrix1, idxMatrix2, estimate.var = "same",
40 58
                           mc.cores = 1) {
41 59
     getNullDistribution_BSmooth.tstat(BSseq = BSseq, idxMatrix1 = idxMatrix1,
... ...
@@ -54,9 +72,6 @@ getNullDmrs_BSmooth.tstat <- function(BSseq, idxMatrix1, idxMatrix2, estimate.va
54 72
                        mc.cores = mc.cores)
55 73
 }
56 74
 
57
-
58
-
59
-
60 75
 subsetDmrs <- function(xx) {
61 76
     if(is.null(xx) || is(xx, "try-error"))
62 77
         return(NULL)
... ...
@@ -81,26 +96,66 @@ getFWER <- function(null, type = "blocks") {
81 96
     null <- null[-1]
82 97
     null <- null[!sapply(null, is.null)]
83 98
     better <- sapply(1:nrow(reference), function(ii) {
84
-        meanDiff <- abs(reference$meanDiff[ii])
99
+        # meanDiff <- abs(reference$meanDiff[ii])
100
+        areaStat <- abs(reference$areaStat[ii])
101
+        width <- reference$width[ii]
102
+        n <- reference$n[ii]
103
+        if (type == "blocks") {
104
+            out <- sapply(null, function(nulldist) {
105
+                # any(abs(nulldist$meanDiff) >= meanDiff &
106
+                        # nulldist$width >= width)
107
+                any(abs(nulldist$areaStat) >= areaStat &
108
+                        nulldist$width >= width)
109
+            })
110
+        }
111
+        if (type == "dmrs") {
112
+            out <- sapply(null, function(nulldist) {
113
+                # any(abs(nulldist$meanDiff) >= meanDiff &
114
+                #     nulldist$n >= n)
115
+                any(abs(nulldist$areaStat) >= areaStat &
116
+                        nulldist$n >= n)
117
+            })
118
+        }
119
+        sum(out)
120
+    })
121
+    better
122
+}
123
+
124
+# NOTE: Identical to getFWER() except uses areaStat rather than meanDiff
125
+#       to compare regions.
126
+getFWER.fstat <- function(null, type = "blocks") {
127
+    reference <- null[[1]]
128
+    null <- null[-1]
129
+    null <- null[!sapply(null, is.null)]
130
+    # TODO: Will break if null == list(), which can occur in practice (although
131
+    #       rarely).
132
+    better <- sapply(1:nrow(reference), function(ii) {
133
+        # meanDiff <- abs(reference$meanDiff[ii])
134
+        areaStat <- abs(reference$areaStat[ii])
85 135
         width <- reference$width[ii]
86 136
         n <- reference$n[ii]
87
-        if(type == "blocks") {
137
+        if (type == "blocks") {
88 138
             out <- sapply(null, function(nulldist) {
89
-                any(abs(nulldist$meanDiff) >= meanDiff &
139
+                # any(abs(nulldist$meanDiff) >= meanDiff &
140
+                # nulldist$width >= width)
141
+                any(abs(nulldist$areaStat) >= areaStat &
90 142
                         nulldist$width >= width)
91 143
             })
92 144
         }
93
-        if(type == "dmrs") {
145
+        if (type == "dmrs") {
94 146
             out <- sapply(null, function(nulldist) {
95
-                any(abs(nulldist$meanDiff) >= meanDiff &
96
-                    nulldist$n >= n)
147
+                # any(abs(nulldist$meanDiff) >= meanDiff &
148
+                #     nulldist$n >= n)
149
+                any(abs(nulldist$areaStat) >= areaStat &
150
+                        nulldist$n >= n)
97 151
             })
98 152
         }
99 153
         sum(out)
100 154
     })
101 155
     better
102 156
 }
103
-       
157
+
158
+# TODO: Simplify makeIdxMatrix() by using permute package
104 159
 makeIdxMatrix <- function(group1, group2, testIsSymmetric = TRUE, includeUnbalanced = TRUE) {
105 160
     groupBoth <- c(group1, group2)
106 161
     idxMatrix1 <- NULL
... ...
@@ -15,7 +15,7 @@ plotAnnoTrack <- function(gr, annoTrack) {
15 15
 
16 16
 plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addRegions = NULL,
17 17
                             annoTrack = NULL, col = NULL, lty = NULL, lwd = NULL, 
18
-                            BSseqTstat = NULL, stat = "tstat.corrected", stat.col = "black",
18
+                            BSseqStat = NULL, stat = "tstat.corrected", stat.col = "black",
19 19
                             stat.lwd = 1, stat.lty = 1, stat.ylim = c(-8,8),
20 20
                             mainWithWidth = TRUE, regionCol = alpha("red", 0.1), addTicks = TRUE,
21 21
                             addPoints = FALSE, pointsMinCov = 5, highlightMain = FALSE, verbose = TRUE) {
... ...
@@ -32,8 +32,8 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg
32 32
     }
33 33
     gr <- resize(gr, width = 2*extend + width(gr), fix = "center")
34 34
     BSseq <- subsetByOverlaps(BSseq, gr)
35
-    if(!is.null(BSseqTstat))
36
-        BSseqTstat <- subsetByOverlaps(BSseqTstat, gr)
35
+    if(!is.null(BSseqStat))
36
+        BSseqStat <- subsetByOverlaps(BSseqStat, gr)
37 37
     
38 38
     if(length(start(BSseq)) == 0)
39 39
         stop("No overlap between BSseq data and regions")
... ...
@@ -43,7 +43,7 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg
43 43
     for(ii in seq(along = gr)) {
44 44
         if(verbose) cat(sprintf("[plotManyRegions]   plotting region %d (out of %d)\n", ii, nrow(regions)))
45 45
         plotRegion(BSseq = BSseq, region = regions[ii,], extend = extend,
46
-                   col = col, lty = lty, lwd = lwd, main = main[ii], BSseqTstat = BSseqTstat,
46
+                   col = col, lty = lty, lwd = lwd, main = main[ii], BSseqStat = BSseqStat,
47 47
                    stat = stat, stat.col = stat.col, stat.lwd = stat.lwd,
48 48
                    stat.lty = stat.lty, stat.ylim = stat.ylim,
49 49
                    addRegions = addRegions, regionCol = regionCol, mainWithWidth = mainWithWidth,
... ...
@@ -211,7 +211,7 @@ plotManyRegions <- function(BSseq, regions = NULL, extend = 0, main = "", addReg
211 211
 
212 212
 plotRegion <- function(BSseq, region = NULL, extend = 0, main = "", addRegions = NULL, annoTrack = NULL,
213 213
                        col = NULL, lty = NULL, lwd = NULL,
214
-                       BSseqTstat = NULL, stat = "tstat.corrected", stat.col = "black",
214
+                       BSseqStat = NULL, stat = "tstat.corrected", stat.col = "black",
215 215
                        stat.lwd = 1, stat.lty = 1, stat.ylim = c(-8,8),
216 216
                        mainWithWidth = TRUE,
217 217
                        regionCol = alpha("red", 0.1), addTicks = TRUE, addPoints = FALSE,
... ...
@@ -219,7 +219,7 @@ plotRegion <- function(BSseq, region = NULL, extend = 0, main = "", addRegions =
219 219
     
220 220
     opar <- par(mar = c(0,4.1,0,0), oma = c(5,0,4,2), mfrow = c(1,1))
221 221
     on.exit(par(opar))
222
-    if(is.null(BSseqTstat))
222
+    if(is.null(BSseqStat))
223 223
         layout(matrix(1:2, ncol = 1), heights = c(2,1))
224 224
     else
225 225
         layout(matrix(1:3, ncol = 1), heights = c(2,2,1))
... ...
@@ -230,16 +230,25 @@ plotRegion <- function(BSseq, region = NULL, extend = 0, main = "", addRegions =
230 230
                     pointsMinCov = pointsMinCov, highlightMain = highlightMain)
231 231
     gr <- .bsGetGr(BSseq, region, extend)
232 232
     
233
-    if(!is.null(BSseqTstat)) {
234
-        BSseqTstat <- subsetByOverlaps(BSseqTstat, gr)
233
+    if(!is.null(BSseqStat)) {
234
+        BSseqStat <- subsetByOverlaps(BSseqStat, gr)
235
+        if(is(BSseqStat, "BSseqTstat")) {
236
+            stat.values <- getStats(BSseqStat, what = stat)
237
+            stat.type <- "tstat"
238
+        }
239
+        if(is(BSseqStat, "BSseqStat")) {
240
+            stat.type <- getStats(BSseqStat, what = "stat.type")
241
+            if(stat.type == "tstat")
242
+                stat.values <- getStats(BSseqStat, what = "stat")
243
+            if(stat.type == "fstat")
244
+                stat.values <- sqrt(getStats(BSseqStat, what = "stat"))
245
+        }
235 246
         plot(start(gr), 0.5, type = "n", xaxt = "n", yaxt = "n",
236
-             ylim = stat.ylim, xlim = c(start(gr), end(gr)), xlab = "", ylab = "t-stat")
247
+             ylim = stat.ylim, xlim = c(start(gr), end(gr)), xlab = "", ylab = stat.type)
237 248
         axis(side = 2, at = c(-5,0,5))
238 249
         abline(h = 0, col = "grey60")
239
-        mapply(function(stat, col, lty, lwd) {
240
-            .bsPlotLines(start(BSseqTstat), BSseqTstat@stats[, stat],
241
-                         lty = lty, plotRange = c(start(gr), end(gr)), col = col, lwd = lwd)
242
-        }, stat = stat, col = stat.col, lty = stat.lty, lwd = stat.lwd)
250
+        .bsPlotLines(start(BSseqStat), stat.values, lty = stat.lty, col = stat.col, lwd = stat.lwd,
251
+                     plotRange = c(start(gr), end(gr)))
243 252
     }
244 253
     
245 254
     if(!is.null(annoTrack))
... ...
@@ -13,7 +13,7 @@ read.bismark <- function(files,
13 13
     }
14 14
     fileType <- match.arg(fileType)
15 15
     if (verbose) {
16
-        message(paste0("Assuming file type is", fileType))
16
+        message(paste0("Assuming file type is ", fileType))
17 17
     }
18 18
     ## Process each file
19 19
     idxes <- seq_along(files)
... ...
@@ -119,7 +119,7 @@ read.bismarkCytosineReportRaw <- function(thisfile,
119 119
                   strand = out[[3]])
120 120
 
121 121
     ## Create BSseq instance from 'out'
122
-    BSseq(gr = gr,
122
+    BSseq(gr = gr, sampleNames = thisSampleName,
123 123
           M = as.matrix(out[[4L]]),
124 124
           Cov = as.matrix(out[[4L]] + out[[5L]]))
125 125
 }
... ...
@@ -2,6 +2,14 @@
2 2
 \title{bsseq news}
3 3
 \encoding{UTF-8}
4 4
 
5
+\section{Version 1.7.x}{
6
+  \itemize{
7
+    \item Fixing an error with reading multiple samples aligned with
8
+    bismark in the format "cytosineReport".
9
+  }
10
+}
11
+
12
+
5 13
 \section{Version 1.5.x}{
6 14
   \itemize{
7 15
     \item new function strandCollapse for collapsing forward and reverse