
fixed conflict

From: Kasper Daniel Hansen <kasperdanielhansen@gmail.com>

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/bsseq@110074 bc3139a8-67e5-0310-9ffc-ced21a209358

khansen authored on 29/10/2015 19:12:43
Showing 18 changed files

... ...
@@ -1,2 +1,3 @@
1 1
 ^.*\.Rproj$
2 2
 ^\.Rproj\.user$
3
+.travis.yml
... ...
@@ -2,3 +2,4 @@
2 2
 .Rhistory$
3 3
 .RData$
4 4
 auto/
5
+*~
5 6
new file mode 100644
... ...
@@ -0,0 +1,33 @@
1
+language: r
2
+sudo: required
3
+
4
+# Choose your operating system(s) (default: linux)
5
+os:
6
+  - linux
7
+  - osx
8
+
9
+# Bioconductor-specific stuff. If targeting bioc-devel, you need to use
10
+# "bioc_use_devel: true", otherwise remove or comment-out this line.
11
+# TODO: Should --timings be added to r_check_args to mimic BioC build machines?
12
+bioc_required: true
13
+bioc_use_devel: true
14
+# On CRAN all warnings are treated as errors; this is not true on Bioconductor.
15
+warnings_are_errors: false
16
+
17
+# Need this if also using covr to get code coverage
18
+r_github_packages:
19
+  - jimhester/covr
20
+after_success:
21
+  - Rscript -e 'covr::codecov()'
22
+
23
+# bsseq-specific stuff
24
+r_build_args: "--no-build-vignettes --no-manual"
25
+# TODO: Travis is reporting an error when checking the vignette, hence the
26
+# inclusion of --no-vignettes in r_check_args
27
+r_check_args: "--no-vignettes"
28
+# Must 'manually' install packages in SUGGESTS in order to run full tests
29
+r_binary_packages:
30
+  - RUnit
31
+bioc_packages:
32
+  - bsseqData
33
+  - BiocStyle
... ...
@@ -1,5 +1,5 @@
1 1
 Package: bsseq
2
-Version: 1.7.0
2
+Version: 1.7.1
3 3
 Title: Analyze, manage and store bisulfite sequencing data
4 4
 Description: A collection of tools for analyzing and visualizing bisulfite
5 5
     sequencing data.
... ...
@@ -10,12 +10,13 @@ Depends:
10 10
     R (>= 2.15),
11 11
     methods,
12 12
     BiocGenerics,
13
-    IRanges (>= 2.1.10),
14 13
     GenomicRanges (>= 1.19.6),
15 14
     SummarizedExperiment (>= 0.1.1),
16 15
     parallel,
17
-    GenomeInfoDb
16
+    limma
18 17
 Imports:
18
+    IRanges (>= 2.1.10),
19
+    GenomeInfoDb,
19 20
     scales,
20 21
     stats,
21 22
     graphics,
... ...
@@ -29,7 +30,8 @@ Imports:
29 30
 Suggests:
30 31
     RUnit,
31 32
     bsseqData,
32
-    BiocStyle
33
+    BiocStyle,
34
+    knitr
33 35
 Collate:
34 36
     hasGRanges.R
35 37
     BSseq_class.R
... ...
@@ -47,8 +49,12 @@ Collate:
47 49
     fisher.R
48 50
     permutations.R
49 51
     BSmooth.fstat.R
52
+    BSseqStat_class.R
53
+    getStats.R
50 54
 License: Artistic-2.0
55
+VignetteBuilder: knitr
51 56
 URL: https://github.com/kasperdanielhansen/bsseq
52
-BugReports: https://github.com/kasperdanielhansen/bsseq/issues
53 57
 LazyData: yes
58
+LazyDataCompression: xz
59
+BugReports: https://github.com/kasperdanielhansen/bsseq/issues
54 60
 biocViews: DNAMethylation
... ...
@@ -31,6 +31,7 @@ import(S4Vectors)
31 31
 importFrom(gtools, "combinations")
32 32
 importFrom(data.table, "fread", "setnames")
33 33
 importFrom(R.utils, "isGzipped", "gunzip")
34
+import(limma)
34 35
 
35 36
 ##
36 37
 ## Exporting
... ...
@@ -40,6 +41,7 @@ importFrom(R.utils, "isGzipped", "gunzip")
40 41
 exportClasses("hasGRanges",
41 42
               "BSseq",
42 43
               "BSseqTstat",
44
+              "BSseqStat",
43 45
               "matrixOrNULL")
44 46
 
45 47
 exportMethods("[", "show",
... ...
@@ -64,7 +66,8 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
64 66
        "plotRegion", "plotManyRegions",
65 67
        "read.umtab", "read.umtab2", "read.bsmooth", "read.bismark",
66 68
        "poissonGoodnessOfFit", "binomialGoodnessOfFit",
67
-       "data.frame2GRanges", "BSseqTstat")
69
+       "data.frame2GRanges", "BSseqTstat",
70
+       "BSseqStat", "BSmooth.fstat", "smoothSds", "computeStat")
68 71
 
69 72
 S3method("print", "chisqGoodnessOfFit")
70 73
 S3method("plot", "chisqGoodnessOfFit")
... ...
@@ -1,6 +1,46 @@
1
-BSmooth.fstat <- function(BSseq, group1, group2, estimate.var = c("same", "paired", "group2"),
2
-                          local.correct = TRUE, maxGap = NULL, qSd = 0.75, k = 101, mc.cores = 1, verbose = TRUE){
3
-    smoothSd <- function(Sds, k) {
1
+BSmooth.fstat <- function(BSseq, design, contrasts, returnModelCoefficients = FALSE, verbose = TRUE){
2
+    stopifnot(is(BSseq, "BSseq"))
3
+    stopifnot(hasBeenSmoothed(BSseq))
4
+        
5
+    ## if(any(rowSums(getCoverage(BSseq)[, unlist(groups)]) == 0))
6
+    ##     warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first")
7
+    
8
+    if(verbose) cat("[BSmooth.fstat] fitting linear models ... ")
9
+    ptime1 <- proc.time()
10
+    allPs <- getMeth(BSseq, type = "smooth", what = "perBase",
11
+                     confint = FALSE)
12
+    fit <- lmFit(allPs, design)
13
+    fitC <- contrasts.fit(fit, contrasts)
14
+    ## Need
15
+    ##   fitC$coefficients, fitC$stdev.unscaled, fitC$sigma, fitC$cov.coefficients
16
+    ## actually just need
17
+    ##   tstats <- fitC$coefficients / fitC$stdev.unscaled / fitC$sigma
18
+    ##   rawSds <- fitC$sigma
19
+    ##   cor.coefficients <- cov2cor(fitC$cov.coefficients)
20
+    rawSds <- fitC$sigma
21
+    cor.coefficients <- cov2cor(fitC$cov.coefficients)
22
+    rawTstats <- fitC$coefficients / fitC$stdev.unscaled / fitC$sigma
23
+    names(dimnames(rawTstats)) <- NULL
24
+    ptime2 <- proc.time()
25
+    stime <- (ptime2 - ptime1)[3]
26
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
27
+
28
+    parameters <- c(BSseq@parameters,
29
+                    list(design = design, contrasts = contrasts))
30
+    stats <- list(rawSds = rawSds,
31
+                  cor.coefficients = cor.coefficients,
32
+                  rawTstats = rawTstats)
33
+    if(returnModelCoefficients) {
34
+        stats$modelCoefficients <- fit$coefficients
35
+    }
36
+    out <- BSseqStat(gr = granges(BSseq),
37
+                     stats = stats, parameters = parameters)
38
+    out
39
+}
40
+
41
+smoothSds <- function(BSseqStat, k = 101, qSd = 0.75, mc.cores = 1,
42
+                      maxGap = 10^8, verbose = TRUE) {
43
+    smoothSd <- function(Sds, k, qSd) {
4 44
         k0 <- floor(k/2)
5 45
         if(all(is.na(Sds))) return(Sds)
6 46
         thresSD <- pmax(Sds, quantile(Sds, qSd, na.rm = TRUE), na.rm = TRUE)
... ...
@@ -8,9 +48,65 @@ BSmooth.fstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
8 48
         sSds <- as.vector(runmean(Rle(c(addSD, thresSD, addSD)), k = k))
9 49
         sSds
10 50
     }
11
-    compute.correction <- function(idx, qSd = 0.75) {
12
-        xx <- start(BSseq)[idx]
13
-        yy <- tstat[idx]
51
+    if(is.null(maxGap))
52
+        maxGap <- BSseqStat@parameters[["maxGap"]]
53
+    if(is.null(maxGap))
54
+        stop("need to set argument 'maxGap'")
55
+    if(verbose) cat("[smoothSds] preprocessing ... ")
56
+    ptime1 <- proc.time()
57
+    clusterIdx <- makeClusters(granges(BSseqStat), maxGap = maxGap)
58
+    ptime2 <- proc.time()
59
+    stime <- (ptime2 - ptime1)[3]
60
+    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
61
+    smoothSds <- do.call("c",
62
+                         mclapply(clusterIdx, function(idx) {
63
+                             smoothSd(getStats(BSseqStat, what = "rawSds")[idx], k = k, qSd = qSd)
64
+                         }, mc.cores = mc.cores))
65
+    if("smoothSds" %in% names(getStats(BSseqStat)))
66
+        BSseqStat@stats[["smoothSds"]] <- smoothSds
67
+    else
68
+        BSseqStat@stats <- c(getStats(BSseqStat), list(smoothSds = smoothSds))
69
+    BSseqStat
70
+}
71
+
72
+
73
+computeStat <- function(BSseqStat, coef = NULL) {
74
+    stopifnot(is(BSseqStat, "BSseqStat"))
75
+    if(is.null(coef)) {
76
+        coef <- 1:ncol(getStats(BSseqStat, what = "rawTstats"))
77
+    }
78
+    tstats <- getStats(BSseqStat, what = "rawTstats")[, coef, drop = FALSE]
79
+    tstats <- tstats * getStats(BSseqStat, what = "rawSds") /
80
+        getStats(BSseqStat, what = "smoothSds")
81
+    if(length(coef) > 1) {
82
+        cor.coefficients <- getStats(BSseqStat, what = "cor.coefficients")[coef,coef]
83
+        stat <- as.numeric(classifyTestsF(tstats, cor.coefficients,
84
+                                          fstat.only = TRUE))
85
+        stat.type <- "fstat"
86
+    } else {
87
+        stat <- as.numeric(tstats)
88
+        stat.type <- "tstat"
89
+    }
90
+    if("stat" %in% names(getStats(BSseqStat))) {
91
+        BSseqStat@stats[["stat"]] <- stat
92
+        BSseqStat@stats[["stat.type"]] <- stat.type
93
+    } else {
94
+        BSseqStat@stats <- c(getStats(BSseqStat),
95
+                             list(stat = stat, stat.type = stat.type))
96
+    }
97
+    BSseqStat
98
+}
99
+
100
+localCorrectStat <- function(BSseqStat, threshold = c(-15,15), mc.cores = 1, verbose = TRUE) {
101
+    compute.correction <- function(idx) {
102
+        xx <- start(BSseqStat)[idx]
103
+        yy <- stat[idx]
104
+        if(!is.null(threshold)) {
105
+            stopifnot(is.numeric(threshold) && length(threshold) == 2)
106
+            stopifnot(threshold[1] < 0 && threshold[2] > 0)
107
+            yy[yy < threshold[1]] <- threshold[1]
108
+            yy[yy > threshold[2]] <- threshold[2]
109
+        }
14 110
         suppressWarnings({
15 111
             drange <- diff(range(xx, na.rm = TRUE))
16 112
         })
... ...
@@ -24,115 +120,19 @@ BSmooth.fstat <- function(BSseq, group1, group2, estimate.var = c("same", "paire
24 120
         correction <- predict(fit, newdata = data.frame(xx.reg = xx))
25 121
         yy - correction 
26 122
     }
27
-
28
-    estimate.var <- match.arg(estimate.var)
29
-    stopifnot(is(BSseq, "BSseq"))
30
-    stopifnot(hasBeenSmoothed(BSseq))
31
-    if(is.character(group1)) {
32
-        stopifnot(all(group1 %in% sampleNames(BSseq)))
33
-        group1 <- match(group1, sampleNames(BSseq))
34
-    }
35
-    if(is.numeric(group1)) {
36
-        stopifnot(min(group1) >= 1 & max(group1) <= ncol(BSseq))
37
-    } else stop("problems with argument 'group1'")
38
-    if(is.character(group2)) {
39
-        stopifnot(all(group2 %in% sampleNames(BSseq)))
40
-        group2 <- match(group2, sampleNames(BSseq))
41
-    }    
42
-    if(is.numeric(group2)) {
43
-        stopifnot(min(group2) >= 1 & max(group2) <= ncol(BSseq))
44
-    } else stop("problems with argument 'group2'")
45
-    stopifnot(length(intersect(group1, group2)) == 0)
46
-    stopifnot(length(group1) > 0)
47
-    stopifnot(length(group2) > 0)
48
-    stopifnot(length(group1) + length(group2) >= 3)
49
-    if(estimate.var == "paired")
50
-        stopifnot(length(group1) == length(group2))
51
-    
52
-    if(any(rowSums(getCoverage(BSseq)[, c(group1, group2)]) == 0))
53
-        warning("Computing f-statistics at locations where there is no data; consider subsetting the 'BSseq' object first")
54
-    
55
-    if(is.null(maxGap))
56
-        maxGap <- BSseq@parameters$maxGap
57
-    if(is.null(maxGap))
58
-        stop("need to set argument 'maxGap'")
59
-    
60
-    if(verbose) cat("[BSmooth.fstat] preprocessing ... ")
123
+    maxGap <- BSseqStat@parameters[["maxGap"]]
124
+    if(verbose) cat("[localCorrectStat] preprocessing ... ")
61 125
     ptime1 <- proc.time()
62
-    clusterIdx <- makeClusters(BSseq, maxGap = maxGap)
126
+    clusterIdx <- makeClusters(granges(BSseqStat), maxGap = maxGap)
63 127
     ptime2 <- proc.time()
64 128
     stime <- (ptime2 - ptime1)[3]
65 129
     if(verbose) cat(sprintf("done in %.1f sec\n", stime))
66
-        
67
-    if(verbose) cat("[BSmooth.fstat] computing stats within groups ... ")
68
-    ptime1 <- proc.time()
69
-    allPs <- getMeth(BSseq, type = "smooth", what = "perBase",
70
-                     confint = FALSE)
71
-    group1.means <- rowMeans(allPs[, group1, drop = FALSE], na.rm = TRUE)
72
-    group2.means <- rowMeans(allPs[, group2, drop = FALSE], na.rm = TRUE)
73
-    ptime2 <- proc.time()
74
-    stime <- (ptime2 - ptime1)[3]
75
-    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
76
-    
77
-    if(verbose) cat("[BSmooth.fstat] computing stats across groups ... ")
78
-    ptime1 <- proc.time()
79
-    switch(estimate.var,
80
-           "group2" = {
81
-               rawSds <- rowSds(allPs[, group2, drop = FALSE], na.rm = TRUE)
82
-               smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
83
-                   smoothSd(rawSds[idx], k = k)
84
-               }, mc.cores = mc.cores))
85
-               scale <- sqrt(1/length(group1) + 1/length(group2))
86
-               tstat.sd <- smoothSds * scale
87
-           },
88
-           "same" = {
89
-               rawSds <- sqrt( ((length(group1) - 1) * rowVars(allPs[, group1, drop = FALSE]) +
90
-                                (length(group2) - 1) * rowVars(allPs[, group2, drop = FALSE])) /
91
-                              (length(group1) + length(group2) - 2))
92
-               smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
93
-                   smoothSd(rawSds[idx], k = k)
94
-               }, mc.cores = mc.cores))
95
-               scale <- sqrt(1/length(group1) + 1/length(group2))
96
-               tstat.sd <- smoothSds * scale
97
-           },
98
-           "paired" = {
99
-               rawSds <- rowSds(allPs[, group1, drop = FALSE] - allPs[, group2, drop = FALSE])
100
-               smoothSds <- do.call(c, mclapply(clusterIdx, function(idx) {
101
-                   smoothSd(rawSds[idx], k = k)
102
-               }, mc.cores = mc.cores))
103
-               scale <- sqrt(1/length(group1))
104
-               tstat.sd <- smoothSds * scale
105
-           })
106
-    tstat <- (group1.means - group2.means) / tstat.sd
107
-    is.na(tstat)[tstat.sd == 0] <- TRUE
108
-    if(local.correct) {
109
-        tstat.corrected <- do.call(c, mclapply(clusterIdx,
110
-                                               compute.correction, qSd = qSd,
111
-                                               mc.cores = mc.cores))
112
-    }
113
-    ptime2 <- proc.time()
114
-    stime <- (ptime2 - ptime1)[3]
115
-    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
116
-    
117
-    if(local.correct) {
118
-        stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
119
-                       tstat, tstat.corrected)
120
-        colnames(stats) <- c("rawSds", "tstat.sd",
121
-                             "group2.means", "group1.means", "tstat",
122
-                             "tstat.corrected")
123
- 
124
-    } else {
125
-        stats <- cbind(rawSds, tstat.sd, group2.means, group1.means,
126
-                       tstat)
127
-        colnames(stats) <- c("rawSds", "tstat.sd",
128
-                             "group2.means", "group1.means", "tstat")
129
-    }
130
-    
131
-    parameters <- c(BSseq@parameters,
132
-                    list(tstatText = sprintf("BSmooth.fstat (local.correct = %s, maxGap = %d)",
133
-                         local.correct, maxGap),
134
-                         group1 = group1, group2 = group2, k = k, qSd = qSd,
135
-                         local.correct = local.correct, maxGap = maxGap))
136
-    out <- BSseqTstat(gr = granges(BSseq), stats = stats, parameters = parameters)
137
-    out
130
+    stat <- getStats(BSseqStat, what = "stat")
131
+    stat.corrected <- do.call(c, mclapply(clusterIdx, compute.correction,
132
+                                          mc.cores = mc.cores))
133
+    BSseqStat@stats <- c(getStats(BSseqStat),
134
+                         list(stat.corrected = stat.corrected))
135
+    BSseqStat@parameters$local.correct <- TRUE
136
+    BSseqStat
138 137
 }
138
+
139 139
new file mode 100644
... ...
@@ -0,0 +1,92 @@
1
+setClass("BSseqStat", contains = "hasGRanges", 
2
+         representation(stats = "list",
3
+                        parameters = "list")
4
+         )
5
+
6
+setValidity("BSseqStat", function(object) {
7
+    msg <- NULL
8
+    if(is.null(names(object@stats)) || any(names(object@stats) == "") ||
9
+       anyDuplicated(names(object@stats)))
10
+        msg <- validMsg(msg, "the 'stats' list needs to be named with unique names.")
11
+    for(name in c("rawSds", "smoothSds", "stat")) {
12
+        if(name %in% names(object@stats) && !is.vector(object@stats[[name]]))
13
+            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' has to be a vector", name))
14
+        if(name %in% names(object@stats) && length(object@stats[[name]]) != length(object@gr))
15
+            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' has to have the same length as slot 'gr'", name))
16
+    }
17
+    for(name in c("rawTstats")) {
18
+        if(name %in% names(object@stats) && !is.matrix(object@stats[[name]]))
19
+            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' has to be a matrix", name))
20
+        if(name %in% names(object@stats) && nrow(object@stats[[name]]) != length(object@gr))
21
+            msg <- validMsg(msg, sprintf("component '%s' of slot 'stats' has to have the same number of rows as the length of slot 'gr'", name))
22
+    }
23
+    if(is.null(msg)) TRUE else msg
24
+})
25
+
26
+setMethod("show", signature(object = "BSseqStat"),
27
+          function(object) {
28
+              cat("An object of type 'BSseqStat' with\n")
29
+              cat(" ", length(object), "methylation loci\n")
30
+              cat("based on smoothed data:\n")
31
+              cat(" ", object@parameters$smoothText, "\n")
32
+              cat("with parameters\n")
33
+              cat(" ", object@parameters$tstatText, "\n")
34
+          })
35
+
36
+setMethod("[", "BSseqStat", function(x, i, ...) {
37
+    if(missing(i))
38
+        stop("need [i] for subsetting")
39
41
+    x@gr <- x@gr[i]
42
+    x@stats <- lapply(setNames(nm = names(x@stats)), function(nam) {
43
+        if(nam %in% c("rawTstats")) {
44
+            stopifnot(is.matrix(x@stats[[nam]]))
45
+            return(x@stats[[nam]][i,,drop=FALSE])
46
+        }
47
+        if(nam %in% c("rawSds", "smoothSds", "stat")) {
48
+            stopifnot(is.vector(x@stats[[nam]]))
49
+            return(x@stats[[nam]][i])
50
+        }
51
+        x@stats[[nam]]
52
+    })
53
+    x
54
+})
55
+
56
+BSseqStat <- function(gr = NULL, stats = NULL, parameters = NULL) {
57
+    out <- new("BSseqStat")
58
+    out@gr <- gr
59
+    out@stats <- stats
60
+    out@parameters <- parameters
61
+    out
62
+}
63
+
64
+
65
+## summary.BSseqStat <- function(object, ...) {
66
+##     quant <- quantile(getStats(object)[, "tstat.corrected"],
67
+##                       prob = c(0.0001, 0.001, 0.01, 0.5, 0.99, 0.999, 0.9999))
68
+##     quant <- t(t(quant))
69
+##     colnames(quant) <- "quantiles"
70
+##     out <- list(quantiles = quant)
71
+##     class(out) <- "summary.BSseqStat"
72
+##     out
73
+## }
74
+
75
+## print.summary.BSseqStat <- function(x, ...) {
76
+##     print(as.matrix(x$quantiles))
77
+## }
78
+
79
+## plot.BSseqStat <- function(x, y, ...) {
80
+##     tstat <- getStats(x)[, "tstat"]
81
+##     plot(density(tstat), xlim = c(-10,10), col = "blue", main = "")
82
+##     if("tstat.corrected" %in% colnames(getStats(x))) {
83
+##         tstat.cor <- getStats(x)[, "tstat.corrected"]
84
+##         lines(density(tstat.cor), col = "black")
85
+##         legend("topleft", legend = c("uncorrected", "corrected"), lty = c(1,1),
86
+##                col = c("blue", "black"))
87
+##     } else {
88
+##         legend("topleft", legend = c("uncorrected"), lty = 1,
89
+##                col = c("blue"))
90
+##     }
91
+## }
92
+
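A brief editorial illustration (not part of the commit) of the class API defined above; gr, rawSds, cor.coefficients, rawTstats, design and contrasts are hypothetical stand-ins for the objects BSmooth.fstat() builds internally.

```r
## Hedged sketch: the BSseqStat() constructor and accessors defined above.
## All inputs here are hypothetical placeholders.
bstat <- BSseqStat(gr = gr,
                   stats = list(rawSds = rawSds,
                                cor.coefficients = cor.coefficients,
                                rawTstats = rawTstats),
                   parameters = list(design = design, contrasts = contrasts))
bstat[1:1000]                      # subsetting keeps per-locus stats aligned with 'gr'
getStats(bstat, what = "rawSds")   # extract a single named stats component
```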
... ...
@@ -65,42 +65,3 @@ plot.BSseqTstat <- function(x, y, ...) {
65 65
     }
66 66
 }
67 67
 
68
-getStats <- function(BSseqTstat, regions = NULL, stat = "tstat.corrected") {
69
-    stopifnot(is(BSseqTstat, "BSseqTstat"))
70
-    if(is.null(regions))
71
-        return(BSseqTstat@stats)
72
-    if(class(regions) == "data.frame")
73
-        regions <- data.frame2GRanges(regions)
74
-    stopifnot(stat %in% colnames(BSseqTstat@stats))
75
-    stopifnot(length(stat) == 1)
76
-    stopifnot(is(regions, "GenomicRanges"))
77
-    ov <- findOverlaps(BSseqTstat, regions)
78
-    ov.sp <- split(queryHits(ov), subjectHits(ov))
79
-    getRegionStats <- function(idx) {
80
-        mat <- BSseqTstat@stats[idx,, drop=FALSE]
81
-        areaStat <- sum(mat[, stat])
82
-        maxStat <- max(mat[, stat])
83
-        c(areaStat, maxStat)
84
-    }
85
-    stats <- matrix(NA, ncol = 2, nrow = length(regions))
86
-    colnames(stats) <- c("areaStat", "maxStat")
87
-    tmp <- lapply(ov.sp, getRegionStats)
88
-    stats[as.integer(names(tmp)),] <- do.call(rbind, tmp)
89
-    out <- as.data.frame(stats)
90
-    if(! stat %in% c("tstat.corrected", "tstat"))
91
-        return(out)
92
-    getRegionStats_ttest <- function(idx) {
93
-        mat <- BSseqTstat@stats[idx,, drop=FALSE]
94
-        group1.mean <- mean(mat[, "group1.means"])
95
-        group2.mean <- mean(mat[, "group2.means"])
96
-        meanDiff <- mean(mat[, "group1.means"] - mat[, "group2.means"])
97
-        tstat.sd <- mean(mat[, "tstat.sd"])
98
-        c(meanDiff, group1.mean, group2.mean, tstat.sd)
99
-    }
100
-    stats_ttest<- matrix(NA, ncol = 4, nrow = length(regions))
101
-    colnames(stats_ttest) <- c("meanDiff", "group1.mean", "group2.mean", "tstat.sd")
102
-    tmp <- lapply(ov.sp, getRegionStats_ttest)
103
-    stats_ttest[as.integer(names(tmp)),] <- do.call(rbind, tmp)
104
-    out <- cbind(out, as.data.frame(stats_ttest))
105
-    out
106
-}
... ...
@@ -1,17 +1,23 @@
1
-dmrFinder <- function(BSseqTstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
1
+dmrFinder <- function(bstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
2 2
                       maxGap = 300, stat = "tstat.corrected", verbose = TRUE) {
3
+    if(is(bstat, "BSseqTstat")) {
4
+        if(! stat %in% colnames(bstat@stats))
5
+            stop("'stat' needs to be a column of 'bstat@stats'")
6
+        dmrStat <- bstat@stats[, stat]
7
+    }
8
+    if(is(bstat, "BSseqStat")) {
9
+        dmrStat <- getStats(bstat, what = "stat")
10
+    }
3 11
     subverbose <- max(as.integer(verbose) - 1L, 0L)
4
-    if(! stat %in% colnames(BSseqTstat@stats))
5
-        stop("'stat' needs to be a column of 'BSseqTstat@stats'")
6 12
     if(is.null(cutoff))
7
-        cutoff <- quantile(BSseqTstat@stats[, stat], qcutoff)
13
+        cutoff <- quantile(dmrStat, qcutoff)
8 14
     if(length(cutoff) == 1)
9 15
         cutoff <- c(-cutoff, cutoff)
10
-    direction <- as.integer(BSseqTstat@stats[, stat] >= cutoff[2])
11
-    direction[BSseqTstat@stats[, stat] <= cutoff[1]] <- -1L
16
+    direction <- as.integer(dmrStat >= cutoff[2])
17
+    direction[dmrStat <= cutoff[1]] <- -1L
12 18
     direction[is.na(direction)] <- 0L
13
-    chrs <- as.character(seqnames(BSseqTstat@gr))
14
-    positions <- start(BSseqTstat)
19
+    chrs <- as.character(seqnames(bstat))
20
+    positions <- start(bstat)
15 21
     regions <- regionFinder3(direction, chr = chrs, positions = positions,
16 22
                              maxGap = maxGap, verbose = subverbose)
17 23
     if(is.null(regions$down) && is.null(regions$up))
... ...
@@ -22,12 +28,19 @@ dmrFinder <- function(BSseqTstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
22 28
     regions$width <- regions$end - regions$start + 1
23 29
     regions$invdensity <- regions$width / regions$n
24 30
     regions$chr <- as.character(regions$chr)
25
-    stats <- getStats(BSseqTstat, regions, stat = stat)
26
-    regions <- cbind(regions, stats)
27
-    if(stat %in% c("tstat.corrected", "tstat")) {
28
-        regions$direction <- ifelse(regions$meanDiff > 0, "hyper", "hypo")
31
+    if(is(bstat, "BSseqTstat")) {
32
+        stats <- getStats(bstat, regions, stat = stat)
33
+        regions <- cbind(regions, stats)
34
+        if(stat %in% c("tstat.corrected", "tstat")) {
35
+            regions$direction <- ifelse(regions$meanDiff > 0, "hyper", "hypo")
36
+        }
37
+        regions <- regions[order(abs(regions$areaStat), decreasing = TRUE),]
38
+    }
39
+    if(is(bstat, "BSseqStat")) {
40
+        stats <- getStats(bstat, regions)
41
+        regions <- cbind(regions, stats)
42
+        regions <- regions[order(abs(regions$areaStat), decreasing = TRUE),]
29 43
     }
30
-    regions <- regions[order(abs(regions$areaStat), decreasing = TRUE),]
31 44
     regions
32 45
 }
33 46
 
34 47
new file mode 100644
... ...
@@ -0,0 +1,75 @@
1
+getStats <- function(bstat, regions = NULL, ...) {
2
+    stopifnot(is(bstat, "BSseqTstat") || is(bstat, "BSseqStat"))
3
+    if(is(bstat, "BSseqTstat"))
4
+        return(getStats_BSseqTstat(bstat, regions = regions, ...))
5
+    getStats_BSseqStat(bstat, regions = regions, ...)
6
+}
7
+
8
+getStats_BSseqStat <- function(BSseqStat, regions = NULL, what = NULL) {
9
+    stopifnot(is(BSseqStat, "BSseqStat"))
10
+    if(!is.null(what)) {
11
+        stopifnot(what %in% names(BSseqStat@stats))
12
+        return(BSseqStat@stats[[what]])
13
+    }
14
+    if(is.null(regions))
15
+        return(BSseqStat@stats)
16
+    ## Now we have regions and no what
17
+    if(class(regions) == "data.frame")
18
+        regions <- data.frame2GRanges(regions)
19
+    ov <- findOverlaps(BSseqStat, regions)
20
+    ov.sp <- split(queryHits(ov), subjectHits(ov))
21
+    ## We need areaStat and maxStat
22
+    ## Could need averageMeth in each group?
23
+    ## Need to have a specific design for that
24
+    ## Could at least get contrast coefficient
25
+    getRegionStats <- function(idx) {
26
+        areaStat <- sum(BSseqStat@stats$stat[idx], na.rm = TRUE)
27
+        maxStat <- max(BSseqStat@stats$stat[idx], na.rm = TRUE)
28
+        c(areaStat, maxStat)
29
+    }
30
+    regionStats <- matrix(NA, ncol = 2, nrow = length(regions))
31
+    colnames(regionStats) <- c("areaStat", "maxStat")
32
+    tmp <- lapply(ov.sp, getRegionStats)
33
+    regionStats[as.integer(names(tmp)),] <- do.call(rbind, tmp)
34
+    regionStats
35
+}
36
+
37
+getStats_BSseqTstat <- function(BSseqTstat, regions = NULL, stat = "tstat.corrected") {
38
+    stopifnot(is(BSseqTstat, "BSseqTstat"))
39
+    if(is.null(regions))
40
+        return(BSseqTstat@stats)
41
+    if(class(regions) == "data.frame")
42
+        regions <- data.frame2GRanges(regions)
43
+    stopifnot(stat %in% colnames(BSseqTstat@stats))
44
+    stopifnot(length(stat) == 1)
45
+    stopifnot(is(regions, "GenomicRanges"))
46
+    ov <- findOverlaps(BSseqTstat, regions)
47
+    ov.sp <- split(queryHits(ov), subjectHits(ov))
48
+    getRegionStats <- function(idx) {
49
+        mat <- BSseqTstat@stats[idx,, drop=FALSE]
50
+        areaStat <- sum(mat[, stat])
51
+        maxStat <- max(mat[, stat])
52
+        c(areaStat, maxStat)
53
+    }
54
+    stats <- matrix(NA, ncol = 2, nrow = length(regions))
55
+    colnames(stats) <- c("areaStat", "maxStat")
56
+    tmp <- lapply(ov.sp, getRegionStats)
57
+    stats[as.integer(names(tmp)),] <- do.call(rbind, tmp)
58
+    out <- as.data.frame(stats)
59
+    if(! stat %in% c("tstat.corrected", "tstat"))
60
+        return(out)
61
+    getRegionStats_ttest <- function(idx) {
62
+        mat <- BSseqTstat@stats[idx,, drop=FALSE]
63
+        group1.mean <- mean(mat[, "group1.means"])
64
+        group2.mean <- mean(mat[, "group2.means"])
65
+        meanDiff <- mean(mat[, "group1.means"] - mat[, "group2.means"])
66
+        tstat.sd <- mean(mat[, "tstat.sd"])
67
+        c(meanDiff, group1.mean, group2.mean, tstat.sd)
68
+    }
69
+    stats_ttest<- matrix(NA, ncol = 4, nrow = length(regions))
70
+    colnames(stats_ttest) <- c("meanDiff", "group1.mean", "group2.mean", "tstat.sd")
71
+    tmp <- lapply(ov.sp, getRegionStats_ttest)
72
+    stats_ttest[as.integer(names(tmp)),] <- do.call(rbind, tmp)
73
+    out <- cbind(out, as.data.frame(stats_ttest))
74
+    out
75
+}
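An editorial usage sketch for the dispatching getStats() above (not part of the commit). Here bstat is assumed to be a BSseqStat that has been through computeStat(), dmrs a data.frame of regions such as dmrFinder() returns, and BS.cancer.ex.tstat the example BSseqTstat object from bsseqData.

```r
## Hedged sketch: the two getStats() backends.
getStats(bstat, what = "stat")       # BSseqStat: per-locus statistic vector
getStats(bstat, regions = dmrs)      # BSseqStat: areaStat / maxStat per region
getStats(BS.cancer.ex.tstat,         # BSseqTstat backend, which takes the
         regions = dmrs,             # extra 'stat' argument
         stat = "tstat.corrected")
```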
... ...
@@ -1,8 +1,8 @@
1
-getNullPermutation <- function(BSseq, idxMatrix1, idxMatrix2,
2
-                               estimate.var, local.correct,
3
-                               cutoff, stat, maxGap, mc.cores = 1) {
1
+getNullDistribution_BSmooth.tstat <- function(BSseq, idxMatrix1, idxMatrix2,
2
+                                              estimate.var, local.correct,
3
+                                              cutoff, stat, maxGap, mc.cores = 1) {
4 4
     stopifnot(nrow(idxMatrix1) == nrow(idxMatrix2))
5
-    cat("[getNullPermutation] performing", nrow(idxMatrix1), "permutations\n")
5
+    message(sprintf("[getNullDistribution_BSmooth.tstat] performing %d permutations\n", nrow(idxMatrix1)))
6 6
     nullDist <- mclapply(1:nrow(idxMatrix1), function(ii) {
7 7
         ptime1 <- proc.time()
8 8
         BS.tstat <- BSmooth.tstat(BSseq, estimate.var = estimate.var,
... ...
@@ -13,31 +13,50 @@ getNullPermutation <- function(BSseq, idxMatrix1, idxMatrix2,
13 13
         dmrs0 <- dmrFinder(BS.tstat, stat = stat, cutoff = cutoff, maxGap = maxGap)
14 14
         ptime2 <- proc.time()
15 15
         stime <- (ptime2 - ptime1)[3]
16
-        cat(sprintf("[getNullPermutation] completing permutation %d in %.1f sec\n",
17
-                    ii, stime))
16
+        message(sprintf("[getNullDistribution_BSmooth.tstat] completing permutation %d in %.1f sec\n", ii, stime))
18 17
         dmrs0
19 18
     }, mc.cores = min(nrow(idxMatrix1), mc.cores), mc.preschedule = FALSE)
20 19
     nullDist
21 20
 }
22 21
 
23
-getNullBlocks <- function(BSseq, idxMatrix1, idxMatrix2, estimate.var = "same",
22
+getNullDistribution_BSmooth.fstat <- function(BSseq, idxMatrix, design, contrasts,
23
+                                              coef = NULL, cutoff, maxGap.sd, maxGap.dmr, mc.cores = 1) {
24
+    message(sprintf("[getNullDistribution_BSmooth.fstat] performing %d permutations\n", nrow(idxMatrix)))
25
+    nullDist <- mclapply(1:nrow(idxMatrix), function(ii) {
26
+        ptime1 <- proc.time()
27
+        bstat <- BSmooth.fstat(BSseq[, idxMatrix[ii,]], design = design, contrasts = contrasts)
28
+        bstat <- smoothSds(bstat, maxGap = maxGap.sd)
29
+        bstat <- computeStat(bstat, coef = coef)
30
+        dmrs0 <- dmrFinder(bstat, cutoff = cutoff, maxGap = maxGap.dmr)
31
+        ptime2 <- proc.time()
32
+        stime <- (ptime2 - ptime1)[3]
33
+        message(sprintf("[getNullDistribution_BSmooth.fstat] completing permutation %d in %.1f sec\n", ii, stime))
34
+        dmrs0
35
+    }, mc.cores = min(nrow(idxMatrix), mc.cores), mc.preschedule = FALSE)
36
+    nullDist
37
+}
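An editorial sketch (not part of the commit) of how this helper might be driven; BS.fit, design and contrasts are hypothetical (as in the F-stat sketch earlier), and each row of idxMatrix is one permutation of the sample (column) indices.

```r
## Hedged sketch: permutation null distribution for the F-stat pipeline.
idxMatrix <- rbind(1:6,
                   c(4, 5, 6, 1, 2, 3),
                   c(1, 5, 3, 4, 2, 6))
null <- bsseq:::getNullDistribution_BSmooth.fstat(
    BS.fit, idxMatrix = idxMatrix, design = design, contrasts = contrasts,
    coef = NULL, cutoff = c(-4.6, 4.6),
    maxGap.sd = 10^8, maxGap.dmr = 300, mc.cores = 1)
```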
38
+
39
+getNullBlocks_BSmooth.tstat <- function(BSseq, idxMatrix1, idxMatrix2, estimate.var = "same",
24 40
                           mc.cores = 1) {
25
-    getNullPermutation(BSseq = BSseq, idxMatrix1 = idxMatrix1,
41
+    getNullDistribution_BSmooth.tstat(BSseq = BSseq, idxMatrix1 = idxMatrix1,
26 42
                        idxMatrix2 = idxMatrix2, local.correct = FALSE,
27 43
                        estimate.var = estimate.var,
28 44
                        cutoff = c(-2,2), stat = "tstat", maxGap = 10000,
29 45
                        mc.cores = mc.cores)
30 46
 }
31 47
 
32
-getNullDmrs <- function(BSseq, idxMatrix1, idxMatrix2, estimate.var = "same",
48
+getNullDmrs_BSmooth.tstat <- function(BSseq, idxMatrix1, idxMatrix2, estimate.var = "same",
33 49
                         mc.cores = 1) {
34
-    getNullPermutation(BSseq = BSseq, idxMatrix1 = idxMatrix1,
50
+    getNullDistribution_BSmooth.tstat(BSseq = BSseq, idxMatrix1 = idxMatrix1,
35 51
                        idxMatrix2 = idxMatrix2, local.correct = TRUE,
36 52
                        estimate.var = estimate.var,
37 53
                        cutoff = c(-4.6,4.6), stat = "tstat.corrected", maxGap = 300,
38 54
                        mc.cores = mc.cores)
39 55
 }
40 56
 
57
+
58
+
59
+
41 60
 subsetDmrs <- function(xx) {
42 61
     if(is.null(xx) || is(xx, "try-error"))
43 62
         return(NULL)
... ...
@@ -176,17 +195,21 @@ makeIdxMatrix <- function(group1, group2, testIsSymmetric = TRUE, includeUnbalan
176 195
         } else {
177 196
             idxMatrix1 <- rbind(group1, group2,
178 197
                                 combineMat(subsetByMatrix(group1, combinations(6,3)),
179
-                                           subsetByMatrix(group2, combinations(6,2))))
198
+                                           subsetByMatrix(group2, combinations(6,3))))
180 199
         }
181 200
         if(includeUnbalanced) {
182
-            newMatrix <- combineMat(subsetByMatrix(group1, combinations(6,5)),
201
+            newMatrix1 <- combineMat(subsetByMatrix(group1, combinations(6,4)),
202
+                                    subsetByMatrix(group2, combinations(6,2)))
203
+            newMatrix2 <- combineMat(subsetByMatrix(group1, combinations(6,5)),
183 204
                                     as.matrix(group2, ncol = 1))
184
-            idxMatrix1 <- rbind(idxMatrix1, newMatrix)
205
+            idxMatrix1 <- rbind(idxMatrix1, newMatrix1, newMatrix2)
185 206
         }
186 207
         if(includeUnbalanced && !testIsSymmetric) {
187
-            newMatrix <- combineMat(as.matrix(group1, ncol = 1),
188
-                                    subsetByMatrix(group2, combinations(6,3)))
189
-            idxMatrix1 <- rbind(idxMatrix1, newMatrix)
208
+            newMatrix1 <- combineMat(subsetByMatrix(group1, combinations(6,2)),
209
+                                     subsetByMatrix(group2, combinations(6,4)))
210
+            newMatrix2 <- combineMat(as.matrix(group1, ncol = 1),
211
+                                     subsetByMatrix(group2, combinations(6,5)))
212
+            idxMatrix1 <- rbind(idxMatrix1, newMatrix1, newMatrix2)
190 213
         }
191 214
     }
192 215
     if(is.null(idxMatrix1))
... ...
@@ -3,8 +3,16 @@ This is the developer version of Bioconductor package [bsseq](http://bioconducto
3 3
 
4 4
 ```r
5 5
 source('http://bioconductor.org/biocLite.R')
6
-biocLite(bsseq')
6
+useDevel(TRUE)
7
+biocLite('bsseq')
7 8
 ```
8 9
 
9 10
 ## R CMD check results
10
-Bioconductor: [Multiple platform build/check report](http://master.bioconductor.org/checkResults/devel/bioc-LATEST/bsseq/)
11
+
12
+## Software status
13
+
14
+| Resource:     | Bioconductor        | Travis CI     |
15
+| ------------- | ------------------- | ------------- |
16
+| _Platforms:_  | _Multiple_          | _Linux_       |
17
+| R CMD check   | <a href="http://bioconductor.org/checkResults/release/bioc-LATEST/bsseq/"><img border="0" src="http://bioconductor.org/shields/build/release/bioc/bsseq.svg" alt="Build status"></a> (release)</br><a href="http://bioconductor.org/checkResults/devel/bioc-LATEST/bsseq/"><img border="0" src="http://bioconductor.org/shields/build/devel/bioc/bsseq.svg" alt="Build status"></a> (devel) | <a href="https://travis-ci.org/kasperdanielhansen/bsseq"><img src="https://travis-ci.org/kasperdanielhansen/bsseq.svg" alt="Build status"></a> |
18
+| Test coverage |                     | <a href="https://codecov.io/github/kasperdanielhansen/bsseq?branch=master"><img src="https://codecov.io/github/kasperdanielhansen/bsseq/coverage.svg?branch=master" alt="Coverage Status"/></a>   |
... ...
@@ -1,13 +1,12 @@
1 1
 bibentry("Article",
2 2
          title = "{BSmooth: from whole genome bisulfite sequencing reads to differentially methylated regions}",
3
-         author = "Hansen, Kasper D. and Langmead, Benjamin and Irizarry, Rafael A.",
3
+         author = c(person(c("Kasper", "D."), "Hansen"),
4
+                    person(c("Benjamin"), "Langmead"),
5
+                    person(c("Rafael", "A."), "Irizarry")),
4 6
          journal = "Genome Biology",
5 7
          year = "2012",
6 8
 	 volume = "13",
9
+         number = "10",
7 10
 	 pages = "R83",
8
-  	 textVersion = 
9
-         paste("KD Hansen, B Langmead and RA Irizarry.",
10
-               "BSmooth: from whole genome bisulfite sequencing reads to differentially methylated regions.", 
11
-               "Genome Biology, 13:R83 (2012)")
12
-)
13
-
11
+         doi = "10.1186/gb-2012-13-10-r83",
12
+         pubmed = "23034175")
... ...
@@ -82,3 +82,21 @@ test_makeIdxMatrix_perm5 <- function() {
82 82
     checkEquals(nrow(res), choose(10,5) / 2)
83 83
 }
84 84
 
85
+test_makeIdxMatrix_perm6 <- function() {
86
+    grp1 <- paste0("A", 1:6)
87
+    grp2 <- paste0("B", 1:6)
88
+
89
+    res <- bsseq:::makeIdxMatrix(grp1, grp2,
90
+                                 testIsSymmetric = FALSE, includeUnbalanced = TRUE)[[1]]
91
+    checkEquals(anyDuplicated(res), 0)
92
+    checkEquals(res[1,], grp1)
93
+    checkEquals(res[2,], grp2)
94
+    checkEquals(nrow(res), choose(12,6))
95
+
96
+    res <- bsseq:::makeIdxMatrix(grp1, grp2,
97
+                                 testIsSymmetric = TRUE, includeUnbalanced = TRUE)[[1]]
98
+    checkEquals(anyDuplicated(res), 0)
99
+    checkEquals(res[1,], grp1)
100
+    checkEquals(nrow(res), choose(12,6) / 2)
101
+}
102
+
... ...
@@ -11,11 +11,11 @@
11 11
   (low, high) cutoff.
12 12
 }
13 13
 \usage{
14
-dmrFinder(BSseqTstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
14
+dmrFinder(bstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
15 15
   maxGap=300, stat = "tstat.corrected", verbose = TRUE)
16 16
 }
17 17
 \arguments{
18
-  \item{BSseqTstat}{An object of class \code{BSseqTstat}.}
18
+  \item{bstat}{An object of class \code{BSseqStat} or \code{BSseqTstat}.}
19 19
   \item{cutoff}{The cutoff of the t-statistics.  This should be a vector
20 20
     of length two giving the (low, high) cutoff.  If \code{NULL}, see
21 21
     \code{qcutoff}.} 
... ...
@@ -8,19 +8,26 @@
8 8
   object.
9 9
 }
10 10
 \usage{
11
-getStats(BSseqTstat, regions = NULL, stat = "tstat.corrected")
11
+getStats(bstat, regions = NULL, ...)
12 12
 }
13 13
 \arguments{
14
-  \item{BSseqTstat}{An object of class \code{BSseqTstat}.}
14
+  \item{bstat}{An object of class \code{BSseqStat} or \code{BSseqTstat}.}
15 15
   \item{regions}{An optional \code{data.frame} or
16 16
     \code{GenomicRanges} object specifying a number of genomic
17
-    regions.} 
18
-  \item{stat}{Which statistics column should be obtained.}
17
+    regions.}
18
+  \item{...}{Additional arguments passed to the different backends based
19
+    on the class of \code{bstat}; see Details.}
19 20
 }
20 21
 \value{
21 22
  An object of class \code{data.frame} possibly restricted to the
22 23
   regions specified.
23 24
 }
25
+\details{
26
+  Additional argument when the \code{bstat} object is of class \code{BSseqTstat}:
27
+  \describe{
28
+    \item{stat}{Which statistics column should be obtained.}
29
+  }
30
+}
24 31
 \author{
25 32
   Kasper Daniel Hansen \email{khansen@jhsph.edu}
26 33
 }
... ...
@@ -1,8 +1,9 @@
1 1
 %\VignetteIndexEntry{The bsseq user's guide}
2 2
 %\VignetteDepends{bsseq}
3 3
 %\VignettePackage{bsseq}
4
+%\VignetteEngine{knitr::knitr}
4 5
 \documentclass[12pt]{article}
5
-<<style-Sweave, eval=TRUE, echo=FALSE, results=tex>>=
6
+<<style-knitr, eval=TRUE, echo=FALSE, results="asis">>=
6 7
 BiocStyle::latex()
7 8
 @
8 9
 \newcommand{\bold}[1]{\textbf{#1}} % Fix for R bibentry
... ...
@@ -17,7 +18,7 @@ BiocStyle::latex()
17 18
 
18 19
 The packages required for this vignette are
19 20
 
20
-<<load,results=hide>>=
21
+<<load, warning=FALSE, message=FALSE>>=
21 22
 library(bsseq)
22 23
 @ 
23 24
 
... ...
@@ -91,7 +92,7 @@ methylated position (DMP) referring to a single loci.
91 92
 
92 93
 If you use this package, please cite our BSmooth paper:
93 94
 
94
-<<citation,echo=FALSE,results=tex>>=
95
+<<citation,echo=FALSE,results="asis">>=
95 96
 print(citation("bsseq"), style = "latex")
96 97
 @ 
97 98
 
... ...
@@ -329,7 +330,7 @@ combine the evidence from both strands, CpGs on the ``-'' strand needs to have 1
329 330
 their position.  A full script showing how these files are used to create \Rcode{BS.chr22} can be
330 331
 found in \texttt{inst/scripts/get\_BS.chr22.R} in the \Rpackage{bsseq} package.  The path of this
331 332
 file may be found by
332
-<<locateScript>>=
333
+<<locateScript,results="hide">>=
333 334
 system.file("scripts", "get_BS.chr22.R", package = "bsseq")
334 335
 @ 
335 336
 
... ...
@@ -360,15 +361,14 @@ with the bsseq package'', which also finding DMRs and plotting them.
360 361
 
361 362
 Fisher's exact tests may be efficiently computed using the function \Rcode{fisherTests}.
362 363
 
363
-Binomial and poisson goodness of fit tests statistics may be computed using
364
-\Rcode{binomialGoodnessOfFit} and \Rcode{poissonGoodnessOfFit}.
364
+Binomial and Poisson goodness-of-fit test statistics may be computed using \Rcode{binomialGoodnessOfFit} and \Rcode{poissonGoodnessOfFit}.
365 365
 
366 366
 \bibliography{bsseq}
367 367
 
368 368
 
369 369
 \section*{SessionInfo}
370 370
 
371
-<<sessionInfo,results=tex,eval=TRUE,echo=FALSE>>=
371
+<<sessionInfo,results="asis",eval=TRUE,echo=FALSE>>=
372 372
 toLatex(sessionInfo())
373 373
 @ 
374 374
 
... ...
@@ -2,8 +2,9 @@
2 2
 %\VignetteDepends{bsseq}
3 3
 %\VignetteDepends{bsseqData}
4 4
 %\VignettePackage{bsseq}
5
+%\VignetteEngine{knitr::knitr}
5 6
 \documentclass[12pt]{article}
6
-<<style-Sweave, eval=TRUE, echo=FALSE, results=tex>>=
7
+<<style-knitr, eval=TRUE, echo=FALSE, results="asis">>=
7 8
 BiocStyle::latex()
8 9
 @
9 10
 \newcommand{\bold}[1]{\textbf{#1}} % Fix for R bibentry
... ...
@@ -16,29 +17,18 @@ BiocStyle::latex()
16 17
 
17 18
 \section*{Introduction}
18 19
 
19
-This document discusses the ins and outs of an analysis of a whole-genome shotgun bisulfite
20
-sequencing (WGBS) dataset, using the BSmooth algorithm, which was first used in \cite{Hansen:2011}
21
-and more formally presented and evaluated in \cite{Hansen:2012}.  The intention with the document is
22
-to focus on analysis-related tasks and questions.  Basic usage of the \Rpackage{bsseq} package is
23
-covered in ``The bsseq user's guide''.  It may be useful to consult the user's guide while reading
24
-this analysis guide.
20
+This document discusses the ins and outs of an analysis of a whole-genome shotgun bisulfite sequencing (WGBS) dataset, using the BSmooth algorithm, which was first used in \cite{Hansen:2011} and more formally presented and evaluated in \cite{Hansen:2012}.  The intention with the document is to focus on analysis-related tasks and questions.  Basic usage of the \Rpackage{bsseq} package is covered in ``The bsseq user's guide''.  It may be useful to consult the user's guide while reading this analysis guide.
25 21
 
26
-In this vignette we analyze chromosome 21 and 22 from \cite{Hansen:2011}.  This is primary data from
27
-3 patients with colon cancer.  For each patient we have normal colon tissue as well as cancer colon.
28
-The samples were run on ABI SOLiD and we generated 50bp single-end reads.  The reads were aligned
29
-using the Merman aligner in the BSmooth suite (\url{http://www.rafalab.org/bsmooth}).  See the
30
-primary publication for more details \cite{Hansen:2011}.
22
+In this vignette we analyze chromosome 21 and 22 from \cite{Hansen:2011}.  This is primary data from 3 patients with colon cancer.  For each patient we have normal colon tissue as well as cancer colon.  The samples were run on ABI SOLiD and we generated 50bp single-end reads.  The reads were aligned using the Merman aligner in the BSmooth suite (\url{http://www.rafalab.org/bsmooth}).  See the primary publication for more details \cite{Hansen:2011}.
31 23
 
32 24
 This data is contained in the \Rpackage{bsseqData}
33 25
 
34
-<<load,results=hide>>=
26
+<<load,warning=FALSE,message=FALSE>>=
35 27
 library(bsseq)
36 28
 library(bsseqData)
37 29
 @ 
38 30
 
39
-The \Rpackage{bsseqData} contains a script, \texttt{inst/script/create\_BS.cancer.R}, describing how
40
-this data is created from the Merman alignment output (also contained in the package).  Note that
41
-the current version of the BSmooth pipeline uses a slightly different alignment output format.
31
+The \Rpackage{bsseqData} contains a script, \texttt{inst/script/create\_BS.cancer.R}, describing how this data is created from the Merman alignment output (also contained in the package).  Note that the current version of the BSmooth pipeline uses a slightly different alignment output format.
42 32
 
43 33
 The following object contains the unsmoothed ``raw'' summarized alignment data.
44 34
 <<showData>>=
... ...
@@ -49,7 +39,7 @@ pData(BS.cancer.ex)
49 39
 @ 
50 40
 
51 41
 If you use this package, please cite our BSmooth paper
52
-<<citation,echo=FALSE,results=tex>>=
42
+<<citation,echo=FALSE,results="asis">>=
53 43
 print(citation("bsseq"), style = "latex")
54 44
 @
55 45
 
... ...
@@ -72,18 +62,9 @@ BS.cancer.ex.fit <- updateObject(BS.cancer.ex.fit)
72 62
 BS.cancer.ex.fit
73 63
 @ 
74 64
 
75
-This step uses parallelization where each sample is run on a separate core using \Rcode{mclapply}
76
-from the \Rpackage{parallel} package.  This form of parallelization is built into \Rpackage{bsseq},
77
-and (as written) requires access to a machine with 6 cores and enough RAM.  The smoothing step is
78
-being done completely independently on each sample, so if you have a lot of samples (or other
79
-circumstances), an alternative is to split the computations manually.  A later subsection shows some
80
-example code for doing that.
65
+This step uses parallelization where each sample is run on a separate core using \Rcode{mclapply} from the \Rpackage{parallel} package.  This form of parallelization is built into \Rpackage{bsseq}, and (as written) requires access to a machine with 6 cores and enough RAM.  The smoothing step is being done completely independently on each sample, so if you have a lot of samples (or other circumstances), an alternative is to split the computations manually.  A later subsection shows some example code for doing that.
81 66
 
82
-Let us discuss coverage and representation.  The \Rcode{BS.cancer.ex} object contains all annotated
83
-CpGs on human chromosome 21 and 22, whether or not there is actual data.  Since we have multiple
84
-samples, we can roughly divide the genome into 3 categories: CpGs where all samples have data, CpGs
85
-where none of the samples have data and CpGs where some, but not all, of the samples have data.
86
-Examining the object at hand, we get
67
+Let us discuss coverage and representation.  The \Rcode{BS.cancer.ex} object contains all annotated CpGs on human chromosome 21 and 22, whether or not there is actual data.  Since we have multiple samples, we can roughly divide the genome into 3 categories: CpGs where all samples have data, CpGs where none of the samples have data and CpGs where some, but not all, of the samples have data.  Examining the object at hand, we get
87 68
 
88 69
 <<cpgNumbers>>=
89 70
 ## The average coverage of CpGs on the two chromosomes
... ...
@@ -174,17 +155,14 @@ load("BS1.fit.rda")
174 155
 load("BS2.fit.rda")
175 156
 BS.fit <- combine(BS1.fit, BS2.fit)
176 157
 @ 
177
-This still requires that you have one node with enough RAM to hold all samples in memory. 
158
+
159
+This still requires that you have one node with enough RAM to hold all samples in memory.
178 160
 
179 161
 
180 162
 \section*{Computing t-statistics}
181 163
 
182
-Before computing t-statistics, we will remove CpGs with little or no coverage.  If this is not done,
183
-you may find many DMRs in areas of the genome with very little coverage, which are most likely false
184
-positives.  It is open to personal preferences exactly which CpGs to remove, but for this analysis
185
-we will only keep CpGs where at least 2 cancer samples and at least 2 normal samples have at least
186
-2x in coverage.  For readability, we store the coverage in a separate matrix (this is just due to
187
-line breaks in Sweave)
164
+Before computing t-statistics, we will remove CpGs with little or no coverage.  If this is not done, you may find many DMRs in areas of the genome with very little coverage, which are most likely false positives.  It is open to personal preferences exactly which CpGs to remove, but for this analysis we will only keep CpGs where at least 2 cancer samples and at least 2 normal samples have at least 2x in coverage.  For readability, we store the coverage in a separate matrix (this is just due to line breaks in Sweave)
165
+
188 166
 <<keepLoci>>=
189 167
 BS.cov <- getCoverage(BS.cancer.ex.fit)
190 168
 keepLoci.ex <- which(rowSums(BS.cov[, BS.cancer.ex$Type == "cancer"] >= 2) >= 2 &
... ...
@@ -192,8 +170,8 @@ keepLoci.ex <- which(rowSums(BS.cov[, BS.cancer.ex$Type == "cancer"] >= 2) >= 2
192 170
 length(keepLoci.ex)
193 171
 BS.cancer.ex.fit <- BS.cancer.ex.fit[keepLoci.ex,]
194 172
 @ 
195
-(the \Rcode{keepLoci.ex}  is also available for direct inspection in the \Rpackage{bsseqData}
196
-package.)
173
+
174
+(the \Rcode{keepLoci.ex} is also available for direct inspection in the \Rpackage{bsseqData} package.)
197 175
 
198 176
 
199 177
 We are now ready to compute t-statistics, by
... ...
@@ -206,36 +184,22 @@ BS.cancer.ex.tstat <- BSmooth.tstat(BS.cancer.ex.fit,
206 184
                                     verbose = TRUE)
207 185
 BS.cancer.ex.tstat
208 186
 @ 
209
-(the \Rcode{BS.cancer.ex.tstat}  is also available for direct inspection in the \Rpackage{bsseqData}
210
-package.)
211 187
 
188
+(the \Rcode{BS.cancer.ex.tstat} is also available for direct inspection in the \Rpackage{bsseqData} package.)
212 189
 
213 190
 
214
-The arguments to \Rcode{BSmooth.tstat} are simple.  \Rcode{group1} and \Rcode{group2} contain the
215
-sample names of the two groups being compared (it is always group1 - group2), and indices may be
216
-used instead of sample names.  \Rcode{estimate.var} describes which samples are being used to
217
-estimate the variability.  Because this is a cancer dataset, and cancer have higher variability than
218
-normals, we only use the normal samples to estimate the variability.  Other choices of
219
-\Rcode{estimate.var} are \Rcode{same} (assume same variability in each group) and \Rcode{paired} (do
220
-a paired t-test).  The argument \Rcode{local.correct} describes whether we should use a large-scale
221
-(low-frequency) mean correction.  This is especially important in cancer where we have found many
222
-large-scale methylation differences between cancer and normals.
191
+
192
+The arguments to \Rcode{BSmooth.tstat} are simple.  \Rcode{group1} and \Rcode{group2} contain the sample names of the two groups being compared (it is always group1 - group2), and indices may be used instead of sample names.  \Rcode{estimate.var} describes which samples are being used to estimate the variability.  Because this is a cancer dataset, and cancers have higher variability than normals, we only use the normal samples to estimate the variability.  Other choices of \Rcode{estimate.var} are \Rcode{same} (assume same variability in each group) and \Rcode{paired} (do a paired t-test).  The argument \Rcode{local.correct} describes whether we should use a large-scale (low-frequency) mean correction.  This is especially important in cancer where we have found many large-scale methylation differences between cancer and normals.
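As an editorial illustration only (not part of this commit or the vignette), a paired comparison on the same data might look like the chunk below; the sample names are hypothetical.

<<tstatPaired,eval=FALSE>>=
## Hedged example with hypothetical sample names; with estimate.var = "paired",
## group1[i] is paired with group2[i].
BSmooth.tstat(BS.cancer.ex.fit,
              group1 = c("C1", "C2", "C3"),
              group2 = c("N1", "N2", "N3"),
              estimate.var = "paired",
              local.correct = TRUE, verbose = TRUE)
@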
223 193
 
224 194
 We can look at the marginal distribution of the t-statistic by
225 195
 
226
-\setkeys{Gin}{width=0.5\textwidth}
227
-<<plotTstat,fig=TRUE,width=5,height=5>>=
196
+<<plotTstat,fig.width=4,fig.height=4>>=
228 197
 plot(BS.cancer.ex.tstat)
229 198
 @ 
230
-\setkeys{Gin}{width=0.8\textwidth}
231
-
232 199
 
233
-The ``blocks'' of hypomethylation are clearly visible in the marginal distribution of the
234
-uncorrected t-statistics.
200
+The ``blocks'' of hypomethylation are clearly visible in the marginal distribution of the uncorrected t-statistics.
235 201
 
236
-Even in comparisons where we do not observe these large-scale methylation differences, it often
237
-improves the marginal distribution of the t-statistics to locally correct them (``improves'' in the
238
-sense of making them more symmetric).
202
+Even in comparisons where we do not observe these large-scale methylation differences, it often improves the marginal distribution of the t-statistics to locally correct them (``improves'' in the sense of making them more symmetric).
239 203
 
240 204
 \section*{Finding DMRs}
241 205
 
... ...
@@ -276,20 +240,14 @@ pData(BS.cancer.ex.fit) <- pData
276 240
 
277 241
 Once this is setup, we can plot a single DMR like
278 242
 
279
-\setkeys{Gin}{width=1\textwidth}
280
-<<plotDmr,fig=TRUE,width=10,height=5>>=
243
+<<plotDmr,fig.width=8,fig.height=4>>=
281 244
 plotRegion(BS.cancer.ex.fit, dmrs[1,], extend = 5000, addRegions = dmrs)
282 245
 @ 
283
-\setkeys{Gin}{width=0.8\textwidth}
284
-\vspace*{-6\baselineskip}
285 246
 
286
-\Rcode{extend} tells us how many bp to extend to either side of the plotting region.
287
-\Rcode{addRegions} is a \Rclass{data.frame} or \Rcode{GRanges} listing additional regions that
288
-should be highlighted.
247
+\Rcode{extend} tells us how many bp to extend to either side of the plotting region.  \Rcode{addRegions} is a \Rclass{data.frame} or \Rcode{GRanges} listing additional regions that should be highlighted.
248
+
249
+Typically, we plot hundreds of DMRs in a single PDF file and use external tools to look at them.  For this purpose, \Rcode{plotManyRegions} is very useful since it is much faster than plotting individual DMRs with \Rcode{plotRegion}.  An example (not run) is
289 250
 
290
-Typically, we plot hundreds of DMRs in a single PDF file and use external tools to look at them.
291
-For this purpose, \Rcode{plotManyRegions} is very useful since it is much faster than plotting
292
-individual DMRs with \Rcode{plotRegion}.  An example (not run) is
293 251
 <<plotManyRegions,eval=FALSE>>=
294 252
 pdf(file = "dmrs_top200.pdf", width = 10, height = 5)
295 253
 plotManyRegions(BS.cancer.ex.fit, dmrs[1:200,], extend = 5000, 
... ...
@@ -331,14 +289,13 @@ space and would add complications to the internal data structures.
331 289
 
332 290
 \section*{SessionInfo}
333 291
 
334
-<<sessionInfo,results=tex,eval=TRUE,echo=FALSE>>=
292
+<<sessionInfo,results="asis",eval=TRUE,echo=FALSE>>=
335 293
 toLatex(sessionInfo())
336 294
 @ 
337 295
 
338 296
 \end{document}
339 297
 
340 298
 % Local Variables:
341
-% eval: (add-hook 'LaTeX-mode-hook '(lambda () (if (string= (buffer-name) "bsseq_analysis.Rnw") (setq fill-column 100))))
342 299
 % LocalWords: LocalWords bisulfite methylation methylated CpG CpGs DMR bsseq bp 
343 300
 % LocalWords: DMRs differentially ABI SOLiD dataset WGBS BSmooth 
344 301
 % LocalWords: parallelization Bioconductor hypomethylation genomic pos PDF thresholding