Browse code

Improve `findCytosines()`

- Make generic with methods for _BSgenome_ and _DNAStringSet_ (e.g., a FASTA file imported with `rtracklayer::import()`)

Peter Hickey authored on 13/06/2018 14:17:09
Showing 3 changed files

... ...
@@ -31,7 +31,9 @@ Imports:
31 31
     DelayedArray (>= 0.5.34),
32 32
     Rcpp,
33 33
     BiocParallel,
34
-    readr
34
+    readr,
35
+    BSgenome,
36
+    Biostrings
35 37
 Suggests:
36 38
     testthat,
37 39
     bsseqData,
... ...
@@ -42,6 +42,8 @@ import(DelayedArray)
42 42
 import(BiocParallel)
43 43
 importFrom(readr, "cols", "cols_only", "col_character", "col_integer",
44 44
            "col_skip", "col_double", "col_factor", "read_tsv", "tokenizer_tsv")
45
+importFrom(Biostrings, "DNAString", "vmatchPattern", "reverseComplement")
46
+
45 47
 ##
46 48
 ## Exporting
47 49
 ##
... ...
@@ -66,7 +68,8 @@ exportMethods("[", "show",
66 68
               "sampleNames", "sampleNames<-",
67 69
               "pData", "pData<-",
68 70
               "findOverlaps", "subsetByOverlaps",
69
-              "combine", "updateObject")
71
+              "combine", "updateObject",
72
+              "findCytosines")
70 73
 
71 74
 export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
72 75
        "collapseBSseq", "orderBSseq", "hasBeenSmoothed", "chrSelectBSseq",
... ...
@@ -76,7 +79,8 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
76 79
        "read.umtab", "read.umtab2", "read.bsmooth", "read.bismark",
77 80
        "poissonGoodnessOfFit", "binomialGoodnessOfFit",
78 81
        "data.frame2GRanges", "BSseqTstat",
79
-       "BSseqStat")
82
+       "BSseqStat",
83
+       "findCytosines")
80 84
 
81 85
 S3method("print", "chisqGoodnessOfFit")
82 86
 S3method("plot", "chisqGoodnessOfFit")
... ...
@@ -1,75 +1,56 @@
1
-# TODO: Need to update NAMESPACE
2
-# TODO: Search both forward and reverse strands in a single search.
3
-# TODO: Default value of seqlevels as missing or NULL? Check what other
4
-#       functions with this arg use.
5
-findCytosines <- function(BSgenome, context = c("CG", "CA", "CC", "CT"),
6
-                          seqlevels, verbose = getOption("verbose")) {
7
-    stopifnot(is(BSgenome, "BSgenome"))
8
-    context <- match.arg(context)
9
-    context <- DNAString(context)
10
-    seqinfo <- seqinfo(BSgenome)
11
-    if (missing(seqlevels)) {
12
-        seqlevels <- seqlevels(seqinfo)
13
-    } else {
14
-        seqinfo <- seqinfo[seqlevels]
1
+# Exported generics ------------------------------------------------------------
2
+
3
+setGeneric(
4
+    "findCytosines",
5
+    function(x, context, seqlevels) standardGeneric("findCytosines"),
6
+    signature = "x")
7
+
8
+# Exported methods -------------------------------------------------------------
9
+
10
+setMethod(
11
+    "findCytosines",
12
+    "BSgenome",
13
+    function(x, context, seqlevels = seqlevels(x)) {
14
+        # NOTE: vmatchPattern,BSgenome-method returns a GRanges instance.
15
+        gr <- vmatchPattern(
16
+            pattern = context,
17
+            subject = x,
18
+            exclude = setdiff(seqlevels(x), seqlevels),
19
+            fixed = "subject")
20
+        # NOTE: Want just the position of the cytosine.
21
+        resize(gr, width = 1L, fix = "start")
22
+    }
23
+)
24
+
25
+# TODO: Replace with vmatchPDict() when/if this is available, where the pdict
26
+#       argument is a DNAStringSet including the original context and its
27
+#       reverse complement.
28
+setMethod(
29
+    "findCytosines",
30
+    "DNAStringSet",
31
+    function(x, context, seqlevels = seqlevels(x)) {
32
+        context <- DNAString(context)
33
+        x <- x[seqlevels]
34
+        fwd_gr <- as(
35
+            vmatchPattern(
36
+                pattern = context,
37
+                subject = x,
38
+                fixed = "subject"),
39
+            "GRanges")
40
+        strand(fwd_gr) <- "+"
41
+        rev_gr <- as(
42
+            vmatchPattern(
43
+                pattern = reverseComplement(context),
44
+                subject = x,
45
+                fixed = "subject"),
46
+            "GRanges")
47
+        strand(rev_gr) <- "-"
48
+        gr <- c(fwd_gr, rev_gr)
49
+        # NOTE: Want just the position of the cytosine.
50
+        resize(gr, width = 1L, fix = "start")
15 51
     }
16
-    bsparams <- new(
17
-        "BSParams",
18
-        X = BSgenome,
19
-        FUN = matchPattern,
20
-        exclude = setdiff(seqlevels(BSgenome), seqlevels))
21
-    current_verbose <- getOption("verbose")
22
-    options(verbose = verbose)
23
-    on.exit(options(verbose = current_verbose), add = TRUE)
24
-    fwd_strand <- IRangesList(
25
-        endoapply(bsapply(bsparams, pattern = context), as, "IRanges"))
26
-    n <- sum(as.numeric(lengths(fwd_strand)))
27
-    fwd_strand <- .FWGRanges(
28
-        seqnames = Rle(
29
-            factor(names(fwd_strand), levels = seqlevels(seqinfo)),
30
-            lengths(fwd_strand)),
31
-        ranges = .FWIRanges(
32
-            start = unlist(start(fwd_strand), use.names = FALSE),
33
-            width = 1L),
34
-        strand = Rle(strand("+"), n),
35
-        seqinfo = seqinfo,
36
-        elementMetadata = S4Vectors:::make_zero_col_DataFrame(n))
37
-    rev_strand <- IRangesList(
38
-        endoapply(
39
-            bsapply(bsparams, pattern = reverseComplement(context)),
40
-            as,
41
-            "IRanges"))
42
-    n <- sum(as.numeric(lengths(rev_strand)))
43
-    rev_strand <- .FWGRanges(
44
-        seqnames = Rle(
45
-            factor(names(rev_strand), levels = seqlevels(seqinfo)),
46
-            lengths(rev_strand)),
47
-        ranges = .FWIRanges(
48
-            start = unlist(start(rev_strand), use.names = FALSE),
49
-            width = 1L),
50
-        strand = Rle(strand("-"), n),
51
-        seqinfo = seqinfo,
52
-        elementMetadata = S4Vectors:::make_zero_col_DataFrame(n))
53
-    sort(c(fwd_strand, rev_strand))
54
-}
52
+)
55 53
 
56
-# TODO: findCytosines from a FASTA file (e.g., for lambda phage)
57
-# lambda <- import("http://www.ebi.ac.uk/ena/data/view/J02459&display=fasta&download=fasta&filename=J02459.fasta")
54
+# TODOs ------------------------------------------------------------------------
58 55
 
59
-#
60
-# hg19_si <- suppressWarnings(keepStandardChromosomes(
61
-#     seqinfo(BSgenome.Hsapiens.UCSC.hg19)))
62
-# params <- new("BSParams",
63
-#               X = BSgenome.Hsapiens.UCSC.hg19,
64
-#               FUN = matchPattern,
65
-#               exclude = setdiff(seqnames(BSgenome.Hsapiens.UCSC.hg19),
66
-#                                 seqnames(hg19_si)))
67
-# options(verbose = TRUE)
68
-# hg19_cpgs <- IRangesList(
69
-#     endoapply(bsapply(params, pattern = "CG"), as, "IRanges"))
70
-# options(verbose = FALSE)
71
-# hg19_cpgs <- GRanges(seqnames = Rle(names(hg19_cpgs), lengths(hg19_cpgs)),
72
-#                      ranges = unlist(hg19_cpgs, use.names = FALSE),
73
-#                      strand = "*",
74
-#                      seqinfo = hg19_si)
75
-# width(hg19_cpgs) <- 1L
56
+# TODO: Default value of seqlevels isn't working.