Browse code

Rename findCytosines as findLoci and document

Peter Hickey authored on 17/09/2018 11:02:52
Showing 5 changed files

... ...
@@ -68,7 +68,7 @@ Collate:
68 68
     collapseBSseq.R
69 69
     FWIRanges-class.R
70 70
     FWGRanges-class.R
71
-    findCytosines.R
71
+    findLoci.R
72 72
 License: Artistic-2.0
73 73
 VignetteBuilder: knitr
74 74
 URL: https://github.com/kasperdanielhansen/bsseq
... ...
@@ -71,8 +71,7 @@ exportMethods("[", "show",
71 71
               "sampleNames", "sampleNames<-",
72 72
               "pData", "pData<-",
73 73
               "findOverlaps", "subsetByOverlaps",
74
-              "combine", "updateObject",
75
-              "findCytosines")
74
+              "combine", "updateObject")
76 75
 
77 76
 export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
78 77
        "collapseBSseq", "orderBSseq", "hasBeenSmoothed", "chrSelectBSseq",
... ...
@@ -83,7 +82,7 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats",
83 82
        "poissonGoodnessOfFit", "binomialGoodnessOfFit",
84 83
        "data.frame2GRanges", "BSseqTstat",
85 84
        "BSseqStat",
86
-       "findCytosines")
85
+       "findLoci")
87 86
 
88 87
 S3method("print", "chisqGoodnessOfFit")
89 88
 S3method("plot", "chisqGoodnessOfFit")
90 89
deleted file mode 100644
... ...
@@ -1,73 +0,0 @@
1
-# Exported generics ------------------------------------------------------------
2
-
3
-setGeneric(
4
-    "findCytosines",
5
-    function(x, context, seqlevels,
6
-             strand = c("*", "+", "-")) standardGeneric("findCytosines"),
7
-    signature = "x")
8
-
9
-# Exported methods -------------------------------------------------------------
10
-
11
-setMethod(
12
-    "findCytosines",
13
-    "BSgenome",
14
-    function(x, context, seqlevels = seqlevels(x), strand = c("*", "+", "-")) {
15
-        strand <- match.arg(strand)
16
-        # NOTE: vmatchPattern,BSgenome-method returns a GRanges instance and
17
-        #       automatically checks both forward and reverse strands.
18
-        gr <- vmatchPattern(
19
-            pattern = context,
20
-            subject = x,
21
-            exclude = setdiff(seqlevels(x), seqlevels),
22
-            fixed = "subject")
23
-        if (strand == "+") {
24
-            gr <- gr[strand(gr) == "+"]
25
-        } else if (strand == "-") {
26
-            gr <- gr[strand(gr) == "-"]
27
-        }
28
-        # NOTE: Want just the position of the cytosine.
29
-        resize(gr, width = 1L, fix = "start")
30
-    }
31
-)
32
-
33
-# TODO: Replace with vmatchPDict() when/if this is available, where the pdict
34
-#       argument is a DNAStringSet including the original context and its
35
-#       reverse complement.
36
-setMethod(
37
-    "findCytosines",
38
-    "DNAStringSet",
39
-    function(x, context, seqlevels = seqlevels(x), strand = c("*", "+", "-")) {
40
-        context <- DNAString(context)
41
-        x <- x[seqlevels]
42
-        strand <- match.arg(strand)
43
-        if (strand %in% c("*", "+")) {
44
-            fwd_gr <- as(
45
-                vmatchPattern(
46
-                    pattern = context,
47
-                    subject = x,
48
-                    fixed = "subject"),
49
-                "GRanges")
50
-            strand(fwd_gr) <- "+"
51
-        } else {
52
-            fwd_gr <- GRanges()
53
-        }
54
-        if (strand %in% c("*", "-")) {
55
-            rev_gr <- as(
56
-                vmatchPattern(
57
-                    pattern = reverseComplement(context),
58
-                    subject = x,
59
-                    fixed = "subject"),
60
-                "GRanges")
61
-            strand(rev_gr) <- "-"
62
-        } else {
63
-            rev_gr <- GRanges()
64
-        }
65
-        gr <- c(fwd_gr, rev_gr)
66
-        # NOTE: Want just the position of the cytosine.
67
-        resize(gr, width = 1L, fix = "start")
68
-    }
69
-)
70
-
71
-# TODOs ------------------------------------------------------------------------
72
-
73
-# TODO: Default value of seqlevels isn't working.
74 0
new file mode 100644
... ...
@@ -0,0 +1,73 @@
1
+# Exported functions -----------------------------------------------------------
2
+
3
+findLoci <- function(pattern, subject, include = seqlevels(subject),
4
+                     strand = c("*", "+", "-"), fixed = "subject",
5
+                     resize = TRUE) {
6
+    pattern <- DNAString(pattern)
7
+    strand <- match.arg(strand)
8
+
9
+    if (is.character(subject) || is(subject, "RTLFile")) {
10
+        if (!requireNamespace("rtracklayer", quietly = TRUE)) {
11
+            stop("rtracklayer package required for importing 'subject'.")
12
+        }
13
+        subject <- try(rtracklayer::import(subject), silent = TRUE)
14
+        if (!is(subject, "DNAStringSet")) {
15
+            stop("Unable to import 'subject' as a DNAStringSet.")
16
+        }
17
+    }
18
+
19
+    if (is(subject, "BSgenome")) {
20
+        # NOTE: vmatchPattern,BSgenome-method returns a GRanges instance and
21
+        #       automatically checks both forward and reverse strands.
22
+        gr <- vmatchPattern(
23
+            pattern = pattern,
24
+            subject = subject,
25
+            # NOTE: This must be a regular expression;
26
+            #       see https://github.com/Bioconductor/BSgenome/issues/1.
27
+            exclude = paste0("^", setdiff(seqnames(subject), include), "$"),
28
+            fixed = fixed)
29
+        if (identical(strand, "+")) {
30
+            gr <- gr[strand(gr) == "+"]
31
+        } else if (identical(strand, "-")) {
32
+            gr <- gr[strand(gr) == "-"]
33
+        }
34
+    } else if (is(subject, "DNAStringSet")) {
35
+        subject <- subject[include]
36
+        if (strand %in% c("*", "+")) {
37
+            fwd_gr <- as(
38
+                vmatchPattern(
39
+                    pattern = pattern,
40
+                    subject = subject,
41
+                    fixed = fixed),
42
+                "GRanges")
43
+            strand(fwd_gr) <- "+"
44
+        } else {
45
+            fwd_gr <- GRanges()
46
+        }
47
+        if (strand %in% c("*", "-")) {
48
+            rev_gr <- as(
49
+                vmatchPattern(
50
+                    pattern = reverseComplement(pattern),
51
+                    subject = subject,
52
+                    fixed = fixed),
53
+                "GRanges")
54
+            strand(rev_gr) <- "-"
55
+        } else {
56
+            rev_gr <- GRanges()
57
+        }
58
+        gr <- c(fwd_gr, rev_gr)
59
+    } else {
60
+        stop("Cannot handle 'subject' of class ", class(subject), ".")
61
+    }
62
+
63
+    if (resize) {
64
+        gr <- resize(gr, width = 1L, fix = "start")
65
+    }
66
+    gr
67
+}
68
+
69
+# TODOs ------------------------------------------------------------------------
70
+
71
+# TODO: Default value of seqlevels may not be working.
72
+# TODO: What happens if subject is a non-filepath character (e.g., "CATGCG") or
73
+#       a DNAString?
0 74
new file mode 100644
... ...
@@ -0,0 +1,76 @@
1
+\name{findLoci}
2
+\alias{findLoci}
3
+\title{
4
+  Find methylation loci in a genome
5
+}
6
+\description{
7
+    This is a convenience function to find methylation loci, such as CpGs, in a reference genome.
8
+    The result is useful as the value of the \code{loci} argument for \code{\link{read.bismark}()}.
9
+}
10
+\usage{
11
+findLoci(pattern,
12
+         subject,
13
+         include = seqlevels(subject),
14
+         strand = c("*", "+", "-"),
15
+         fixed = "subject",
16
+         resize = TRUE)
17
+}
18
+\arguments{
19
+  \item{pattern}{
20
+    A string specifying the pattern to search for, e.g. \code{"CG"}.
21
+    Can contain IUPAC ambiguity codes, e.g., \code{"CH"}.
22
+  }
23
+  \item{subject}{
24
+    A string containing a file path to the genome sequence, in FASTA or 2bit format, to be searched.
25
+    Alternatively, a \linkS4class{BSgenome} or \linkS4class{DNAStringSet} object storing the genome sequence to be searched.
26
+  }
27
+  \item{include}{
28
+    A character vector indicating the seqlevels of \code{subject} to be used.
29
+  }
30
+  \item{strand}{
31
+    A character scaler specifying the strand of \code{subject} to be used.
32
+    If \code{strand = "*"}, then both the positive (\code{strand = "+"}) and negative (\code{strand = "-"} strands will be searched.)
33
+    It is assumed that \code{subject} contains the sequence with respect to the \code{+} strand.
34
+  }
35
+  \item{fixed}{
36
+    If \code{"subject"} (the default), IUPAC ambiguity codes in the pattern only are interpreted as wildcards, e.g., a \code{pattern} containing \code{CH} matches a \code{subject} containing \code{CA} but not vice versa.
37
+    See \code{?Biostrings::`\link[Biostrings]{lowlevel-matching}`} for more information
38
+  }
39
+  \item{resize}{
40
+    A logical scalar specifying whether the ranges should be shifted to have width 1 and anchored by the start of the locus, e.g., resize a CpG dinucleotide to give the co-ordinates of the cytosine.
41
+  }
42
+}
43
+\details{
44
+  This function provides a convenience wrapper for finding methylation loci in a genome, based on running \code{\link[Biostrings]{vmatchPattern}()}.
45
+  Users requiring finer-grained control should directly use the \code{\link[Biostrings]{vmatchPattern}()} function and coerce the result to a \linkS4class{GRanges} object.
46
+}
47
+\value{
48
+  A \linkS4class{GRanges} object storing the found loci.
49
+}
50
+\author{
51
+  Peter F. Hickey
52
+}
53
+\seealso{
54
+  \itemize{
55
+    \item \code{Biostrings::\link[Biostrings]{vmatchPattern}()}
56
+    \item \code{?BSgenome::`\link[BSgenome]{BSgenome-utils}`}
57
+    }
58
+}
59
+\examples{
60
+  library(Biostrings)
61
+  # Find CpG dinucleotides on the both strands of an artificial sequence
62
+  my_seq <- DNAStringSet("ATCAGTCGC")
63
+  names(my_seq) <- "test"
64
+  findLoci(pattern = "CG", subject = my_seq)
65
+  # Find CHG trinucleotides on the both strands of an artificial sequence
66
+  findLoci(pattern = "CHG", subject = my_seq)
67
+
68
+  # Find CpG dinucleotides on the + strand of chr17 from the human genome (hg38)
69
+  if (requireNamespace("BSgenome.Hsapiens.UCSC.hg38")) {
70
+    findLoci(
71
+        pattern = "CG",
72
+        subject = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,
73
+        include = "chr17",
74
+        strand = "+")
75
+  }
76
+}