... | ... |
@@ -71,8 +71,7 @@ exportMethods("[", "show", |
71 | 71 |
"sampleNames", "sampleNames<-", |
72 | 72 |
"pData", "pData<-", |
73 | 73 |
"findOverlaps", "subsetByOverlaps", |
74 |
- "combine", "updateObject", |
|
75 |
- "findCytosines") |
|
74 |
+ "combine", "updateObject") |
|
76 | 75 |
|
77 | 76 |
export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats", |
78 | 77 |
"collapseBSseq", "orderBSseq", "hasBeenSmoothed", "chrSelectBSseq", |
... | ... |
@@ -83,7 +82,7 @@ export("BSseq", "getMeth", "getCoverage", "getBSseq", "getStats", |
83 | 82 |
"poissonGoodnessOfFit", "binomialGoodnessOfFit", |
84 | 83 |
"data.frame2GRanges", "BSseqTstat", |
85 | 84 |
"BSseqStat", |
86 |
- "findCytosines") |
|
85 |
+ "findLoci") |
|
87 | 86 |
|
88 | 87 |
S3method("print", "chisqGoodnessOfFit") |
89 | 88 |
S3method("plot", "chisqGoodnessOfFit") |
90 | 89 |
deleted file mode 100644 |
... | ... |
@@ -1,73 +0,0 @@ |
1 |
-# Exported generics ------------------------------------------------------------ |
|
2 |
- |
|
3 |
-setGeneric( |
|
4 |
- "findCytosines", |
|
5 |
- function(x, context, seqlevels, |
|
6 |
- strand = c("*", "+", "-")) standardGeneric("findCytosines"), |
|
7 |
- signature = "x") |
|
8 |
- |
|
9 |
-# Exported methods ------------------------------------------------------------- |
|
10 |
- |
|
11 |
-setMethod( |
|
12 |
- "findCytosines", |
|
13 |
- "BSgenome", |
|
14 |
- function(x, context, seqlevels = seqlevels(x), strand = c("*", "+", "-")) { |
|
15 |
- strand <- match.arg(strand) |
|
16 |
- # NOTE: vmatchPattern,BSgenome-method returns a GRanges instance and |
|
17 |
- # automatically checks both forward and reverse strands. |
|
18 |
- gr <- vmatchPattern( |
|
19 |
- pattern = context, |
|
20 |
- subject = x, |
|
21 |
- exclude = setdiff(seqlevels(x), seqlevels), |
|
22 |
- fixed = "subject") |
|
23 |
- if (strand == "+") { |
|
24 |
- gr <- gr[strand(gr) == "+"] |
|
25 |
- } else if (strand == "-") { |
|
26 |
- gr <- gr[strand(gr) == "-"] |
|
27 |
- } |
|
28 |
- # NOTE: Want just the position of the cytosine. |
|
29 |
- resize(gr, width = 1L, fix = "start") |
|
30 |
- } |
|
31 |
-) |
|
32 |
- |
|
33 |
-# TODO: Replace with vmatchPDict() when/if this is available, where the pdict |
|
34 |
-# argument is a DNAStringSet including the original context and its |
|
35 |
-# reverse complement. |
|
36 |
-setMethod( |
|
37 |
- "findCytosines", |
|
38 |
- "DNAStringSet", |
|
39 |
- function(x, context, seqlevels = seqlevels(x), strand = c("*", "+", "-")) { |
|
40 |
- context <- DNAString(context) |
|
41 |
- x <- x[seqlevels] |
|
42 |
- strand <- match.arg(strand) |
|
43 |
- if (strand %in% c("*", "+")) { |
|
44 |
- fwd_gr <- as( |
|
45 |
- vmatchPattern( |
|
46 |
- pattern = context, |
|
47 |
- subject = x, |
|
48 |
- fixed = "subject"), |
|
49 |
- "GRanges") |
|
50 |
- strand(fwd_gr) <- "+" |
|
51 |
- } else { |
|
52 |
- fwd_gr <- GRanges() |
|
53 |
- } |
|
54 |
- if (strand %in% c("*", "-")) { |
|
55 |
- rev_gr <- as( |
|
56 |
- vmatchPattern( |
|
57 |
- pattern = reverseComplement(context), |
|
58 |
- subject = x, |
|
59 |
- fixed = "subject"), |
|
60 |
- "GRanges") |
|
61 |
- strand(rev_gr) <- "-" |
|
62 |
- } else { |
|
63 |
- rev_gr <- GRanges() |
|
64 |
- } |
|
65 |
- gr <- c(fwd_gr, rev_gr) |
|
66 |
- # NOTE: Want just the position of the cytosine. |
|
67 |
- resize(gr, width = 1L, fix = "start") |
|
68 |
- } |
|
69 |
-) |
|
70 |
- |
|
71 |
-# TODOs ------------------------------------------------------------------------ |
|
72 |
- |
|
73 |
-# TODO: Default value of seqlevels isn't working. |
74 | 0 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,73 @@ |
1 |
+# Exported functions ----------------------------------------------------------- |
|
2 |
+ |
|
3 |
+findLoci <- function(pattern, subject, include = seqlevels(subject), |
|
4 |
+ strand = c("*", "+", "-"), fixed = "subject", |
|
5 |
+ resize = TRUE) { |
|
6 |
+ pattern <- DNAString(pattern) |
|
7 |
+ strand <- match.arg(strand) |
|
8 |
+ |
|
9 |
+ if (is.character(subject) || is(subject, "RTLFile")) { |
|
10 |
+ if (!requireNamespace("rtracklayer", quietly = TRUE)) { |
|
11 |
+ stop("rtracklayer package required for importing 'subject'.") |
|
12 |
+ } |
|
13 |
+ subject <- try(rtracklayer::import(subject), silent = TRUE) |
|
14 |
+ if (!is(subject, "DNAStringSet")) { |
|
15 |
+ stop("Unable to import 'subject' as a DNAStringSet.") |
|
16 |
+ } |
|
17 |
+ } |
|
18 |
+ |
|
19 |
+ if (is(subject, "BSgenome")) { |
|
20 |
+ # NOTE: vmatchPattern,BSgenome-method returns a GRanges instance and |
|
21 |
+ # automatically checks both forward and reverse strands. |
|
22 |
+ gr <- vmatchPattern( |
|
23 |
+ pattern = pattern, |
|
24 |
+ subject = subject, |
|
25 |
+ # NOTE: This must be a regular expression; |
|
26 |
+ # see https://github.com/Bioconductor/BSgenome/issues/1. |
|
27 |
+ exclude = paste0("^", setdiff(seqnames(subject), include), "$"), |
|
28 |
+ fixed = fixed) |
|
29 |
+ if (identical(strand, "+")) { |
|
30 |
+ gr <- gr[strand(gr) == "+"] |
|
31 |
+ } else if (identical(strand, "-")) { |
|
32 |
+ gr <- gr[strand(gr) == "-"] |
|
33 |
+ } |
|
34 |
+ } else if (is(subject, "DNAStringSet")) { |
|
35 |
+ subject <- subject[include] |
|
36 |
+ if (strand %in% c("*", "+")) { |
|
37 |
+ fwd_gr <- as( |
|
38 |
+ vmatchPattern( |
|
39 |
+ pattern = pattern, |
|
40 |
+ subject = subject, |
|
41 |
+ fixed = fixed), |
|
42 |
+ "GRanges") |
|
43 |
+ strand(fwd_gr) <- "+" |
|
44 |
+ } else { |
|
45 |
+ fwd_gr <- GRanges() |
|
46 |
+ } |
|
47 |
+ if (strand %in% c("*", "-")) { |
|
48 |
+ rev_gr <- as( |
|
49 |
+ vmatchPattern( |
|
50 |
+ pattern = reverseComplement(pattern), |
|
51 |
+ subject = subject, |
|
52 |
+ fixed = fixed), |
|
53 |
+ "GRanges") |
|
54 |
+ strand(rev_gr) <- "-" |
|
55 |
+ } else { |
|
56 |
+ rev_gr <- GRanges() |
|
57 |
+ } |
|
58 |
+ gr <- c(fwd_gr, rev_gr) |
|
59 |
+ } else { |
|
60 |
+ stop("Cannot handle 'subject' of class ", class(subject), ".") |
|
61 |
+ } |
|
62 |
+ |
|
63 |
+ if (resize) { |
|
64 |
+ gr <- resize(gr, width = 1L, fix = "start") |
|
65 |
+ } |
|
66 |
+ gr |
|
67 |
+} |
|
68 |
+ |
|
69 |
+# TODOs ------------------------------------------------------------------------ |
|
70 |
+ |
|
71 |
+# TODO: Default value of seqlevels may not be working. |
|
72 |
+# TODO: What happens if subject is a non-filepath character (e.g., "CATGCG") or |
|
73 |
+# a DNAString? |
0 | 74 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,76 @@ |
1 |
+\name{findLoci} |
|
2 |
+\alias{findLoci} |
|
3 |
+\title{ |
|
4 |
+ Find methylation loci in a genome |
|
5 |
+} |
|
6 |
+\description{ |
|
7 |
+ This is a convenience function to find methylation loci, such as CpGs, in a reference genome. |
|
8 |
+ The result is useful as the value of the \code{loci} argument for \code{\link{read.bismark}()}. |
|
9 |
+} |
|
10 |
+\usage{ |
|
11 |
+findLoci(pattern, |
|
12 |
+ subject, |
|
13 |
+ include = seqlevels(subject), |
|
14 |
+ strand = c("*", "+", "-"), |
|
15 |
+ fixed = "subject", |
|
16 |
+ resize = TRUE) |
|
17 |
+} |
|
18 |
+\arguments{ |
|
19 |
+ \item{pattern}{ |
|
20 |
+ A string specifying the pattern to search for, e.g. \code{"CG"}. |
|
21 |
+ Can contain IUPAC ambiguity codes, e.g., \code{"CH"}. |
|
22 |
+ } |
|
23 |
+ \item{subject}{ |
|
24 |
+ A string containing a file path to the genome sequence, in FASTA or 2bit format, to be searched. |
|
25 |
+ Alternatively, a \linkS4class{BSgenome} or \linkS4class{DNAStringSet} object storing the genome sequence to be searched. |
|
26 |
+ } |
|
27 |
+ \item{include}{ |
|
28 |
+ A character vector indicating the seqlevels of \code{subject} to be used. |
|
29 |
+ } |
|
30 |
+ \item{strand}{ |
|
31 |
+ A character scaler specifying the strand of \code{subject} to be used. |
|
32 |
+ If \code{strand = "*"}, then both the positive (\code{strand = "+"}) and negative (\code{strand = "-"} strands will be searched.) |
|
33 |
+ It is assumed that \code{subject} contains the sequence with respect to the \code{+} strand. |
|
34 |
+ } |
|
35 |
+ \item{fixed}{ |
|
36 |
+ If \code{"subject"} (the default), IUPAC ambiguity codes in the pattern only are interpreted as wildcards, e.g., a \code{pattern} containing \code{CH} matches a \code{subject} containing \code{CA} but not vice versa. |
|
37 |
+ See \code{?Biostrings::`\link[Biostrings]{lowlevel-matching}`} for more information |
|
38 |
+ } |
|
39 |
+ \item{resize}{ |
|
40 |
+ A logical scalar specifying whether the ranges should be shifted to have width 1 and anchored by the start of the locus, e.g., resize a CpG dinucleotide to give the co-ordinates of the cytosine. |
|
41 |
+ } |
|
42 |
+} |
|
43 |
+\details{ |
|
44 |
+ This function provides a convenience wrapper for finding methylation loci in a genome, based on running \code{\link[Biostrings]{vmatchPattern}()}. |
|
45 |
+ Users requiring finer-grained control should directly use the \code{\link[Biostrings]{vmatchPattern}()} function and coerce the result to a \linkS4class{GRanges} object. |
|
46 |
+} |
|
47 |
+\value{ |
|
48 |
+ A \linkS4class{GRanges} object storing the found loci. |
|
49 |
+} |
|
50 |
+\author{ |
|
51 |
+ Peter F. Hickey |
|
52 |
+} |
|
53 |
+\seealso{ |
|
54 |
+ \itemize{ |
|
55 |
+ \item \code{Biostrings::\link[Biostrings]{vmatchPattern}()} |
|
56 |
+ \item \code{?BSgenome::`\link[BSgenome]{BSgenome-utils}`} |
|
57 |
+ } |
|
58 |
+} |
|
59 |
+\examples{ |
|
60 |
+ library(Biostrings) |
|
61 |
+ # Find CpG dinucleotides on the both strands of an artificial sequence |
|
62 |
+ my_seq <- DNAStringSet("ATCAGTCGC") |
|
63 |
+ names(my_seq) <- "test" |
|
64 |
+ findLoci(pattern = "CG", subject = my_seq) |
|
65 |
+ # Find CHG trinucleotides on the both strands of an artificial sequence |
|
66 |
+ findLoci(pattern = "CHG", subject = my_seq) |
|
67 |
+ |
|
68 |
+ # Find CpG dinucleotides on the + strand of chr17 from the human genome (hg38) |
|
69 |
+ if (requireNamespace("BSgenome.Hsapiens.UCSC.hg38")) { |
|
70 |
+ findLoci( |
|
71 |
+ pattern = "CG", |
|
72 |
+ subject = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38, |
|
73 |
+ include = "chr17", |
|
74 |
+ strand = "+") |
|
75 |
+ } |
|
76 |
+} |