git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@68872 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,14 @@ |
1 |
+Package: MotifDb |
|
2 |
+Type: Package |
|
3 |
+Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs |
|
4 |
+Version: 0.99.80 |
|
5 |
+Date: 2012-08-12 |
|
6 |
+Author: Paul Shannon |
|
7 |
+Maintainer: <pshannon@fhcrc.org> |
|
8 |
+Depends: R (>= 2.15.0), methods, IRanges, Biostrings |
|
9 |
+Suggests: RUnit, MotIV, seqLogo |
|
10 |
+Imports: BiocGenerics, rtracklayer |
|
11 |
+Description: More than 2000 annotated position frequency matrices from five public source, for multiple organisms |
|
12 |
+License: Artistic-2.0 |
|
13 |
+LazyLoad: yes |
|
14 |
+biocViews: GenomicSequence, MotifAnnotation |
0 | 15 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,16 @@ |
1 |
+exportClasses (MotifList) |
|
2 |
+exportMethods ( |
|
3 |
+ subset, |
|
4 |
+ export, |
|
5 |
+ show, |
|
6 |
+ query |
|
7 |
+ ) |
|
8 |
+ |
|
9 |
+export( |
|
10 |
+ MotifDb |
|
11 |
+) |
|
12 |
+ |
|
13 |
+importMethodsFrom(IRanges, rbind, eval, sapply, subset, values) |
|
14 |
+importFrom(IRanges, DataFrame) |
|
15 |
+importFrom(rtracklayer, export) |
|
16 |
+ |
0 | 17 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,265 @@ |
1 |
+setGeneric('query', signature='object', function(object, queryString, ignore.case=TRUE) |
|
2 |
+ standardGeneric ('query')) |
|
3 |
+#------------------------------------------------------------------------------- |
|
4 |
+setClass ('MotifList', |
|
5 |
+ contains='SimpleList', |
|
6 |
+ representation (elementMetadata='DataFrame'), |
|
7 |
+ prototype(elementType='matrix') |
|
8 |
+ ) |
|
9 |
+ |
|
10 |
+setValidity('MotifList', function(object) { |
|
11 |
+ msg = NULL |
|
12 |
+ ## what makes for a valid MotifList? |
|
13 |
+ if (length (object) == 0) |
|
14 |
+ return (TRUE) |
|
15 |
+ if (is.null(names(object))) |
|
16 |
+ msg = c(msg, '"names()" must not be NULL') |
|
17 |
+ else if (any(duplicated(names(object)))) |
|
18 |
+ msg = c(msg, 'all "names()" must be unique') |
|
19 |
+ if (!all(sapply(object, is.matrix))) |
|
20 |
+ msg = c(msg, 'all matrices must be of class "matrix"') |
|
21 |
+ if (!all(sapply(object, nrow) == 4)) |
|
22 |
+ msg = c(msg, 'all matrices must have 4 rows') |
|
23 |
+ # all columns of all matrices should be normalized, summing to one. |
|
24 |
+ # in fact, 2/2086 matrices |
|
25 |
+ # Cparvum-UniPROBE-Cgd2_3490.UP00395 |
|
26 |
+ # Pfalciparum-UniPROBE-PF14_0633.UP00394 |
|
27 |
+ # fail to pass this test. rounding to one decimal place allows these matrices, |
|
28 |
+ # also, to pass. round (0.98, digits=1) --> 1.0 |
|
29 |
+ # see the UniPROBE manpage for the full story on these two matrices |
|
30 |
+ ok = sapply(object, function(elt) all (round (colSums(elt), digits=1) == 1)) |
|
31 |
+ if (!all(ok)) |
|
32 |
+ msg = c(msg, 'all elements must have colSums equal to 1') |
|
33 |
+ |
|
34 |
+ if (is.null(msg)) TRUE else msg |
|
35 |
+}) |
|
36 |
+#------------------------------------------------------------------------------- |
|
37 |
+MotifList = function (matrices=list(), tbl.metadata=data.frame ()) |
|
38 |
+{ |
|
39 |
+ if (nrow (tbl.metadata) == 0) |
|
40 |
+ df = DataFrame () |
|
41 |
+ else |
|
42 |
+ df = DataFrame (tbl.metadata, row.names = rownames (tbl.metadata)) |
|
43 |
+ |
|
44 |
+ object = new ('MotifList', listData=matrices, elementMetadata = df) |
|
45 |
+ |
|
46 |
+ if (length (matrices) == 0) |
|
47 |
+ names (object) = character () |
|
48 |
+ else |
|
49 |
+ names (object) = rownames (tbl.metadata) |
|
50 |
+ |
|
51 |
+ object |
|
52 |
+ |
|
53 |
+} # ctor |
|
54 |
+#------------------------------------------------------------------------------- |
|
55 |
+setMethod ('subset', signature = 'MotifList', |
|
56 |
+ |
|
57 |
+ function (x, subset, select, drop = FALSE, ...) { |
|
58 |
+ |
|
59 |
+ if (missing (subset)) |
|
60 |
+ i = TRUE |
|
61 |
+ else { |
|
62 |
+ i = eval(substitute (subset), elementMetadata (x), parent.frame ()) |
|
63 |
+ i = try(as.logical(i), silent=TRUE) |
|
64 |
+ if (inherits(i, 'try-error')) |
|
65 |
+ stop('"subset" must be coercible to logical') |
|
66 |
+ i = i & !is.na(i) |
|
67 |
+ } # else |
|
68 |
+ return (x [i]) |
|
69 |
+ }) |
|
70 |
+ |
|
71 |
+#------------------------------------------------------------------------------- |
|
72 |
+# transpose 4-row matrices to 4-column, so that there are as many rows as |
|
73 |
+# there are nucleotides in the motif sequence. meme requires normalized matrices |
|
74 |
+# exactly as we store them |
|
75 |
+transformMatrixToMemeRepresentation = function (m) |
|
76 |
+{ |
|
77 |
+ return (t (m)) |
|
78 |
+ |
|
79 |
+} # transformMatrixToMemeRepresentation |
|
80 |
+#------------------------------------------------------------------------------- |
|
81 |
+# http://stuff.mit.edu/afs/athena/software/meme_v3.5.4/etc/meme-explanation.html |
|
82 |
+# The motif itself is a position-specific probability matrix giving, for each |
|
83 |
+# position in the pattern, the observed frequency ('probability') of each |
|
84 |
+# possible letter. The probability matrix is printed 'sideways'--columns |
|
85 |
+# correspond to the letters in the alphabet (in the same order as shown in |
|
86 |
+# the simplified motif) and rows corresponding to the positions of the motif, |
|
87 |
+# position one first. The motif is preceded by a line starting with |
|
88 |
+# 'letter-probability matrix:' and containing the length of the alphabet, |
|
89 |
+# width of the motif, number of occurrences of the motif, and the E-value |
|
90 |
+# of the motif. |
|
91 |
+matrixToMemeText = function (matrices) |
|
92 |
+{ |
|
93 |
+ matrix.count = length (matrices) |
|
94 |
+ |
|
95 |
+ # incoming matrices have nucleotide rows, position columns. meme |
|
96 |
+ # format, however, requires position-rows, and nucleotide-columns |
|
97 |
+ # calculate the number of lines of text by counting columns |
|
98 |
+ total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol))) |
|
99 |
+ predicted.line.count = 12 + (3 * length (matrices)) + |
|
100 |
+ total.transposed.matrix.rows |
|
101 |
+ #s = vector ('character', predicted.line.count) |
|
102 |
+ s = character (predicted.line.count) |
|
103 |
+ |
|
104 |
+ s [1] = 'MEME version 4' |
|
105 |
+ s [2] = '' |
|
106 |
+ s [3] = 'ALPHABET= ACGT' |
|
107 |
+ s [4] = '' |
|
108 |
+ s [5] = 'strands: + -' |
|
109 |
+ s [6] = '' |
|
110 |
+ s [7] = 'Background letter frequencies' |
|
111 |
+ s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 ' |
|
112 |
+ s [9] = '' |
|
113 |
+ |
|
114 |
+ index = 10 |
|
115 |
+ for (name in names (matrices)) { |
|
116 |
+ # transpose the frequency matrix version of the incoming matrix, |
|
117 |
+ # hence 'tfMat' |
|
118 |
+ tfMat = transformMatrixToMemeRepresentation (matrices [[name]]) |
|
119 |
+ # meme output may be used by tomtom, which uses matrix names as |
|
120 |
+ # part of image filenames. removed all file-system-unfriendly |
|
121 |
+ # characters here |
|
122 |
+ fixed.name = gsub ('\\/', '_', name) |
|
123 |
+ s [index] = sprintf ('MOTIF %s', fixed.name) |
|
124 |
+ index = index + 1 |
|
125 |
+ new.line = |
|
126 |
+ sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=8.1e-020', |
|
127 |
+ nrow (tfMat), 45, 8.1e-020) |
|
128 |
+ s [index] = new.line |
|
129 |
+ index = index + 1 |
|
130 |
+ for (r in 1:nrow (tfMat)) { |
|
131 |
+ new.row = sprintf (' %12.10f %12.10f %12.10f %12.10f', tfMat [r,1], |
|
132 |
+ tfMat [r,2], tfMat [r,3], tfMat [r,4]) |
|
133 |
+ s [index] = new.row |
|
134 |
+ index = index + 1 |
|
135 |
+ } |
|
136 |
+ s [index] = '' |
|
137 |
+ index = index + 1 |
|
138 |
+ } # for name |
|
139 |
+ |
|
140 |
+ invisible (s) |
|
141 |
+ |
|
142 |
+} # matrixToMemeText |
|
143 |
+#------------------------------------------------------------------------------- |
|
144 |
+# connection is a character string, create a file by that name, open the file. |
|
145 |
+# dispatch to export which dispatches on con='connection' |
|
146 |
+ |
|
147 |
+setMethod ('export', signature=c(object='MotifList', con='character', |
|
148 |
+ format='character'), |
|
149 |
+ |
|
150 |
+ function (object, con, format, ...) { |
|
151 |
+ ## do minimum work unique to this method, then dispatch to avoid |
|
152 |
+ ## code duplication |
|
153 |
+ con = file (con, 'w') |
|
154 |
+ on.exit(close(con)) |
|
155 |
+ export(object, con, format, ...) |
|
156 |
+ }) |
|
157 |
+ |
|
158 |
+#------------------------------------------------------------------------------- |
|
159 |
+# write to connection with specified format |
|
160 |
+# format includes TRANSFAC, meme (also good for tomtom), and tsv |
|
161 |
+ |
|
162 |
+setMethod ('export', signature=c(object='MotifList', con='connection', |
|
163 |
+ format='character'), |
|
164 |
+ |
|
165 |
+ function (object, con, format, ...) { |
|
166 |
+ |
|
167 |
+ fmt = match.arg (tolower (format), c ('meme', 'transfac')) |
|
168 |
+ ## match.arg fails if !fmt %in% c('meme', 'transfac'), so no need |
|
169 |
+ ## for test |
|
170 |
+ ## let the user manage opened cons |
|
171 |
+ if (!isOpen(con)) { |
|
172 |
+ open(con) |
|
173 |
+ on.exit(close(con)) |
|
174 |
+ } |
|
175 |
+ if (fmt == 'meme') |
|
176 |
+ text = matrixToMemeText (object) |
|
177 |
+ cat (text, sep='\n', file=con) |
|
178 |
+ }) |
|
179 |
+ |
|
180 |
+#------------------------------------------------------------------------------- |
|
181 |
+# write to connection, using default format, ??? for matrix list, tsv for |
|
182 |
+# metadata |
|
183 |
+setMethod ('export', signature=c(object='MotifList', con='missing', |
|
184 |
+ format='character'), |
|
185 |
+ |
|
186 |
+ function (object, con, format, ...) { |
|
187 |
+ |
|
188 |
+ fmt = match.arg (tolower (format), c ('meme')) # , 'transfac' |
|
189 |
+ if (fmt == 'meme') { |
|
190 |
+ text = paste (matrixToMemeText (object), collapse='\n') |
|
191 |
+ cat (text) |
|
192 |
+ invisible (text) |
|
193 |
+ } |
|
194 |
+ }) |
|
195 |
+ |
|
196 |
+#------------------------------------------------------------------------------- |
|
197 |
+setMethod('show', 'MotifList', |
|
198 |
+ |
|
199 |
+ function(object) { |
|
200 |
+ |
|
201 |
+ msg = sprintf ('MotifDb object of length %d', length (object)) |
|
202 |
+ cat (msg, '\n', sep='') |
|
203 |
+ if (length (object) == 0) |
|
204 |
+ return () |
|
205 |
+ |
|
206 |
+ cat ('| Created from downloaded public sources: 2012-Jul6', '\n', sep='') |
|
207 |
+ |
|
208 |
+ tbl.dataSource = as.data.frame (table (values (object)$dataSource)) |
|
209 |
+ tbl.org = as.data.frame (table (values (object)$organism)) |
|
210 |
+ tbl.org = head (tbl.org [order (tbl.org$Freq, decreasing=TRUE),]) |
|
211 |
+ totalMatrixCount = length (object) |
|
212 |
+ totalOrganismCount = length (unique (values (object)$organism)) |
|
213 |
+ dataSourceCount = nrow (tbl.dataSource) |
|
214 |
+ |
|
215 |
+ source.singular.or.plural = 'sources' |
|
216 |
+ if (dataSourceCount == 1) |
|
217 |
+ source.singular.or.plural = 'source' |
|
218 |
+ |
|
219 |
+ msg = sprintf ('| %d position frequency matrices from %d %s:', |
|
220 |
+ totalMatrixCount, dataSourceCount, source.singular.or.plural) |
|
221 |
+ cat (msg, '\n', sep='') |
|
222 |
+ for (r in 1:nrow (tbl.dataSource)) { |
|
223 |
+ dataSource = tbl.dataSource$Var1 [r] |
|
224 |
+ matrixCount = tbl.dataSource$Freq [r] |
|
225 |
+ msg = sprintf ('| %18s: %4d', dataSource, matrixCount) |
|
226 |
+ cat (msg, '\n', sep='') |
|
227 |
+ } # for r |
|
228 |
+ msg = sprintf ('| %d organism/s', totalOrganismCount) |
|
229 |
+ cat (msg, '\n', sep='') |
|
230 |
+ for (r in 1:nrow (tbl.org)) { |
|
231 |
+ organism = tbl.org$Var1 [r] |
|
232 |
+ orgCount = tbl.org$Freq [r] |
|
233 |
+ msg = sprintf ('| %18s: %4d', organism, orgCount) |
|
234 |
+ cat (msg, '\n', sep='') |
|
235 |
+ } # for r |
|
236 |
+ otherOrgCount = totalMatrixCount - sum (tbl.org$Freq) |
|
237 |
+ if (otherOrgCount > 0) { |
|
238 |
+ msg = sprintf ('| %18s: %4d', 'other', otherOrgCount) |
|
239 |
+ cat (msg, '\n', sep='') |
|
240 |
+ } |
|
241 |
+ if (!is.null (names (object))) { |
|
242 |
+ all.names = names (object) |
|
243 |
+ count = length (all.names) |
|
244 |
+ if (count <= 10) |
|
245 |
+ cat (paste (all.names, '\n'), sep='') |
|
246 |
+ else { |
|
247 |
+ cat (paste (all.names [1:5], '\n'), sep='') |
|
248 |
+ cat ('...', '\n', sep='') |
|
249 |
+ end = length (all.names) |
|
250 |
+ start = end - 4 |
|
251 |
+ cat (paste (all.names [start:end], '\n'), sep='') |
|
252 |
+ } |
|
253 |
+ } |
|
254 |
+ }) |
|
255 |
+#------------------------------------------------------------------------------- |
|
256 |
+setMethod ('query', 'MotifList', |
|
257 |
+ |
|
258 |
+ function (object, queryString, ignore.case=TRUE) { |
|
259 |
+ indices = unique (as.integer (unlist (sapply (colnames (values (object)), |
|
260 |
+ function (colname) |
|
261 |
+ grep (queryString, values (object)[, colname], |
|
262 |
+ ignore.case=ignore.case))))) |
|
263 |
+ object [indices] |
|
264 |
+ }) |
|
265 |
+#------------------------------------------------------------------------------- |
0 | 266 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,36 @@ |
1 |
+MotifDb <- NULL |
|
2 |
+#------------------------------------------------------------------------------- |
|
3 |
+.MotifDb = function (loadAllSources=TRUE, quiet=TRUE) |
|
4 |
+{ |
|
5 |
+ mdb = MotifList () |
|
6 |
+ |
|
7 |
+ if (loadAllSources) { |
|
8 |
+ data.path = system.file ('extdata', package='MotifDb') # e (system.file (package='MotifDb'), 'data', sep='/') |
|
9 |
+ data.files = dir (data.path, full.names=TRUE) |
|
10 |
+ |
|
11 |
+ if (length (data.files) > 0) |
|
12 |
+ for (data.file in data.files) { |
|
13 |
+ # define these to keep 'check' happy. they are loaded by 'load' |
|
14 |
+ tbl.md = NA; matrices = NA; |
|
15 |
+ variables = load (data.file) |
|
16 |
+ mdb = append (mdb, MotifList (matrices, tbl.md)) |
|
17 |
+ if (!quiet) |
|
18 |
+ message (noquote (sprintf ('added %s (%d) matrices, length now: %d', |
|
19 |
+ basename (data.file), length (matrices), length (mdb)))) |
|
20 |
+ } # for data.file |
|
21 |
+ |
|
22 |
+ if (!quiet) { |
|
23 |
+ print (table (values (mdb)$dataSource)) |
|
24 |
+ } |
|
25 |
+ } # if loadAllSources |
|
26 |
+ |
|
27 |
+ return (mdb) |
|
28 |
+ |
|
29 |
+} # MotifDb |
|
30 |
+#------------------------------------------------------------------------------- |
|
31 |
+.onLoad <- function(libname, pkgname) |
|
32 |
+{ |
|
33 |
+ MotifDb <<- .MotifDb (loadAllSources=TRUE, quiet=TRUE) |
|
34 |
+} |
|
35 |
+#------------------------------------------------------------------------------- |
|
36 |
+ |
0 | 37 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,436 @@ |
1 |
+\documentclass{article} |
|
2 |
+%% %\VignetteIndexEntry{MotifDb Overview} |
|
3 |
+%% %\VignettePackage{MotifDb} |
|
4 |
+\usepackage[noae]{Sweave} |
|
5 |
+\usepackage[left=0.5in,top=0.5in,right=0.5in,bottom=0.75in,nohead,nofoot]{geometry} |
|
6 |
+\usepackage{hyperref} |
|
7 |
+\usepackage[noae]{Sweave} |
|
8 |
+\usepackage{color} |
|
9 |
+\usepackage{graphicx} |
|
10 |
+\usepackage{caption} |
|
11 |
+\usepackage{subcaption} |
|
12 |
+ |
|
13 |
+\definecolor{Blue}{rgb}{0,0,0.5} |
|
14 |
+\definecolor{Green}{rgb}{0,0.5,0} |
|
15 |
+ |
|
16 |
+\RecustomVerbatimEnvironment{Sinput}{Verbatim}{% |
|
17 |
+ xleftmargin=1em,% |
|
18 |
+ fontsize=\small,% |
|
19 |
+ fontshape=sl,% |
|
20 |
+ formatcom=\color{Blue}% |
|
21 |
+ } |
|
22 |
+\RecustomVerbatimEnvironment{Soutput}{Verbatim}{% |
|
23 |
+ xleftmargin=0em,% |
|
24 |
+ fontsize=\scriptsize,% |
|
25 |
+ formatcom=\color{Blue}% |
|
26 |
+ } |
|
27 |
+\RecustomVerbatimEnvironment{Scode}{Verbatim}{xleftmargin=2em} |
|
28 |
+ |
|
29 |
+ |
|
30 |
+ |
|
31 |
+\renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}} |
|
32 |
+\fvset{listparameters={\setlength{\topsep}{6pt}}} |
|
33 |
+% These determine the rules used to place floating objects like figures |
|
34 |
+% They are only guides, but read the manual to see the effect of each. |
|
35 |
+\renewcommand{\topfraction}{.99} |
|
36 |
+\renewcommand{\bottomfraction}{.99} |
|
37 |
+\renewcommand{\textfraction}{0.0} |
|
38 |
+ |
|
39 |
+\title{MotifDb} |
|
40 |
+\author{Paul Shannon} |
|
41 |
+ |
|
42 |
+\begin{document} |
|
43 |
+ |
|
44 |
+\maketitle |
|
45 |
+\begin{abstract} |
|
46 |
+Many kinds of biological activity are regulated by the binding of proteins to their cognate |
|
47 |
+substrates. Of particular interest is the sequence-specific binding of transcription factors to DNA, often in |
|
48 |
+regulatory regions just upstream of the transcription start site of a gene. These binding events play a pivotal |
|
49 |
+role in regulating gene expression. Sequence specificity among closely related binding sites is nearly always incomplete: some variety |
|
50 |
+in the DNA sequence is routinely observed. For this reason, these inexact binding sequence patterns are commonly |
|
51 |
+described as \emph{motifs} represented numerically as frequency matrices, and visualized as sequence logos. Despite their importance |
|
52 |
+in current research, there has been until now no single, annotated, comprehensive collection of publicly available motifs. |
|
53 |
+The current package provides such a collection, offering more than two thousand annotated matrices from multiple organisms, within the |
|
54 |
+context of the Bioconductor project. The matrices can be filtered and selected on the basis of their metadata, used with other |
|
55 |
+Bioconductor packages (MotIV for motif comparison, seqLogo for visualization) or easily exported for use with |
|
56 |
+standard software and websites such as those provided by the MEME Suite\footnote{http://meme.sdsc.edu/meme/doc/meme.html}. |
|
57 |
+\end{abstract} |
|
58 |
+ |
|
59 |
+\tableofcontents |
|
60 |
+ |
|
61 |
+\section{Introduction and Basic Operations} |
|
62 |
+ |
|
63 |
+The first step is to load the necessary packages: |
|
64 |
+ |
|
65 |
+<<libraries>>= |
|
66 |
+library (MotifDb) |
|
67 |
+library (MotIV) |
|
68 |
+library (seqLogo) |
|
69 |
+@ |
|
70 |
+ |
|
71 |
+<<hiddenCode results=hide, echo=FALSE>>= |
|
72 |
+MotIV.toTable = function (match) { |
|
73 |
+ if (length (match@bestMatch) == 0) |
|
74 |
+ return (NA) |
|
75 |
+ |
|
76 |
+ alignments = match@bestMatch[[1]]@aligns |
|
77 |
+ |
|
78 |
+ df = data.frame (stringsAsFactors=FALSE) |
|
79 |
+ for (alignment in alignments) { |
|
80 |
+ x = alignment |
|
81 |
+ name = x@TF@name |
|
82 |
+ eVal = x@evalue |
|
83 |
+ sequence = x@sequence |
|
84 |
+ match = x@match |
|
85 |
+ strand = x@strand |
|
86 |
+ df = rbind (df, data.frame (name=name, eVal=eVal, sequence=sequence, |
|
87 |
+ match=match, strand=strand, stringsAsFactors=FALSE)) |
|
88 |
+ } # for alignment |
|
89 |
+ return (df) |
|
90 |
+ } # MotIV.toTable |
|
91 |
+@ |
|
92 |
+ |
|
93 |
+%% MotifDb provides two kinds of loosely linked data: position frequency matrices, and metadata about each matrix. The matrix |
|
94 |
+%% names, and the rownames of the metadata table, are identical, so it is easy to map back and forth between |
|
95 |
+%% the two. Some measure of convenience is gained by extracting these two kinds of data into separate variables, |
|
96 |
+%% as we shall see. The cost in extra memory should not significant. |
|
97 |
+%% |
|
98 |
+%% <<all.matrices>>= |
|
99 |
+%% matrices.all = as.list (MotifDb) |
|
100 |
+%% metadata <- values (MotifDb) |
|
101 |
+%% @ |
|
102 |
+There are more than two thousand matrices, from five sources: |
|
103 |
+<<sources>>= |
|
104 |
+length (MotifDb) |
|
105 |
+sort (table (values (MotifDb)$dataSource), decreasing=TRUE) |
|
106 |
+@ |
|
107 |
+And 22 organisms (though the majority of the matrices come from just four): |
|
108 |
+<<organisms>>= |
|
109 |
+sort (table (values (MotifDb)$organism), decreasing=TRUE) |
|
110 |
+@ |
|
111 |
+ |
|
112 |
+With these categories of metadata |
|
113 |
+<<metadata>>= |
|
114 |
+colnames (values (MotifDb)) |
|
115 |
+@ |
|
116 |
+\section{Selection} |
|
117 |
+ |
|
118 |
+There are three ways to extract subsets of interest from the MotifDb collection. All three operate upon the MotifDb metadata, |
|
119 |
+matching values in one or more of those fifteen attributes (listed just above), and returning the subset of MotifDb which |
|
120 |
+meet the specified criteria. The three techniques: \emph{query}, \emph{subset} and \emph{grep} |
|
121 |
+ |
|
122 |
+\subsection{query} |
|
123 |
+This is the simplest technique to use, and will suffice in many circumstances. For example, if you want |
|
124 |
+all of the human matrices: |
|
125 |
+<<queryHuman>>= |
|
126 |
+query (MotifDb, 'hsapiens') |
|
127 |
+@ |
|
128 |
+If you want all matrices associated with \textbf{\emph{Sox}} transcription factors, regardless of dataSource or organism: |
|
129 |
+<<querySox>>= |
|
130 |
+query (MotifDb, 'sox') |
|
131 |
+@ |
|
132 |
+For all yeast transcription factors with a homeo domain |
|
133 |
+<<queryYeastHomeo>>= |
|
134 |
+query (query (MotifDb, 'cerevisiae'), 'homeo') |
|
135 |
+@ |
|
136 |
+The last example may inspire more confidence in the precision of the result than is justified, and for a couple |
|
137 |
+of reasons. First, the assignment of protein binding domains to specific categories is, as of 2012, an ad hoc |
|
138 |
+and incomplete process. Second, the query commands matches the supplied character string to \emph{all} metadata |
|
139 |
+columns. In this case, 'homeo' appears both in the \emph{bindingDomain} column and the \emph{tfFamily} column, |
|
140 |
+and the above \emph{query} will return matches from both. |
|
141 |
+Searching and filtering should always be accompanined by close scrutiny of the data, such as these commands |
|
142 |
+illustrate: |
|
143 |
+ |
|
144 |
+<<homeoVariety>>= |
|
145 |
+unique (grep ('homeo', values(MotifDb)$bindingDomain, ignore.case=T, v=T)) |
|
146 |
+unique (grep ('homeo', values(MotifDb)$tfFamily, ignore.case=T, v=T)) |
|
147 |
+@ |
|
148 |
+\subsection{grep} |
|
149 |
+This selection method (and the next, \emph{subset}) require that you address metadata columns explicitly. This is a little more |
|
150 |
+work, but the requisite direct engagement with the metadata is worthwhile. Repeating the 'query' examples from above, |
|
151 |
+you can see how more knowedge of MotifDb metadata is required. |
|
152 |
+<<grepHuman>>= |
|
153 |
+mdb.human <- MotifDb [grep ('Hsapiens', values (MotifDb)$organism)] |
|
154 |
+mdb.sox <- MotifDb [grep ('sox', values (MotifDb)$geneSymbol, ignore.case=TRUE)] |
|
155 |
+yeast.indices = grepl ('scere', values (MotifDb)$organism, ignore.case=TRUE) |
|
156 |
+homeo.indices.domain = grepl ('homeo', values (MotifDb)$bindingDomain, ignore.case=TRUE) |
|
157 |
+homeo.indices.family = grepl ('homeo', values (MotifDb)$tfFamily, ignore.case=TRUE) |
|
158 |
+yeast.homeo.indices = yeast.indices & (homeo.indices.domain | homeo.indices.family) |
|
159 |
+yeast.homeoDb = MotifDb [yeast.homeo.indices] |
|
160 |
+@ |
|
161 |
+ |
|
162 |
+An alternate and somewhat more compact approach: |
|
163 |
+<<withHomeo>>= |
|
164 |
+yeast.homeo.indices <- with(values(MotifDb), |
|
165 |
+ grepl('scere', organism, ignore.case=TRUE) & |
|
166 |
+ (grepl('homeo', bindingDomain, ignore.case=TRUE) | |
|
167 |
+ grepl('homeo', tfFamily, ignore.case=TRUE))) |
|
168 |
+ |
|
169 |
+@ |
|
170 |
+\subsection{subset} |
|
171 |
+MotifDb::subset emulates the R base data.frame \emph{subset} command, which is not unlike an SQL select function. |
|
172 |
+Unfortunately -- and just like the R base subset function -- this MotifDb method cannot be used reliably within a script: |
|
173 |
+\emph{It is only reliable when called interactively.} Here, with mixed success (as you will see) , we use MotifDb::subset to |
|
174 |
+reproduce the \emph{query} and \emph{grep} selections shown above. |
|
175 |
+ |
|
176 |
+<<subsetHuman>>= |
|
177 |
+if (interactive ()) |
|
178 |
+ subset (MotifDb, organism=='Hsapiens') |
|
179 |
+@ |
|
180 |
+One can easily find all the 'sox' genes with the subset command, avoiding possible upper/lower case conflicts by passing |
|
181 |
+the metadata's geneSymbol column through the function 'tolower': |
|
182 |
+<<subsetSox>>= |
|
183 |
+if (interactive ()) |
|
184 |
+ subset (MotifDb, tolower (geneSymbol) == 'sox4') |
|
185 |
+@ |
|
186 |
+Similarly, subset has limited application for a permissive 'homeo' search. |
|
187 |
+But for the retrieval by explicitly specified search terms, subset works very well: |
|
188 |
+<<subsetYeastHomeo>>= |
|
189 |
+if (interactive ()) |
|
190 |
+ subset (MotifDb, organism=='Scerevisiae' & bindingDomain=='Homeo') |
|
191 |
+@ |
|
192 |
+ |
|
193 |
+\subsection{The Egr1 Case Study} |
|
194 |
+ |
|
195 |
+We now do a simple geneSymbol search, followed by an examination of the sub-MotifDb the search returns. We are looking for all matrices |
|
196 |
+associated with the well-known and highly conserved zinc-finger transcription factor, Egr1. |
|
197 |
+There are two of these in MotifDb, both from mouse, and each from a different data source. |
|
198 |
+ |
|
199 |
+<<findEgr1>>= |
|
200 |
+ # subset is convenient: |
|
201 |
+if (interactive ()) |
|
202 |
+ as.list (subset (MotifDb, tolower (geneSymbol) == 'egr1')) |
|
203 |
+ # grep returns indices which allow for more flexibility |
|
204 |
+indices = grep ('egr1', values (MotifDb)$geneSymbol, ignore.case=TRUE) |
|
205 |
+length (indices) |
|
206 |
+@ |
|
207 |
+There are a variety of ways to examine and extract data from this object, a MotifList of length 2. |
|
208 |
+<<MotifDbViews>>= |
|
209 |
+MotifDb [indices] |
|
210 |
+@ |
|
211 |
+ |
|
212 |
+Now view the matrices as a named list: |
|
213 |
+<<as.list>>= |
|
214 |
+as.list (MotifDb [indices]) |
|
215 |
+@ |
|
216 |
+and finally, the metadata associated with these two matrices, transposed, for easy reading and comparison: |
|
217 |
+<<as.metadata>>= |
|
218 |
+noquote (t (as.data.frame (values (MotifDb [indices])))) |
|
219 |
+@ |
|
220 |
+ |
|
221 |
+We used the \emph{grep} function above to find rows in the metadata table whose \emph{geneSymbol} column includes the string 'Egr1'. |
|
222 |
+If you wish to identify matrices (and/or their attendant metadata) based upon a richer combination of criteria, for instance: |
|
223 |
+ |
|
224 |
+\begin{enumerate} |
|
225 |
+ \item organism (\emph{Mmusculus}) |
|
226 |
+ \item gene symbol (\emph{Egr1}) |
|
227 |
+ \item data source (\emph{JASPAR\_CORE}) |
|
228 |
+\end{enumerate} |
|
229 |
+ |
|
230 |
+the grep solution, while serviceable, becomes a little awkward: |
|
231 |
+<<egr1-multi-grep>>= |
|
232 |
+ |
|
233 |
+geneSymbol.rows = grep ('Egr1', values (MotifDb)$geneSymbol, ignore.case=TRUE) |
|
234 |
+organism.rows = grep ('Mmusculus', values (MotifDb)$organism, ignore.case=TRUE) |
|
235 |
+source.rows = grep ('JASPAR', values (MotifDb)$dataSource, ignore.case=TRUE) |
|
236 |
+egr1.mouse.jaspar.rows = intersect (geneSymbol.rows, |
|
237 |
+ intersect (organism.rows, source.rows)) |
|
238 |
+print (egr1.mouse.jaspar.rows) |
|
239 |
+egr1.motif <- MotifDb [egr1.mouse.jaspar.rows] |
|
240 |
+@ |
|
241 |
+<<MotifDbViews>>= |
|
242 |
+@ |
|
243 |
+ |
|
244 |
+Far more concise, and fully reliable as an interactive command (though \emph{not} if used in a |
|
245 |
+script\footnote{See the help page of the base R command subset for detail), is the \emph{subset} command}): |
|
246 |
+<<subsetSearchForEgr1>>= |
|
247 |
+if (interactive ()) { |
|
248 |
+ egr1.motif <- subset (MotifDb, organism=='Mmusculus' & |
|
249 |
+ dataSource=='JASPAR_CORE' & |
|
250 |
+ geneSymbol=='Egr1') |
|
251 |
+ } |
|
252 |
+@ |
|
253 |
+Whichever method you use, this next chunk of code displays the matrix, and then the metadata for mouse JASPAR Egr1, the latter |
|
254 |
+textually-transformed for easy reading within the size constraints of this page. |
|
255 |
+<<examine-egr1>>= |
|
256 |
+egr1.motif |
|
257 |
+as.list (egr1.motif) |
|
258 |
+noquote (t (as.data.frame (values (egr1.motif)))) |
|
259 |
+@ |
|
260 |
+ |
|
261 |
+ |
|
262 |
+Next we use the bioconductor \emph{seqLogo} package to display this motif. |
|
263 |
+ |
|
264 |
+<<egr1, fig=TRUE, include=FALSE>>= |
|
265 |
+seqLogo (as.list (egr1.motif)[[1]]) |
|
266 |
+@ |
|
267 |
+ |
|
268 |
+\begin{figure}[htpb!] |
|
269 |
+ \centering |
|
270 |
+ \includegraphics[width=0.3\textwidth]{MotifDb-egr1} |
|
271 |
+ \caption{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1} |
|
272 |
+\end{figure} |
|
273 |
+ |
|
274 |
+\section{Motif Matching} |
|
275 |
+We will look for the ten position frequency matrices which are the best match to JASPAR's mouse EGR1, using |
|
276 |
+the MotIV package. We actually request the top eleven hits from the entire MotifDb, since the first hit |
|
277 |
+should be the target matrix itself, since that is of necessity found in the full MotifDb. |
|
278 |
+ |
|
279 |
+<<motifmatch>>= |
|
280 |
+egr1.hits <- motifMatch (as.list (egr1.motif) [1], as.list (MotifDb), top=11) |
|
281 |
+# 'MotIV.toTable' -- defined above (and hidden) -- will become part of MotIV in the upcoming release |
|
282 |
+tbl.hits <- MotIV.toTable (egr1.hits) |
|
283 |
+print (tbl.hits) |
|
284 |
+@ |
|
285 |
+ |
|
286 |
+The \emph{sequence} column in this table is the \emph{consensus sequence} -- with heterogeneity left out -- for the |
|
287 |
+matrix it describes. |
|
288 |
+ |
|
289 |
+\vspace{10 mm} |
|
290 |
+ |
|
291 |
+\textbf{\emph{Puzzling: the strand of the match reported above is opposite of what I expected, and opposite of what seqLogo displays. |
|
292 |
+ This is a question for the MotIV developers.}} |
|
293 |
+ |
|
294 |
+\vspace{10 mm} |
|
295 |
+ |
|
296 |
+The six logos appear below, beginning with the logo of the query matrix, \emph{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}, including |
|
297 |
+two other mouse matrices, and two zinc-finger fly matrices. Examining the three mouse matrices and their metadata reveals that |
|
298 |
+all three (geneSymbol differences aside) describe the same protein: |
|
299 |
+<<three.mice.metadata>>= |
|
300 |
+if (interactive ()) |
|
301 |
+ noquote (t (as.data.frame (subset (values (MotifDb), geneId=='13653')))) |
|
302 |
+@ |
|
303 |
+Zinc finger protein domains are classified into many \emph{fold groups}; their respective cognate DNA sequence may classify similarly. |
|
304 |
+That two fly matrices significantly match three reports of the mouse Egr1 motif suggests impressive conservation of this |
|
305 |
+binding pattern, or convergent evolution. |
|
306 |
+ |
|
307 |
+Let us look at the metadata for the first fly match, whose geneId is \textbf{FBgn0003499}: |
|
308 |
+<<fly.Sr.metadata>>= |
|
309 |
+noquote (t (as.data.frame (values (MotifDb)[grep ('FBgn0003499', values (MotifDb)$geneId),]))) |
|
310 |
+@ Note that the SANGER motif, based on 18 sequences, had a high fidelity match to mouse Egr1 (see above, 10e-12), but |
|
311 |
+that the SOLEXA motif, based upon 2316 sequences, did not (in work not shown, it appears 22nd in the an expanded |
|
312 |
+motifMatch hit list, with a eval of 10e-5). It is possible that the SOLEXA motif is more accurate, and that a close |
|
313 |
+examination of this case, including sequence logos, position frequency matrices, and the search parameters of |
|
314 |
+motifMatch, will be instructive. Repeating the search with \emph{tomtom} might also be illuminating -- either as |
|
315 |
+confirmation of MotIV and the default parameterization we used, or as a correction to it. Here we see the facilities |
|
316 |
+for exploratory data analysis MotifDb provides, and the opportunities for data analysis which result. |
|
317 |
+ |
|
318 |
+ |
|
319 |
+<<logo1, fig=TRUE, include=FALSE, echo=FALSE>>= |
|
320 |
+ seqLogo (MotifDb [[tbl.hits$name[1]]]) |
|
321 |
+@ |
|
322 |
+ |
|
323 |
+<<logo2, fig=TRUE, include=FALSE, echo=FALSE>>= |
|
324 |
+ seqLogo (MotifDb [[tbl.hits$name[2]]]) |
|
325 |
+@ |
|
326 |
+ |
|
327 |
+<<logo3, fig=TRUE, include=FALSE, echo=FALSE>>= |
|
328 |
+ seqLogo (MotifDb [[tbl.hits$name[3]]]) |
|
329 |
+@ |
|
330 |
+ |
|
331 |
+<<logo4, fig=TRUE, include=FALSE, echo=FALSE>>= |
|
332 |
+ seqLogo (MotifDb [[tbl.hits$name[4]]]) |
|
333 |
+@ |
|
334 |
+ |
|
335 |
+<<logo5, fig=TRUE, include=FALSE, echo=FALSE>>= |
|
336 |
+ seqLogo (MotifDb [[tbl.hits$name[5]]]) |
|
337 |
+@ |
|
338 |
+ |
|
339 |
+<<logo6, fig=TRUE, include=FALSE, echo=FALSE>>= |
|
340 |
+ seqLogo (MotifDb [[tbl.hits$name[6]]]) |
|
341 |
+@ |
|
342 |
+ |
|
343 |
+\begin{figure}[htpb!] |
|
344 |
+ \centering |
|
345 |
+ \begin{subfigure}[b]{0.38\textwidth} |
|
346 |
+ \includegraphics[width=\textwidth]{MotifDb-logo1} |
|
347 |
+ \caption{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1} |
|
348 |
+ \label{fig:Egr1-MA0162.1} |
|
349 |
+ \end{subfigure}% |
|
350 |
+ \begin{subfigure}[b]{0.38\textwidth} |
|
351 |
+ \includegraphics[width=\textwidth]{MotifDb-logo2} |
|
352 |
+ \caption{Dme-FFS-sr\_SANGER\_5\_FBgn0003499\\(abbreviated)} |
|
353 |
+ \label{fig:Egr1-logo2} |
|
354 |
+ \end{subfigure}% |
|
355 |
+\end{figure} |
|
356 |
+ |
|
357 |
+\begin{figure}[htpb!] |
|
358 |
+ \centering |
|
359 |
+ \begin{subfigure}[b]{0.38\textwidth} |
|
360 |
+ \includegraphics[width=\textwidth]{MotifDb-logo3} |
|
361 |
+ \caption{Mmusculus-UniPROBE-Zif268.UP00400} |
|
362 |
+ \label{fig:Egr1-logo3} |
|
363 |
+ \end{subfigure}% |
|
364 |
+ \begin{subfigure}[b]{0.38\textwidth} |
|
365 |
+ \includegraphics[width=\textwidth]{MotifDb-logo4} |
|
366 |
+ \caption{Dme-FFS-klu\_SANGER\_10\_FBgn0013469} |
|
367 |
+ \label{fig:Egr1-logo4} |
|
368 |
+ \end{subfigure}% |
|
369 |
+\end{figure} |
|
370 |
+ |
|
371 |
+ |
|
372 |
+\begin{figure}[htpb!] |
|
373 |
+ \centering |
|
374 |
+ \begin{subfigure}[b]{0.38\textwidth} |
|
375 |
+ \includegraphics[width=\textwidth]{MotifDb-logo5} |
|
376 |
+ \caption{Mmusculus-UniPROBE-Egr1.UP00007} |
|
377 |
+ \label{fig:Egr1-logo5} |
|
378 |
+ \end{subfigure}% |
|
379 |
+ \begin{subfigure}[b]{0.38\textwidth} |
|
380 |
+ \centering |
|
381 |
+ \includegraphics[width=\textwidth]{MotifDb-logo6} |
|
382 |
+ \caption{Dme-FFS-klu\_SOLEXA\_5\_FBgn0013469} |
|
383 |
+ \label{fig:Egr1-logo6} |
|
384 |
+ \end{subfigure}% |
|
385 |
+\end{figure} |
|
386 |
+ |
|
387 |
+\newpage |
|
388 |
+ |
|
389 |
+\section{Exporting to the MEME Suite} |
|
390 |
+Some users of this package may wish to export the data -- both matrices and metadata -- so that they may be used in |
|
391 |
+other programs. The MEME suite, among others, is broadly useful, continuously improved and well-regarded throughout the |
|
392 |
+bioinformatics community. The code below exports all of the MotifDb matrices as a text file in the MEME format, and all |
|
393 |
+of the metadata as a tab-delimited text file. |
|
394 |
+ |
|
395 |
+<<export>>= |
|
396 |
+matrix.output.file = tempfile () # substitute your preferred filename here |
|
397 |
+meme.text = export (MotifDb, matrix.output.file, 'meme') |
|
398 |
+ |
|
399 |
+metadata.output.file = tempfile () # substitute your preferred filename here |
|
400 |
+write.table (as.data.frame (values (MotifDb)), file=metadata.output.file, sep='\t', |
|
401 |
+ row.names=TRUE, col.names=TRUE, quote=FALSE) |
|
402 |
+@ |
|
403 |
+ |
|
404 |
+\section{Future Work} |
|
405 |
+This first version of MotifDb collects into one R package all of the best-known public domain protein-DNA binding matrices, with |
|
406 |
+as much metadata as could be gleaned from the five providers. However, not all of these matrices are equally supported by data |
|
407 |
+and by no means are all are accompanied by complete metadata. |
|
408 |
+ |
|
409 |
+With the passage of time our knowledge of protein-DNA binding sequence motifs will improve. They will be derived from more |
|
410 |
+binding events, with more precision and specificity, and accompanied by more (and better understood) contextual detail. Cooperative binding, |
|
411 |
+mentioned only in a few times in the current (July 2012) version of this package, will be well-represented. Metadata will improve. |
|
412 |
+Better assignment of binding domains to consensus categories will be especially useful when it is available. Three-dimensional |
|
413 |
+models of specific proteins binding to specific DNA may someday become commonplace. |
|
414 |
+ |
|
415 |
+ |
|
416 |
+\section{References} |
|
417 |
+ |
|
418 |
+\begin{itemize} |
|
419 |
+ |
|
420 |
+\item Portales-Casamar E, Thongjuea S, Kwon AT, Arenillas D, Zhao X, Valen E, Yusuf D, Lenhard B, Wasserman WW, Sandelin A. JASPAR 2010: the greatly expanded open-access database of transcription factor binding profiles. Nucleic Acids Res. 2010 Jan;38(Database issue):D105-10. Epub 2009 Nov 11. |
|
421 |
+ |
|
422 |
+\item Robasky K, Bulyk ML. UniPROBE, update 2011: expanded content and search tools in the online database of protein-binding microarray data on protein-DNA interactions. Nucleic Acids Res. 2011 Jan;39(Database issue):D124-8. Epub 2010 Oct 30. |
|
423 |
+ |
|
424 |
+\item Spivak AT, Stormo GD. ScerTF: a comprehensive database of benchmarked position weight matrices for Saccharomyces species. Nucleic Acids Res. 2012 Jan;40(Database issue):D162-8. Epub 2011 Dec 2. |
|
425 |
+ |
|
426 |
+\item Xie Z, Hu S, Blackshaw S, Zhu H, Qian J. hPDI: a database of experimental human protein-DNA interactions. Bioinformatics. 2010 Jan 15;26(2):287-9. Epub 2009 Nov 9. |
|
427 |
+ |
|
428 |
+\item Zhu LJ, et al. 2011. FlyFactorSurvey: a database of Drosophila transcription factor binding specificities determined using the bacterial one-hybrid system. Nucleic Acids Res. 2011 Jan;39(Database issue):D111-7. Epub 2010 Nov 19. |
|
429 |
+ |
|
430 |
+ |
|
431 |
+\end{itemize} |
|
432 |
+ |
|
433 |
+ |
|
434 |
+\end{document} |
|
435 |
+ |
|
436 |
+ |
10 | 447 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,85 @@ |
1 |
+default: build install |
|
2 |
+ |
|
3 |
+help: |
|
4 |
+ egrep "^#" makefile | sed "s/^#//" |
|
5 |
+ |
|
6 |
+clean: |
|
7 |
+ - rm ../../MotifDb_*tar.gz |
|
8 |
+ |
|
9 |
+ |
|
10 |
+# --- quickbuild: no vignette |
|
11 |
+# |
|
12 |
+quickbuild: |
|
13 |
+ (cd ../..; R CMD build --no-vignettes MotifDb) |
|
14 |
+ |
|
15 |
+ |
|
16 |
+# --- build |
|
17 |
+# |
|
18 |
+build: |
|
19 |
+ (cd ../..; R CMD build --no-vignettes MotifDb) |
|
20 |
+ |
|
21 |
+# --- install |
|
22 |
+# |
|
23 |
+install: |
|
24 |
+ (cd ../..; R CMD install MotifDb) |
|
25 |
+ |
|
26 |
+# --- check |
|
27 |
+# |
|
28 |
+check: clean build install |
|
29 |
+ (cd ../..; R CMD check --no-manual --no-vignettes --no-codoc --no-examples --no-manual MotifDb) |
|
30 |
+ |
|
31 |
+# --- checkfull |
|
32 |
+# |
|
33 |
+checkfull: |
|
34 |
+ (cd ../..; R CMD build MotifDb) |
|
35 |
+ (cd ../..; R CMD check MotifDb) |
|
36 |
+ |
|
37 |
+ |
|
38 |
+# --- vanillaTest |
|
39 |
+# run all the unit tests, in a clean context |
|
40 |
+# |
|
41 |
+ |
|
42 |
+vanillaTest: build install |
|
43 |
+ - rm vanillaTest.out |
|
44 |
+ R --vanilla < vanillaTest.R > vanillaTest.out 2>&1 |
|
45 |
+ |
|
46 |
+# --- vt |
|
47 |
+# run all the unit tests, in a clean context |
|
48 |
+# |
|
49 |
+ |
|
50 |
+vt: vanillaTest |
|
51 |
+ |
|
52 |
+ |
|
53 |
+# --- checkvig |
|
54 |
+# check just the vignette |
|
55 |
+# |
|
56 |
+ |
|
57 |
+checkvig: |
|
58 |
+ (cd ../..; R CMD check --no-manual --no-codoc --no-tests --no-examples MotifDb) |
|
59 |
+ |
|
60 |
+ |
|
61 |
+# --- tangle |
|
62 |
+# extract the R code from the vignette file |
|
63 |
+# |
|
64 |
+ |
|
65 |
+tangle: |
|
66 |
+ (cd ../vignettes; R CMD Stangle MotifDb.Rnw) |
|
67 |
+ |
|
68 |
+ |
|
69 |
+ |
|
70 |
+# --- sweave |
|
71 |
+# creates MotifDb.tex, runs all embedded examples |
|
72 |
+# run this before the pdf target |
|
73 |
+# |
|
74 |
+sweave: |
|
75 |
+ (cd ../vignettes; R CMD Sweave MotifDb.Rnw --pdf) |
|
76 |
+ |
|
77 |
|
|
78 |
+# make and open MotifDb.pdf, the vignette |
|
79 |
+# |
|
80 |
+ |
|
81 |
+pdf: sweave |
|
82 |
+ (cd ../vignettes; open MotifDb.pdf) |
|
83 |
+ |
|
84 |
+ |
|
85 |
+ |
0 | 86 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,304 @@ |
1 |
+Index: MotifList-class.R |
|
2 |
+=================================================================== |
|
3 |
+--- MotifList-class.R (revision 67138) |
|
4 |
+@@ -4,10 +4,30 @@ |
|
5 |
+ prototype(elementType="matrix") |
|
6 |
+ ) |
|
7 |
+ |
|
8 |
+-setGeneric ('add', signature='obj', function (obj, matrices, tbl.metadata) standardGeneric ('add')) |
|
9 |
++setValidity("MotifList", function(object) { |
|
10 |
++ msg <- NULL |
|
11 |
++ |
|
12 |
++ ## what makes for a valid MotifList? |
|
13 |
++ if (is.null(names(object))) |
|
14 |
++ msg <- c(msg, "'names()' must not be NULL") |
|
15 |
++ else if (any(duplicated(names(object)))) |
|
16 |
++ msg <- c(msg, "all 'names()' must be unique") |
|
17 |
++ ## all(sapply(object, class) == "matrix") works for 0-length object |
|
18 |
++ if (!all(sapply(object, class) == "matrix")) |
|
19 |
++ msg <- c(msg, "all elements must be of class 'matrix'") |
|
20 |
++ if (!all(sapply(object, nrow) == 4)) |
|
21 |
++ msg <- c(msg, "all elements must have 4 rows") |
|
22 |
++ ok <- sapply(object, function(elt) all(colSums(elt) == 1)) |
|
23 |
++ if (!all(ok)) |
|
24 |
++ msg <- c(msg, "all elements must have colSums equal to 1") |
|
25 |
++ |
|
26 |
++ if (is.null(msg)) TRUE else msg |
|
27 |
++}) |
|
28 |
++ |
|
29 |
++setGeneric ('add', function (obj, matrices, tbl.metadata) standardGeneric ('add'), |
|
30 |
++ signature='obj') |
|
31 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
32 |
+ MotifList = function (matrices=list(), |
|
33 |
+- names=character(), # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a |
|
34 |
+ providerNames=character(), # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
35 |
+ providerIds=character(), # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
36 |
+ geneSymbols=character(), # ABF2, Rhox11 |
|
37 |
+@@ -22,27 +42,9 @@ |
|
38 |
+ dataSources=character(), # ScerTF, UniPROBE |
|
39 |
+ experimentTypes=character(), # NA, protein-binding microarray |
|
40 |
+ pubmedIDs=character(), # 19111667, 1858359 |
|
41 |
+- tfFamilies=character()) # NA, NA |
|
42 |
++ tfFamilies=character()) |
|
43 |
+ { |
|
44 |
+- names (matrices) = names |
|
45 |
+- count = length (matrices) |
|
46 |
+- stopifnot (length (names) == count) |
|
47 |
+- stopifnot (length (providerNames) == count) |
|
48 |
+- stopifnot (length (providerIds) == count) |
|
49 |
+- stopifnot (length (geneSymbols) == count) |
|
50 |
+- stopifnot (length (geneIds) == count) |
|
51 |
+- stopifnot (length (geneIdTypes) == count) |
|
52 |
+- stopifnot (length (proteinIds) == count) |
|
53 |
+- stopifnot (length (proteinIdTypes) == count) |
|
54 |
+- stopifnot (length (sequenceCounts) == count) |
|
55 |
+- stopifnot (length (organisms) == count) |
|
56 |
+- stopifnot (length (bindingDomains) == count) |
|
57 |
+- stopifnot (length (bindingSequences) == count) |
|
58 |
+- stopifnot (length (dataSources) == count) |
|
59 |
+- stopifnot (length (experimentTypes) == count) |
|
60 |
+- stopifnot (length (pubmedIDs) == count) |
|
61 |
+- stopifnot (length (tfFamilies) == count) |
|
62 |
+- |
|
63 |
++ ## these are redundant with DataFrame validity method |
|
64 |
+ elementMetadata = DataFrame (#name=names, |
|
65 |
+ providerName=providerNames, |
|
66 |
+ providerId=providerIds, |
|
67 |
+@@ -63,45 +65,33 @@ |
|
68 |
+ |
|
69 |
+ } # ctor |
|
70 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
71 |
++## quiet vs. verbose |
|
72 |
+ MotifDb = function (loadAllSources=TRUE, quiet=TRUE) |
|
73 |
+ { |
|
74 |
+ mdb <- MotifList () |
|
75 |
+ if (loadAllSources) { |
|
76 |
+- data.path = paste (system.file (package='MotifDb'), 'data', sep='/') |
|
77 |
++ data.path = system.file(package="MotifDb", "data") |
|
78 |
+ data.files = dir (data.path, full.names=TRUE) |
|
79 |
+- sources.to.exclude = c ('ScerTF.RData') # pending minor revisions |
|
80 |
+- removers = as.integer (sapply (sources.to.exclude, function (source) grep (source, data.files))) |
|
81 |
+- if (length (removers) > 0) |
|
82 |
+- data.files = data.files [-removers] |
|
83 |
+- |
|
84 |
+- count = 0 |
|
85 |
+- if (length (data.files) > 0) |
|
86 |
++ sources.to.exclude = "ScerTF.RData" # pending minor revisions |
|
87 |
++ data.files = data.files[!basename(data.files) %in% sources.to.exclude] |
|
88 |
+ for (data.file in data.files) { |
|
89 |
++ ## see globalVariables(tbl.md, matrices) |
|
90 |
+ tbl.md = NA; matrices = NA; # define these to keep 'check' happy. they are loaded by 'load' |
|
91 |
+- variables = load (data.file) |
|
92 |
+- mdb = add (mdb, matrices, tbl.md) |
|
93 |
+- filename.tokens = strsplit (data.file, '\\/') [[1]] |
|
94 |
+- shortFilename = filename.tokens [length (filename.tokens)] |
|
95 |
+- if (!quiet) |
|
96 |
+- print (noquote (sprintf ('added %s (%d) matrices, length now: %d', shortFilename, length (matrices), length (mdb)))) |
|
97 |
++ load (data.file) |
|
98 |
++ mdb <- append(mdb, do.call("MotifList", c(matrices, tbl.md))) |
|
99 |
++ if (!quiet) |
|
100 |
++ message(sprintf ('added %s (%d) matrices, length now: %d', |
|
101 |
++ basename(data.file), length (matrices), |
|
102 |
++ length (mdb))) |
|
103 |
+ } # for data.file |
|
104 |
+ |
|
105 |
+- if (!quiet) { |
|
106 |
++ if (!quiet) |
|
107 |
+ print (table (mdb$dataSource)) |
|
108 |
+- } |
|
109 |
+ } # if loadAllSources |
|
110 |
+ |
|
111 |
+- return (mdb) |
|
112 |
+- |
|
113 |
++ mdb |
|
114 |
+ } # MotifDb |
|
115 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
116 |
+-setMethod ('length', signature = 'MotifList', |
|
117 |
+- |
|
118 |
+- function (x) { |
|
119 |
+- return (length (x@listData)) |
|
120 |
+- }) |
|
121 |
+- |
|
122 |
+-#------------------------------------------------------------------------------------------------------------------------ |
|
123 |
+ setMethod ('subset', signature = 'MotifList', |
|
124 |
+ |
|
125 |
+ function (x, subset, select, drop = FALSE, ...) { |
|
126 |
+@@ -115,42 +105,26 @@ |
|
127 |
+ stop("'subset' must be coercible to logical") |
|
128 |
+ i <- i & !is.na(i) |
|
129 |
+ } # else |
|
130 |
+- return (x [i]) |
|
131 |
++ x [i] |
|
132 |
+ }) |
|
133 |
+ |
|
134 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
135 |
++## 'add' is too general a name; implement as do.call + append, but |
|
136 |
++## then not much added value so omit method? |
|
137 |
+ setMethod ('add', signature = 'MotifList', |
|
138 |
+ |
|
139 |
+ function (obj, matrices, tbl.metadata) { |
|
140 |
+ |
|
141 |
+- full.names = names (matrices) # full in this sense: organism-source-providerName[-possibleUniquifier] |
|
142 |
+- size = length (matrices) |
|
143 |
+- empty.chars = rep (NA_character_, size) |
|
144 |
+- empty.ints = rep (NA_integer_, size) |
|
145 |
+- new.list = MotifList (matrices, |
|
146 |
+- names=full.names, |
|
147 |
+- providerNames=tbl.metadata$providerName, # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
148 |
+- providerIds=tbl.metadata$providerId, # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
149 |
+- geneSymbols=tbl.metadata$geneSymbol, # ABF2, Rhox11n |
|
150 |
+- geneIds=tbl.metadata$geneId, |
|
151 |
+- geneIdTypes=tbl.metadata$geneIdType, |
|
152 |
+- proteinIds=tbl.metadata$proteinId, |
|
153 |
+- proteinIdTypes=tbl.metadata$proteinIdType, |
|
154 |
+- sequenceCounts=tbl.metadata$sequenceCount, # often NA |
|
155 |
+- organisms=tbl.metadata$organism, # Scerevisiae, Mmusculus |
|
156 |
+- bindingDomains=tbl.metadata$bindingDomain, # NA, Homeo |
|
157 |
+- bindingSequences=tbl.metadata$bindingSequence, |
|
158 |
+- dataSources=tbl.metadata$dataSource, # ScerTF, UniPROBE |
|
159 |
+- experimentTypes=tbl.metadata$experimentType, # NA, protein-binding microarray, SELEX, |
|
160 |
+- pubmedIDs=tbl.metadata$pubmedID, # 19111667, 1858359 |
|
161 |
+- tfFamilies=tbl.metadata$tfFamily) # NA |
|
162 |
+- obj@listData = c (obj@listData, new.list@listData) |
|
163 |
+- elementMetadata (obj) = rbind (elementMetadata (obj), elementMetadata (new.list)) |
|
164 |
+- return (obj) |
|
165 |
++ if (!is.list(matrices)) # allow solo matrix |
|
166 |
++ matrices <- list(matrices) |
|
167 |
++ ## 1:1 mapping between tbl.metadata (data.frame?) column names |
|
168 |
++ ## and MotifDb elementMetadata |
|
169 |
++ new.list <- do.call("MotifList", c(matrices, tbl.metadata)) |
|
170 |
++ append(obj, new.list) |
|
171 |
+ }) |
|
172 |
+ |
|
173 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
174 |
+-setMethod("$", signature="MotifList", |
|
175 |
++setMethod("$", signature="MotifList", # is this a good idea? |
|
176 |
+ |
|
177 |
+ function(x, name) { |
|
178 |
+ elementMetadata (x)[[name]] |
|
179 |
+@@ -201,45 +175,38 @@ |
|
180 |
+ { |
|
181 |
+ matrix.count = length (matrices) |
|
182 |
+ |
|
183 |
+- # incoming matrices have nucleotide rows, position columns. meme format, however, requires position-rows, and nucleotide-columns |
|
184 |
+- # calculate the number of lines of text by counting columns |
|
185 |
+- total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol))) |
|
186 |
+- predicted.line.count = 12 + (3 * length (matrices)) + total.transposed.matrix.rows |
|
187 |
+- s = vector ('character', predicted.line.count) |
|
188 |
+- |
|
189 |
++ ## incoming matrices have nucleotide rows, position columns. meme |
|
190 |
++ ## format, however, requires position-rows, and nucleotide-columns |
|
191 |
++ ## calculate the number of lines of text by counting columns |
|
192 |
+ #printf ("length of pre-allocated vector: %d", length (s)) |
|
193 |
+- s [1] = 'MEME version 4' |
|
194 |
+- s [2] = '' |
|
195 |
+- s [3] = 'ALPHABET= ACGT' |
|
196 |
+- s [4] = '' |
|
197 |
+- s [5] = 'strands: + -' |
|
198 |
+- s [6] = '' |
|
199 |
+- s [7] = 'Background letter frequencies' |
|
200 |
+- s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 ' |
|
201 |
+- s [9] = '' |
|
202 |
+ |
|
203 |
+- index = 10 |
|
204 |
+- for (name in names (matrices)) { |
|
205 |
+- # transpose the frequency matrix version of the incoming matrix, hence 'tfMat' |
|
206 |
+- tfMat = transformMatrixToMemeRepresentation (matrices [[name]]) |
|
207 |
+- # meme output may be used by tomtom, which uses matrix names as part of image filenames. |
|
208 |
+- # removed all file-system-unfriendly characters here |
|
209 |
++ header = c( |
|
210 |
++ 'MEME version 4', |
|
211 |
++ '', |
|
212 |
++ 'ALPHABET= ACGT', |
|
213 |
++ '', |
|
214 |
++ 'strands: + -', |
|
215 |
++ '', |
|
216 |
++ 'Background letter frequencies', |
|
217 |
++ 'A 0.250 C 0.250 G 0.250 T 0.250 ', |
|
218 |
++ '') |
|
219 |
++ |
|
220 |
++ tfStrings <- Map(function(name, tfMat) { |
|
221 |
++ tfMat = transformMatrixToMemeRepresentation (tfMat) |
|
222 |
++ ## meme output may be used by tomtom, which uses matrix names as |
|
223 |
++ ## part of image filenames. removed all file-system-unfriendly |
|
224 |
++ ## characters here |
|
225 |
+ fixed.name = gsub ('\\/', '_', name) |
|
226 |
+- s [index] = sprintf ('MOTIF %s', fixed.name) |
|
227 |
+- index = index + 1 |
|
228 |
+- s [index] = sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=8.1e-020', nrow (tfMat), 45, 8.1e-020) |
|
229 |
+- index = index + 1 |
|
230 |
+- for (r in 1:nrow (tfMat)) { |
|
231 |
+- new.row = sprintf (' %12.10f %12.10f %12.10f %12.10f', tfMat [r,1], tfMat [r,2], tfMat [r,3], tfMat [r,4]) |
|
232 |
+- s [index] = new.row |
|
233 |
+- index = index + 1 |
|
234 |
+- } |
|
235 |
+- s [index] = '' |
|
236 |
+- index = index + 1 |
|
237 |
+- } # for name |
|
238 |
++ id <- sprintf ('MOTIF %s', fixed.name) |
|
239 |
++ desc <- |
|
240 |
++ sprintf ('letter-probability matrix: alength= 4 w= %d nsites= 45 E=8.1e-020', |
|
241 |
++ nrow (tfMat)) |
|
242 |
++ mat <- sprintf (' %12.10f %12.10f %12.10f %12.10f', |
|
243 |
++ tfMat[,1], tfMat[,2], tfMat[,3], tfMat[,4]) |
|
244 |
++ c(id, desc, mat, "") |
|
245 |
++ }, names(matricies), matricies) |
|
246 |
+ |
|
247 |
+- invisible (s) |
|
248 |
+- |
|
249 |
++ invisible(c(header, unlist(tfStrings))) |
|
250 |
+ } # matrixToMemeText |
|
251 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
252 |
+ # write to connection with specified format |
|
253 |
+@@ -250,14 +217,11 @@ |
|
254 |
+ setMethod ('export', signature=c(object='MotifList', con='character', format='character'), |
|
255 |
+ |
|
256 |
+ function (object, con, format, ...) { |
|
257 |
+- |
|
258 |
+- fmt = match.arg (tolower (format), c ('meme', 'transfac')) |
|
259 |
+- if (fmt %in% c ('meme', 'transfac')) { |
|
260 |
+- text = matrixToMemeText (object) |
|
261 |
+- } |
|
262 |
++ ## do minimum work unique to this method, then dispatch to avoid |
|
263 |
++ ## code duplication |
|
264 |
+ con = file (con, 'w') |
|
265 |
+- cat (text, sep='\n', file=con) |
|
266 |
+- close (con) |
|
267 |
++ on.exit(close(con)) |
|
268 |
++ export(object, con, format, ...) |
|
269 |
+ }) |
|
270 |
+ |
|
271 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
272 |
+@@ -271,23 +235,16 @@ |
|
273 |
+ function (object, con, format, ...) { |
|
274 |
+ |
|
275 |
+ fmt = match.arg (tolower (format), c ('meme', 'transfac')) |
|
276 |
+- if (fmt %in% c ('meme', 'transfac')) { |
|
277 |
+- text = matrixToMemeText (object) |
|
278 |
+- } |
|
279 |
++ ## match.arg fails if !fmt %in% c('meme', 'transfac'), so no need |
|
280 |
++ ## for test |
|
281 |
++ ## let the user manage opened cons |
|
282 |
++ if (!isOpen(con)) { |
|
283 |
++ open(con) |
|
284 |
++ on.exit(close(con)) |
|
285 |
++ } |
|
286 |
+ cat (text, sep='\n', file=con) |
|
287 |
+- close (con) |
|
288 |
+ }) |
|
289 |
+ |
|
290 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
291 |
+ # write to connection, using default format, ??? for matrix list, tsv for metadata |
|
292 |
+-setMethod ('export', signature=c(object='MotifList', con='missing', format='character'), |
|
293 |
+- |
|
294 |
+- function (object, con, format, ...) { |
|
295 |
+- |
|
296 |
+- fmt = match.arg (tolower (format), c ('meme')) # , 'transfac')) |
|
297 |
+- if (fmt == 'meme') |
|
298 |
+- text = matrixToMemeText (object) |
|
299 |
+- return (text) |
|
300 |
+- }) |
|
301 |
+- |
|
302 |
+-#------------------------------------------------------------------------------------------------------------------------ |
|
303 |
++## already handled by rtracklayer? |
0 | 304 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,111 @@ |
1 |
+library (MotifDb) |
|
2 |
+library (MotIV) |
|
3 |
+library (seqLogo) |
|
4 |
+library (RUnit) |
|
5 |
+#----------------------------------------------------------------------------------------- |
|
6 |
+demo.MotIV = function () |
|
7 |
+{ |
|
8 |
+ print ('--- demo.MotIV') |
|
9 |
+ mdb = MotifDb (quiet=TRUE) |
|
10 |
+ db.tmp = mdb@listData |
|
11 |
+ |
|
12 |
+ # match motif 1 against the entire MotifDb collection |
|
13 |
+ motif.hits = motifMatch (db.tmp [1], database=db.tmp) |
|
14 |
+ |
|
15 |
+ # the long way to extract the matrix name. see MotIV.toTable below for more convenient way |
|
16 |
+ checkEquals (motif.hits@bestMatch[[1]]@aligns[[1]]@TF@name, names (db.tmp)[1]) |
|
17 |
+ |
|
18 |
+ # jaspar, uniprobe and ScerTF contribute a total of 1035 matrices |
|
19 |
+ checkTrue (length (mdb) >= 1035) |
|
20 |
+ |
|
21 |
+ target = db.tmp [1035] |
|
22 |
+ printf ('target matrix: %s', names (target)) |
|
23 |
+ motif.hits = motifMatch (db.tmp [1035], database=db.tmp) |
|
24 |
+ tbl.hits = MotIV.toTable (motif.hits) |
|
25 |
+ checkEquals (names (db.tmp)[1035], tbl.hits [1, 'name']) # first hit should be the target itself |
|
26 |
+ |
|
27 |
+ return (tbl.hits) |
|
28 |
+ |
|
29 |
+} # demo.MotIV |
|
30 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
31 |
+display.matrices= function () |
|
32 |
+{ |
|
33 |
+ mdb = MotifDb (quiet=TRUE) |
|
34 |
+ printf ('--- %s', 'ScerTF-Scerevisiae-PDR8-badis') |
|
35 |
+ print (subset (mdb, name=='ScerTF-Scerevisiae-PDR8-badis')[[1]]) |
|
36 |
+ printf ('--- %s', 'JASPAR_CORE-Scerevisiae-YDR520C-MA0422.1') |
|
37 |
+ print (subset (mdb, name=='JASPAR_CORE-Scerevisiae-YDR520C-MA0422.1')[[1]]) |
|
38 |
+ printf ('--- %s', 'ScerTF-Scerevisiae-PDR8-badis') |
|
39 |
+ print (subset (mdb, name=='ScerTF-Scerevisiae-PDR8-badis')[[1]]) |
|
40 |
+ printf ('--- %s', 'ScerTF-Scerevisiae-YLR278C-badis') |
|
41 |
+ print (subset (mdb, name=='ScerTF-Scerevisiae-YLR278C-badis')[[1]]) |
|
42 |
+ printf ('--- %s', 'JASPAR_CORE-Scerevisiae-HAP1-MA0312.1') |
|
43 |
+ print (subset (mdb, name=='JASPAR_CORE-Scerevisiae-HAP1-MA0312.1')[[1]]) |
|
44 |
+ printf ('--- %s', 'ScerTF-Scerevisiae-YKL222C-zhu') |
|
45 |
+ print (subset (mdb, name=='ScerTF-Scerevisiae-YKL222C-zhu')[[1]]) |
|
46 |
+ |
|
47 |
+} # display.logos |
|
48 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
49 |
+display.logos = function () |
|
50 |
+{ |
|
51 |
+ mdb = MotifDb (quiet=TRUE) |
|
52 |
+ par (mfrow=c (3,2)) |
|
53 |
+ seqLogo (subset (mdb, name=='ScerTF-Scerevisiae-PDR8-badis')[[1]]) |
|
54 |
+ #readline () |
|
55 |
+ seqLogo (subset (mdb, name=='JASPAR_CORE-Scerevisiae-YDR520C-MA0422.1')[[1]]) |
|
56 |
+ seqLogo (subset (mdb, name=='ScerTF-Scerevisiae-PDR8-badis')[[1]]) |
|
57 |
+ seqLogo (subset (mdb, name=='ScerTF-Scerevisiae-YLR278C-badis')[[1]]) |
|
58 |
+ seqLogo (subset (mdb, name=='JASPAR_CORE-Scerevisiae-HAP1-MA0312.1')[[1]]) |
|
59 |
+ seqLogo (subset (mdb, name=='ScerTF-Scerevisiae-YKL222C-zhu')[[1]]) |
|
60 |
+ |
|
61 |
+} # display.logos |
|
62 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
63 |
+MotIV.toTable = function (match) |
|
64 |
+{ |
|
65 |
+ stopifnot (length (match@bestMatch) == 1) |
|
66 |
+ alignments = match@bestMatch[[1]]@aligns |
|
67 |
+ |
|
68 |
+ df = data.frame (stringsAsFactors=FALSE) |
|
69 |
+ for (alignment in alignments) { |
|
70 |
+ x = alignment |
|
71 |
+ name = x@TF@name |
|
72 |
+ eVal = x@evalue |
|
73 |
+ sequence = x@sequence |
|
74 |
+ match = x@match |
|
75 |
+ strand = x@strand |
|
76 |
+ df = rbind (df, data.frame (name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE)) |
|
77 |
+ } # for alignment |
|
78 |
+ |
|
79 |
+ return (df) |
|
80 |
+ |
|
81 |
+} # MotIV.toTable |
|
82 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
83 |
+test.MotIV.toTable = function () |
|
84 |
+{ |
|
85 |
+ print ('--- test.MotIVtoTable') |
|
86 |
+ mdb = MotifDb (quiet=TRUE) |
|
87 |
+ test.hits = motifMatch (mdb[1]@listData, database=jaspar) |
|
88 |
+ tbl.hits = MotIV.toTable (test.hits) |
|
89 |
+ checkEquals (dim (tbl.hits), c (5, 5)) |
|
90 |
+ checkEquals (colnames (tbl.hits), c ("name", "eVal", "sequence", "match", "strand")) |
|
91 |
+ |
|
92 |
+} # test.MotIV.toTable |
|
93 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
94 |
+demo.tomtom = function () |
|
95 |
+{ |
|
96 |
+ mdb = MotifDb (quiet=TRUE) |
|
97 |
+ mdb.01 = mdb [1035] |
|
98 |
+ max = length (mdb) |
|
99 |
+ |
|
100 |
+ mdb.many = mdb [1:max] |
|
101 |
+ export (mdb.01, 'mdb1.text', 'meme') |
|
102 |
+ export (mdb.many, 'mdbMany.text', 'meme') |
|
103 |
+ # find similarity of motif #1 to all the motifs in mdbMany |
|
104 |
+ cmd = sprintf ('tomtom -no-ssc -oc ../tomtomTest -verbosity 3 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10 %s %s', |
|
105 |
+ 'mdb1.text', 'mdbMany.text') |
|
106 |
+ system (cmd) |
|
107 |
+ cmd = 'open ../tomtomTest/tomtom.html' # file.path, browseURL |
|
108 |
+ system (cmd) |
|
109 |
+ |
|
110 |
+} # demo.tomtom |
|
111 |
+#------------------------------------------------------------------------------------------------------------------------ |
0 | 112 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,304 @@ |
1 |
+Index: MotifList-class.R |
|
2 |
+=================================================================== |
|
3 |
+--- MotifList-class.R (revision 67138) |
|
4 |
+@@ -4,10 +4,30 @@ |
|
5 |
+ prototype(elementType="matrix") |
|
6 |
+ ) |
|
7 |
+ |
|
8 |
+-setGeneric ('add', signature='obj', function (obj, matrices, tbl.metadata) standardGeneric ('add')) |
|
9 |
++setValidity("MotifList", function(object) { |
|
10 |
++ msg <- NULL |
|
11 |
++ |
|
12 |
++ ## what makes for a valid MotifList? |
|
13 |
++ if (is.null(names(object))) |
|
14 |
++ msg <- c(msg, "'names()' must not be NULL") |
|
15 |
++ else if (any(duplicated(names(object)))) |
|
16 |
++ msg <- c(msg, "all 'names()' must be unique") |
|
17 |
++ ## all(sapply(object, class) == "matrix") works for 0-length object |
|
18 |
++ if (!all(sapply(object, class) == "matrix")) |
|
19 |
++ msg <- c(msg, "all elements must be of class 'matrix'") |
|
20 |
++ if (!all(sapply(object, nrow) == 4)) |
|
21 |
++ msg <- c(msg, "all elements must have 4 rows") |
|
22 |
++ ok <- sapply(object, function(elt) all(colSums(elt) == 1)) |
|
23 |
++ if (!all(ok)) |
|
24 |
++ msg <- c(msg, "all elements must have colSums equal to 1") |
|
25 |
++ |
|
26 |
++ if (is.null(msg)) TRUE else msg |
|
27 |
++}) |
|
28 |
++ |
|
29 |
++setGeneric ('add', function (obj, matrices, tbl.metadata) standardGeneric ('add'), |
|
30 |
++ signature='obj') |
|
31 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
32 |
+ MotifList = function (matrices=list(), |
|
33 |
+- names=character(), # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a |
|
34 |
+ providerNames=character(), # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
35 |
+ providerIds=character(), # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
36 |
+ geneSymbols=character(), # ABF2, Rhox11 |
|
37 |
+@@ -22,27 +42,9 @@ |
|
38 |
+ dataSources=character(), # ScerTF, UniPROBE |
|
39 |
+ experimentTypes=character(), # NA, protein-binding microarray |
|
40 |
+ pubmedIDs=character(), # 19111667, 1858359 |
|
41 |
+- tfFamilies=character()) # NA, NA |
|
42 |
++ tfFamilies=character()) |
|
43 |
+ { |
|
44 |
+- names (matrices) = names |
|
45 |
+- count = length (matrices) |
|
46 |
+- stopifnot (length (names) == count) |
|
47 |
+- stopifnot (length (providerNames) == count) |
|
48 |
+- stopifnot (length (providerIds) == count) |
|
49 |
+- stopifnot (length (geneSymbols) == count) |
|
50 |
+- stopifnot (length (geneIds) == count) |
|
51 |
+- stopifnot (length (geneIdTypes) == count) |
|
52 |
+- stopifnot (length (proteinIds) == count) |
|
53 |
+- stopifnot (length (proteinIdTypes) == count) |
|
54 |
+- stopifnot (length (sequenceCounts) == count) |
|
55 |
+- stopifnot (length (organisms) == count) |
|
56 |
+- stopifnot (length (bindingDomains) == count) |
|
57 |
+- stopifnot (length (bindingSequences) == count) |
|
58 |
+- stopifnot (length (dataSources) == count) |
|
59 |
+- stopifnot (length (experimentTypes) == count) |
|
60 |
+- stopifnot (length (pubmedIDs) == count) |
|
61 |
+- stopifnot (length (tfFamilies) == count) |
|
62 |
+- |
|
63 |
++ ## these are redundant with DataFrame validity method |
|
64 |
+ elementMetadata = DataFrame (#name=names, |
|
65 |
+ providerName=providerNames, |
|
66 |
+ providerId=providerIds, |
|
67 |
+@@ -63,45 +65,33 @@ |
|
68 |
+ |
|
69 |
+ } # ctor |
|
70 |
+ #------------------------------------------------------------------------------------------------------------------------ |
|
71 |
++## quiet vs. verbose |
|
72 |
+ MotifDb = function (loadAllSources=TRUE, quiet=TRUE) |
|
73 |
+ { |
|
74 |
+ mdb <- MotifList () |
|
75 |
+ if (loadAllSources) { |
|
76 |
+- data.path = paste (system.file (package='MotifDb'), 'data', sep='/') |
|
77 |
++ data.path = system.file(package="MotifDb", "data") |
|
78 |
+ data.files = dir (data.path, full.names=TRUE) |
|
79 |
+- sources.to.exclude = c ('ScerTF.RData') # pending minor revisions |
|