Browse code

Adds PADOG, RMassBank, ChIPXpress, MotifDb, and GeneNetworkBuilder to the repos.

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@68872 bc3139a8-67e5-0310-9ffc-ced21a209358

m.carlson authored on 28/08/2012 17:36:36
Showing 45 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,14 @@
1
+Package: MotifDb
2
+Type: Package
3
+Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
+Version: 0.99.80
5
+Date: 2012-08-12
6
+Author: Paul Shannon
7
+Maintainer: <pshannon@fhcrc.org>
8
+Depends: R (>= 2.15.0), methods, IRanges, Biostrings
9
+Suggests: RUnit, MotIV, seqLogo
10
+Imports: BiocGenerics, rtracklayer
11
+Description: More than 2000 annotated position frequency matrices from five public source, for multiple organisms
12
+License: Artistic-2.0
13
+LazyLoad: yes
14
+biocViews: GenomicSequence, MotifAnnotation
0 15
new file mode 100644
... ...
@@ -0,0 +1,16 @@
1
+exportClasses (MotifList)
2
+exportMethods (
3
+   subset,
4
+   export,
5
+   show,
6
+   query
7
+   )
8
+
9
+export(
10
+    MotifDb
11
+)
12
+
13
+importMethodsFrom(IRanges, rbind, eval, sapply, subset, values)
14
+importFrom(IRanges, DataFrame)
15
+importFrom(rtracklayer, export)
16
+
0 17
new file mode 100644
... ...
@@ -0,0 +1,265 @@
1
+setGeneric('query', signature='object', function(object, queryString, ignore.case=TRUE)
2
+            standardGeneric ('query'))
3
+#-------------------------------------------------------------------------------
4
+setClass ('MotifList',
5
+          contains='SimpleList',
6
+          representation (elementMetadata='DataFrame'),
7
+          prototype(elementType='matrix')
8
+          )
9
+
10
+setValidity('MotifList', function(object) {
11
+    msg = NULL
12
+      ## what makes for a valid MotifList?
13
+    if (length (object) ==  0)
14
+       return (TRUE)
15
+    if (is.null(names(object)))
16
+        msg = c(msg, '"names()" must not be NULL')
17
+    else if (any(duplicated(names(object))))
18
+        msg = c(msg, 'all "names()" must be unique')
19
+    if (!all(sapply(object, is.matrix)))
20
+        msg = c(msg, 'all matrices must be of class "matrix"')
21
+    if (!all(sapply(object, nrow) == 4))
22
+        msg = c(msg, 'all matrices must have 4 rows')
23
+      # all columns of all matrices should be normalized, summing to one.
24
+      # in fact, 2/2086 matrices
25
+      #   Cparvum-UniPROBE-Cgd2_3490.UP00395
26
+      #   Pfalciparum-UniPROBE-PF14_0633.UP00394
27
+      # fail to pass this test.  rounding to one decimal place allows these matrices,
28
+      # also, to pass.   round (0.98, digits=1) --> 1.0
29
+      # see the UniPROBE manpage for the full story on these two matrices
30
+    ok = sapply(object, function(elt) all (round (colSums(elt), digits=1) == 1))
31
+    if (!all(ok))
32
+       msg = c(msg, 'all elements must have colSums equal to 1')
33
+
34
+    if (is.null(msg)) TRUE else msg
35
+})
36
+#-------------------------------------------------------------------------------
37
+MotifList = function (matrices=list(), tbl.metadata=data.frame ())
38
+{  
39
+  if (nrow (tbl.metadata) == 0)
40
+    df = DataFrame ()
41
+  else
42
+    df = DataFrame (tbl.metadata, row.names = rownames (tbl.metadata))
43
+  
44
+  object = new ('MotifList', listData=matrices, elementMetadata = df)
45
+
46
+  if (length (matrices) == 0)
47
+    names (object) = character ()
48
+  else 
49
+    names (object) = rownames (tbl.metadata)
50
+
51
+  object
52
+  
53
+} # ctor
54
+#-------------------------------------------------------------------------------
55
+setMethod ('subset', signature = 'MotifList',
56
+
57
+  function (x, subset, select, drop = FALSE, ...) {
58
+
59
+    if (missing (subset)) 
60
+      i = TRUE
61
+    else {
62
+      i = eval(substitute (subset), elementMetadata (x), parent.frame ())
63
+      i = try(as.logical(i), silent=TRUE)
64
+      if (inherits(i, 'try-error'))
65
+        stop('"subset" must be coercible to logical')
66
+      i = i & !is.na(i)
67
+      } # else
68
+    return (x [i])
69
+  })
70
+
71
+#-------------------------------------------------------------------------------
72
+# transpose 4-row matrices to 4-column, so that there are as many rows as
73
+# there are nucleotides in the motif sequence. meme requires normalized matrices
74
+# exactly as we store them
75
+transformMatrixToMemeRepresentation = function (m)
76
+{
77
+  return (t (m))
78
+
79
+} # transformMatrixToMemeRepresentation
80
+#-------------------------------------------------------------------------------
81
+# http://stuff.mit.edu/afs/athena/software/meme_v3.5.4/etc/meme-explanation.html
82
+# The motif itself is a position-specific probability matrix giving, for each
83
+# position in the pattern, the observed  frequency ('probability') of each
84
+# possible letter. The probability matrix is printed 'sideways'--columns
85
+# correspond  to the letters in the alphabet (in the same order as shown in
86
+# the simplified motif) and rows corresponding to the  positions of the motif,
87
+# position one first. The motif is preceded by a line starting with
88
+# 'letter-probability matrix:' and containing the length of the alphabet,
89
+# width of the motif, number of occurrences of the motif, and the E-value
90
+# of the motif.
91
+matrixToMemeText = function (matrices)
92
+{
93
+  matrix.count = length (matrices)
94
+
95
+    # incoming matrices have nucleotide rows, position columns.  meme
96
+    # format, however, requires position-rows, and nucleotide-columns
97
+    # calculate the number of lines of text by counting columns
98
+  total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol)))
99
+  predicted.line.count = 12 + (3 * length (matrices)) +
100
+                           total.transposed.matrix.rows
101
+  #s = vector ('character', predicted.line.count)
102
+  s = character (predicted.line.count)
103
+
104
+  s [1] = 'MEME version 4'
105
+  s [2] = ''
106
+  s [3] = 'ALPHABET= ACGT'
107
+  s [4] = ''
108
+  s [5] = 'strands: + -'
109
+  s [6] = ''
110
+  s [7] = 'Background letter frequencies'
111
+  s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 '
112
+  s [9] = '' 
113
+
114
+  index = 10
115
+  for (name in names (matrices)) {
116
+       # transpose the frequency matrix version of the incoming matrix,
117
+       # hence 'tfMat'
118
+    tfMat = transformMatrixToMemeRepresentation (matrices [[name]])
119
+       # meme output may be used by tomtom, which uses matrix names as
120
+       # part of image filenames. removed all file-system-unfriendly
121
+       # characters here
122
+    fixed.name = gsub ('\\/', '_', name)
123
+    s [index] = sprintf ('MOTIF %s', fixed.name)
124
+    index = index + 1
125
+    new.line =
126
+       sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=8.1e-020',
127
+          nrow (tfMat), 45, 8.1e-020)
128
+    s [index] =  new.line
129
+    index = index + 1
130
+    for (r in 1:nrow (tfMat)) {
131
+      new.row = sprintf (' %12.10f  %12.10f  %12.10f  %12.10f', tfMat [r,1],
132
+                          tfMat [r,2], tfMat [r,3], tfMat [r,4])
133
+      s [index] = new.row
134
+      index = index + 1
135
+      }
136
+    s [index] = ''
137
+    index = index + 1
138
+    } # for name
139
+
140
+  invisible (s)
141
+
142
+} # matrixToMemeText
143
+#-------------------------------------------------------------------------------
144
+# connection is a character string, create a file by that name, open the file.
145
+# dispatch to export which dispatches on con='connection'
146
+
147
+setMethod ('export',  signature=c(object='MotifList', con='character',
148
+                                  format='character'),
149
+
150
+  function (object, con, format, ...) {
151
+      ## do minimum work unique to this method, then dispatch to avoid
152
+      ## code duplication
153
+    con = file (con, 'w')
154
+    on.exit(close(con))
155
+    export(object, con, format, ...)
156
+    })
157
+
158
+#-------------------------------------------------------------------------------
159
+# write to connection with specified format
160
+# format includes TRANSFAC, meme (also good for tomtom), and tsv
161
+
162
+setMethod ('export',  signature=c(object='MotifList', con='connection',
163
+                                 format='character'),
164
+
165
+  function (object, con, format, ...) {
166
+
167
+    fmt = match.arg (tolower (format), c ('meme', 'transfac'))
168
+    ## match.arg fails if !fmt %in% c('meme', 'transfac'), so no need
169
+    ## for test
170
+    ## let the user manage opened cons
171
+    if (!isOpen(con)) {
172
+        open(con)
173
+        on.exit(close(con))
174
+    }
175
+    if (fmt == 'meme') 
176
+      text = matrixToMemeText (object)
177
+    cat (text, sep='\n', file=con)
178
+    })
179
+
180
+#-------------------------------------------------------------------------------
181
+# write to connection, using default format,  ??? for matrix list, tsv for
182
+# metadata
183
+setMethod ('export',  signature=c(object='MotifList',  con='missing',
184
+                                 format='character'),
185
+
186
+  function (object, con, format,  ...) {
187
+
188
+    fmt = match.arg (tolower (format), c ('meme')) # , 'transfac'
189
+    if (fmt == 'meme') {
190
+      text = paste (matrixToMemeText (object), collapse='\n')
191
+      cat (text)
192
+      invisible (text)
193
+      }
194
+    })
195
+
196
+#-------------------------------------------------------------------------------
197
+setMethod('show', 'MotifList',
198
+
199
+    function(object) {
200
+        
201
+      msg = sprintf ('MotifDb object of length %d', length (object))
202
+      cat (msg, '\n', sep='')
203
+      if (length (object) == 0)
204
+        return ()
205
+      
206
+      cat ('| Created from downloaded public sources: 2012-Jul6', '\n', sep='')
207
+
208
+      tbl.dataSource = as.data.frame (table (values (object)$dataSource))
209
+      tbl.org = as.data.frame (table (values (object)$organism))
210
+      tbl.org = head (tbl.org [order (tbl.org$Freq, decreasing=TRUE),])
211
+      totalMatrixCount = length (object)
212
+      totalOrganismCount = length (unique (values (object)$organism))
213
+      dataSourceCount = nrow (tbl.dataSource)
214
+
215
+      source.singular.or.plural = 'sources'
216
+      if (dataSourceCount == 1)
217
+        source.singular.or.plural = 'source'
218
+      
219
+      msg = sprintf ('| %d position frequency matrices from %d %s:',
220
+                     totalMatrixCount, dataSourceCount, source.singular.or.plural)
221
+      cat (msg, '\n', sep='')
222
+      for (r in 1:nrow (tbl.dataSource)) {
223
+         dataSource = tbl.dataSource$Var1 [r]
224
+         matrixCount = tbl.dataSource$Freq [r]
225
+         msg = sprintf ('| %18s: %4d', dataSource, matrixCount)
226
+         cat (msg, '\n', sep='')
227
+         } # for r
228
+      msg = sprintf ('| %d organism/s', totalOrganismCount)
229
+      cat (msg, '\n', sep='')
230
+      for (r in 1:nrow (tbl.org)) {
231
+         organism = tbl.org$Var1 [r]
232
+         orgCount = tbl.org$Freq [r]
233
+         msg = sprintf ('| %18s: %4d', organism, orgCount)
234
+         cat (msg, '\n', sep='')
235
+         } # for r
236
+      otherOrgCount = totalMatrixCount - sum (tbl.org$Freq)
237
+      if (otherOrgCount > 0) {
238
+        msg = sprintf ('| %18s: %4d', 'other', otherOrgCount)
239
+        cat (msg, '\n', sep='')
240
+        }
241
+    if (!is.null (names (object))) {
242
+      all.names = names (object)
243
+      count = length (all.names)
244
+      if (count <= 10)
245
+        cat (paste (all.names, '\n'), sep='')
246
+      else {
247
+        cat (paste (all.names [1:5], '\n'), sep='')
248
+        cat ('...', '\n', sep='')
249
+        end = length (all.names)
250
+        start = end - 4
251
+        cat (paste (all.names [start:end], '\n'), sep='')
252
+        }
253
+      }
254
+    })
255
+#-------------------------------------------------------------------------------
256
+setMethod ('query', 'MotifList',
257
+
258
+   function (object, queryString, ignore.case=TRUE) {
259
+       indices = unique (as.integer (unlist (sapply (colnames (values (object)), 
260
+                    function (colname) 
261
+                       grep (queryString, values (object)[, colname], 
262
+                             ignore.case=ignore.case)))))
263
+        object [indices]
264
+      })
265
+#-------------------------------------------------------------------------------
0 266
new file mode 100644
... ...
@@ -0,0 +1,36 @@
1
+MotifDb <- NULL
2
+#-------------------------------------------------------------------------------
3
+.MotifDb = function (loadAllSources=TRUE, quiet=TRUE)
4
+{
5
+  mdb = MotifList ()
6
+
7
+  if (loadAllSources) {
8
+    data.path = system.file ('extdata', package='MotifDb') # e (system.file (package='MotifDb'), 'data', sep='/')
9
+    data.files = dir (data.path, full.names=TRUE)
10
+  
11
+    if (length (data.files) > 0)   
12
+      for (data.file in data.files) {
13
+         # define these to keep 'check' happy.  they are loaded by 'load'
14
+        tbl.md = NA; matrices = NA;  
15
+        variables = load (data.file)
16
+        mdb = append (mdb, MotifList (matrices, tbl.md))
17
+        if (!quiet) 
18
+          message (noquote (sprintf ('added %s (%d) matrices, length now: %d',
19
+                   basename (data.file), length (matrices), length (mdb))))
20
+      } # for data.file
21
+  
22
+    if (!quiet) {
23
+      print (table (values (mdb)$dataSource))
24
+      }
25
+    } # if loadAllSources
26
+
27
+  return (mdb)
28
+
29
+} # MotifDb
30
+#-------------------------------------------------------------------------------
31
+.onLoad <- function(libname, pkgname)
32
+{
33
+    MotifDb <<- .MotifDb (loadAllSources=TRUE, quiet=TRUE)
34
+}
35
+#-------------------------------------------------------------------------------
36
+
0 37
new file mode 100644
... ...
@@ -0,0 +1,436 @@
1
+\documentclass{article}
2
+%% %\VignetteIndexEntry{MotifDb Overview}
3
+%% %\VignettePackage{MotifDb}
4
+\usepackage[noae]{Sweave}
5
+\usepackage[left=0.5in,top=0.5in,right=0.5in,bottom=0.75in,nohead,nofoot]{geometry} 
6
+\usepackage{hyperref}
7
+\usepackage[noae]{Sweave}
8
+\usepackage{color}
9
+\usepackage{graphicx}
10
+\usepackage{caption}
11
+\usepackage{subcaption}
12
+
13
+\definecolor{Blue}{rgb}{0,0,0.5}
14
+\definecolor{Green}{rgb}{0,0.5,0}
15
+
16
+\RecustomVerbatimEnvironment{Sinput}{Verbatim}{%
17
+  xleftmargin=1em,%
18
+  fontsize=\small,%
19
+  fontshape=sl,%
20
+  formatcom=\color{Blue}%
21
+  }
22
+\RecustomVerbatimEnvironment{Soutput}{Verbatim}{%
23
+  xleftmargin=0em,%
24
+  fontsize=\scriptsize,%
25
+  formatcom=\color{Blue}%
26
+  }
27
+\RecustomVerbatimEnvironment{Scode}{Verbatim}{xleftmargin=2em}
28
+
29
+
30
+
31
+\renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}}
32
+\fvset{listparameters={\setlength{\topsep}{6pt}}}
33
+% These determine the rules used to place floating objects like figures 
34
+% They are only guides, but read the manual to see the effect of each.
35
+\renewcommand{\topfraction}{.99}
36
+\renewcommand{\bottomfraction}{.99}
37
+\renewcommand{\textfraction}{0.0}
38
+
39
+\title{MotifDb} 
40
+\author{Paul Shannon}
41
+
42
+\begin{document} 
43
+
44
+\maketitle
45
+\begin{abstract}
46
+Many kinds of biological activity are regulated by the binding of proteins to their cognate
47
+substrates.  Of particular interest is the sequence-specific binding of transcription factors to DNA, often in
48
+regulatory regions just upstream of the transcription start site of a gene.  These binding events play a pivotal
49
+role in regulating gene expression.  Sequence specificity among closely related binding sites is nearly always incomplete: some variety
50
+in the DNA sequence is routinely observed.  For this reason, these inexact binding sequence patterns are commonly
51
+described as \emph{motifs} represented numerically as frequency matrices, and visualized as sequence logos.  Despite their importance
52
+in current research, there has been until now no single, annotated, comprehensive collection of publicly available motifs.
53
+The current package provides such a collection, offering more than two thousand annotated matrices from multiple organisms, within the
54
+context of the Bioconductor project.  The matrices can be filtered and selected on the basis of their metadata, used with other
55
+Bioconductor packages (MotIV for motif comparison, seqLogo for visualization) or easily exported for use with 
56
+standard software and websites such as those provided by the MEME Suite\footnote{http://meme.sdsc.edu/meme/doc/meme.html}.
57
+\end{abstract}
58
+
59
+\tableofcontents
60
+
61
+\section{Introduction and Basic Operations}
62
+
63
+The first step is to load the necessary packages:
64
+
65
+<<libraries>>=
66
+library (MotifDb)
67
+library (MotIV)
68
+library (seqLogo)
69
+@ 
70
+
71
+<<hiddenCode results=hide, echo=FALSE>>=
72
+MotIV.toTable = function (match) {
73
+  if (length (match@bestMatch) == 0)
74
+    return (NA)
75
+  
76
+  alignments = match@bestMatch[[1]]@aligns
77
+
78
+  df = data.frame (stringsAsFactors=FALSE)
79
+  for (alignment in alignments) {
80
+    x = alignment
81
+    name = x@TF@name
82
+    eVal = x@evalue
83
+    sequence = x@sequence
84
+    match = x@match
85
+    strand = x@strand
86
+    df = rbind (df, data.frame (name=name, eVal=eVal, sequence=sequence, 
87
+                                match=match, strand=strand, stringsAsFactors=FALSE))
88
+    } # for alignment
89
+  return (df)
90
+  } # MotIV.toTable 
91
+@ 
92
+
93
+%% MotifDb provides two kinds of loosely linked data:  position frequency matrices, and metadata about each matrix.  The matrix
94
+%% names, and the rownames of the metadata table, are identical, so it is easy to map back and forth between
95
+%% the two.  Some measure of convenience is gained by extracting these two kinds of data into separate variables,
96
+%% as we shall see.  The cost in extra memory should not significant.
97
+%% 
98
+%% <<all.matrices>>=
99
+%% matrices.all = as.list (MotifDb)
100
+%% metadata <- values (MotifDb)
101
+%% @ 
102
+There are  more than two thousand  matrices, from five sources:
103
+<<sources>>=
104
+length (MotifDb)
105
+sort (table (values (MotifDb)$dataSource), decreasing=TRUE)
106
+@ 
107
+And 22 organisms (though the majority of the matrices come from just four):
108
+<<organisms>>=
109
+sort (table (values (MotifDb)$organism), decreasing=TRUE)
110
+@ 
111
+
112
+With these categories of metadata
113
+<<metadata>>=
114
+colnames (values (MotifDb))
115
+@ 
116
+\section{Selection}
117
+
118
+There are three ways to extract subsets of interest from the MotifDb collection.  All three operate upon the MotifDb metadata,
119
+matching values in one or more of those fifteen attributes (listed just above), and returning the subset of MotifDb  which 
120
+meet the specified criteria.  The three techniques:  \emph{query}, \emph{subset} and \emph{grep}
121
+
122
+\subsection{query}
123
+This is the simplest technique to use, and will suffice in many circumstances.  For example, if you want 
124
+all of the human matrices:
125
+<<queryHuman>>=
126
+query (MotifDb, 'hsapiens')
127
+@ 
128
+If you want all matrices associated with \textbf{\emph{Sox}} transcription factors, regardless of dataSource or organism:
129
+<<querySox>>=
130
+query (MotifDb, 'sox')
131
+@ 
132
+For all yeast transcription factors with a homeo domain
133
+<<queryYeastHomeo>>=
134
+query (query (MotifDb, 'cerevisiae'), 'homeo')
135
+@ 
136
+The last example may inspire more confidence in the precision of the result than is justified, and for a couple
137
+of reasons.  First, the assignment of  protein binding domains to specific categories is, as of 2012, an ad hoc 
138
+and incomplete process.  Second, the query commands matches the supplied character string to \emph{all} metadata
139
+columns.  In this case, 'homeo' appears both in the \emph{bindingDomain} column and the \emph{tfFamily} column,
140
+and the above \emph{query} will return matches from both.
141
+Searching and filtering should always be accompanined by close scrutiny of the data, such as these commands
142
+illustrate:
143
+
144
+<<homeoVariety>>=
145
+unique (grep ('homeo', values(MotifDb)$bindingDomain, ignore.case=T, v=T))
146
+unique (grep ('homeo', values(MotifDb)$tfFamily, ignore.case=T, v=T))
147
+@ 
148
+\subsection{grep}
149
+This selection method (and the next, \emph{subset}) require that you address metadata columns explicitly.  This is a little more
150
+work, but the requisite direct engagement with the metadata is worthwhile.  Repeating the 'query' examples from above,
151
+you can see how more knowedge of MotifDb metadata is required.
152
+<<grepHuman>>=
153
+mdb.human <- MotifDb [grep ('Hsapiens', values (MotifDb)$organism)]
154
+mdb.sox <- MotifDb [grep ('sox', values (MotifDb)$geneSymbol, ignore.case=TRUE)]
155
+yeast.indices = grepl ('scere', values (MotifDb)$organism, ignore.case=TRUE)
156
+homeo.indices.domain = grepl ('homeo', values (MotifDb)$bindingDomain, ignore.case=TRUE)
157
+homeo.indices.family = grepl ('homeo', values (MotifDb)$tfFamily, ignore.case=TRUE)
158
+yeast.homeo.indices = yeast.indices & (homeo.indices.domain | homeo.indices.family)
159
+yeast.homeoDb = MotifDb [yeast.homeo.indices]
160
+@ 
161
+
162
+An alternate and somewhat more compact approach:
163
+<<withHomeo>>=
164
+yeast.homeo.indices <- with(values(MotifDb),
165
+  grepl('scere', organism, ignore.case=TRUE) &
166
+    (grepl('homeo', bindingDomain, ignore.case=TRUE) |
167
+     grepl('homeo', tfFamily, ignore.case=TRUE)))
168
+
169
+@ 
170
+\subsection{subset}
171
+MotifDb::subset emulates the R base data.frame \emph{subset} command, which is not unlike an SQL select function.
172
+Unfortunately -- and just like the R base subset function -- this MotifDb method cannot be used reliably  within a script:
173
+\emph{It is only reliable when called interactively.}  Here, with mixed success (as you will see) , we use MotifDb::subset to
174
+reproduce the \emph{query} and \emph{grep} selections shown above.
175
+
176
+<<subsetHuman>>=
177
+if (interactive ())
178
+  subset (MotifDb, organism=='Hsapiens')
179
+@ 
180
+One can easily find all the 'sox' genes with the subset command, avoiding possible upper/lower case conflicts by passing
181
+the metadata's geneSymbol column through the function 'tolower':
182
+<<subsetSox>>=
183
+if (interactive ())
184
+  subset (MotifDb, tolower (geneSymbol) == 'sox4')
185
+@ 
186
+Similarly, subset has limited application for a permissive 'homeo' search.
187
+But for the retrieval by explicitly specified search terms, subset works very well:
188
+<<subsetYeastHomeo>>=
189
+if (interactive ())
190
+  subset (MotifDb, organism=='Scerevisiae' & bindingDomain=='Homeo')
191
+@ 
192
+
193
+\subsection{The Egr1 Case Study}
194
+
195
+We now do a simple geneSymbol search, followed by an examination of the sub-MotifDb the search returns.  We are looking for all matrices
196
+associated with the well-known and highly conserved zinc-finger transcription factor, Egr1.
197
+There are two of these in MotifDb, both from mouse, and each from a different data source.
198
+
199
+<<findEgr1>>=
200
+  # subset is convenient: 
201
+if (interactive ())
202
+  as.list (subset (MotifDb, tolower (geneSymbol) == 'egr1'))
203
+  # grep returns indices which allow for more flexibility
204
+indices = grep ('egr1', values (MotifDb)$geneSymbol, ignore.case=TRUE)  
205
+length (indices)
206
+@ 
207
+There are a variety of ways to examine and extract data from this object, a MotifList of length 2.  
208
+<<MotifDbViews>>=
209
+MotifDb [indices]
210
+@ 
211
+
212
+Now view the matrices as a named list:
213
+<<as.list>>=
214
+as.list (MotifDb [indices])
215
+@ 
216
+and finally, the metadata associated with these two matrices, transposed, for easy reading and comparison:
217
+<<as.metadata>>=
218
+noquote (t (as.data.frame (values (MotifDb [indices]))))
219
+@ 
220
+ 
221
+We used the \emph{grep} function above to find rows in the metadata table whose \emph{geneSymbol} column includes the string 'Egr1'.
222
+If you wish to identify matrices (and/or their attendant metadata) based upon a richer combination of criteria, for instance:
223
+
224
+\begin{enumerate}
225
+  \item organism  (\emph{Mmusculus})
226
+  \item gene symbol  (\emph{Egr1})
227
+  \item data source  (\emph{JASPAR\_CORE})
228
+\end{enumerate}
229
+
230
+the grep solution, while serviceable, becomes a little awkward:
231
+<<egr1-multi-grep>>=
232
+
233
+geneSymbol.rows = grep ('Egr1', values (MotifDb)$geneSymbol, ignore.case=TRUE)
234
+organism.rows = grep ('Mmusculus', values (MotifDb)$organism, ignore.case=TRUE)
235
+source.rows = grep ('JASPAR', values (MotifDb)$dataSource, ignore.case=TRUE)
236
+egr1.mouse.jaspar.rows = intersect (geneSymbol.rows, 
237
+                           intersect (organism.rows, source.rows))
238
+print (egr1.mouse.jaspar.rows)
239
+egr1.motif <- MotifDb [egr1.mouse.jaspar.rows]
240
+@ 
241
+<<MotifDbViews>>=
242
+@ 
243
+
244
+Far more concise, and fully reliable as an interactive command (though \emph{not} if used in a 
245
+script\footnote{See the help page of the base R command subset for detail), is the \emph{subset} command}):
246
+<<subsetSearchForEgr1>>=
247
+if (interactive ()) {
248
+  egr1.motif <- subset (MotifDb, organism=='Mmusculus' & 
249
+                        dataSource=='JASPAR_CORE' & 
250
+                        geneSymbol=='Egr1')
251
+  }
252
+@ 
253
+Whichever method you use, this next chunk of code displays the matrix, and then the metadata for mouse JASPAR Egr1, the latter
254
+textually-transformed for easy reading within the size constraints of this page.
255
+<<examine-egr1>>=
256
+egr1.motif
257
+as.list (egr1.motif)
258
+noquote (t (as.data.frame (values (egr1.motif))))
259
+@ 
260
+
261
+
262
+Next we use the bioconductor \emph{seqLogo} package to display this motif.
263
+
264
+<<egr1, fig=TRUE, include=FALSE>>=
265
+seqLogo (as.list (egr1.motif)[[1]])
266
+@ 
267
+
268
+\begin{figure}[htpb!]
269
+  \centering
270
+  \includegraphics[width=0.3\textwidth]{MotifDb-egr1}
271
+  \caption{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}
272
+\end{figure}
273
+  
274
+\section{Motif Matching}
275
+We will look for the ten position frequency matrices which are the best match to JASPAR's mouse EGR1, using
276
+the MotIV package.  We actually request the top eleven hits from the entire MotifDb, since the first hit 
277
+should be the target matrix itself, since that is of necessity found in the full MotifDb.
278
+
279
+<<motifmatch>>=
280
+egr1.hits <- motifMatch (as.list (egr1.motif) [1], as.list (MotifDb), top=11)
281
+# 'MotIV.toTable' -- defined above (and hidden) -- will become part of MotIV in the upcoming release
282
+tbl.hits <- MotIV.toTable (egr1.hits)
283
+print (tbl.hits)
284
+@ 
285
+
286
+The \emph{sequence} column in this table is the \emph{consensus sequence} -- with heterogeneity left out -- for the 
287
+matrix it describes.   
288
+
289
+\vspace{10 mm}
290
+
291
+\textbf{\emph{Puzzling: the strand of the match reported above is opposite of what I expected, and opposite of what seqLogo displays.
292
+  This is a question for the MotIV developers.}}
293
+
294
+\vspace{10 mm}
295
+
296
+The six logos appear below, beginning with the logo of the query matrix, \emph{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}, including
297
+two other mouse matrices, and two zinc-finger fly matrices.  Examining the three mouse matrices and their metadata reveals that
298
+all three (geneSymbol differences aside) describe the same protein:
299
+<<three.mice.metadata>>=
300
+if (interactive ())
301
+  noquote (t (as.data.frame (subset (values (MotifDb), geneId=='13653'))))
302
+@ 
303
+Zinc finger protein domains are classified into many \emph{fold groups}; their respective cognate DNA sequence may classify similarly.
304
+That two fly matrices significantly match three reports of the mouse Egr1 motif suggests impressive conservation of this 
305
+binding pattern, or convergent evolution.  
306
+
307
+Let us look at the metadata for the first fly match, whose geneId is \textbf{FBgn0003499}:
308
+<<fly.Sr.metadata>>=
309
+noquote (t (as.data.frame (values (MotifDb)[grep ('FBgn0003499', values (MotifDb)$geneId),])))
310
+@ Note that the SANGER motif, based on 18 sequences, had a high fidelity match to mouse Egr1 (see above, 10e-12), but
311
+that the SOLEXA motif, based upon 2316 sequences, did not (in work not shown, it appears 22nd in the an expanded
312
+motifMatch hit list, with a eval of 10e-5).  It is possible that the SOLEXA motif is more accurate, and that a close
313
+examination of this case, including sequence logos, position frequency matrices, and the search parameters of
314
+motifMatch, will be instructive.  Repeating the search with \emph{tomtom} might also be illuminating -- either as
315
+confirmation of MotIV and the default parameterization we used, or as a correction to it.  Here we see the facilities
316
+for exploratory data analysis MotifDb provides, and the opportunities for data analysis which result.
317
+
318
+
319
+<<logo1, fig=TRUE, include=FALSE, echo=FALSE>>=
320
+  seqLogo (MotifDb [[tbl.hits$name[1]]])
321
+@
322
+
323
+<<logo2, fig=TRUE, include=FALSE, echo=FALSE>>=
324
+  seqLogo (MotifDb [[tbl.hits$name[2]]])
325
+@
326
+
327
+<<logo3, fig=TRUE, include=FALSE, echo=FALSE>>=
328
+  seqLogo (MotifDb [[tbl.hits$name[3]]])
329
+@
330
+
331
+<<logo4, fig=TRUE, include=FALSE, echo=FALSE>>=
332
+  seqLogo (MotifDb [[tbl.hits$name[4]]])
333
+@
334
+
335
+<<logo5, fig=TRUE, include=FALSE, echo=FALSE>>=
336
+  seqLogo (MotifDb [[tbl.hits$name[5]]])
337
+@
338
+
339
+<<logo6, fig=TRUE, include=FALSE, echo=FALSE>>=
340
+  seqLogo (MotifDb [[tbl.hits$name[6]]])
341
+@
342
+
343
+\begin{figure}[htpb!]
344
+  \centering
345
+  \begin{subfigure}[b]{0.38\textwidth}
346
+    \includegraphics[width=\textwidth]{MotifDb-logo1}
347
+    \caption{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}
348
+    \label{fig:Egr1-MA0162.1}
349
+    \end{subfigure}%
350
+  \begin{subfigure}[b]{0.38\textwidth}
351
+    \includegraphics[width=\textwidth]{MotifDb-logo2}
352
+    \caption{Dme-FFS-sr\_SANGER\_5\_FBgn0003499\\(abbreviated)}
353
+    \label{fig:Egr1-logo2}
354
+    \end{subfigure}%
355
+\end{figure}
356
+
357
+\begin{figure}[htpb!]
358
+  \centering
359
+  \begin{subfigure}[b]{0.38\textwidth}
360
+    \includegraphics[width=\textwidth]{MotifDb-logo3}
361
+    \caption{Mmusculus-UniPROBE-Zif268.UP00400}
362
+    \label{fig:Egr1-logo3}
363
+    \end{subfigure}%
364
+  \begin{subfigure}[b]{0.38\textwidth}
365
+    \includegraphics[width=\textwidth]{MotifDb-logo4}
366
+    \caption{Dme-FFS-klu\_SANGER\_10\_FBgn0013469}
367
+    \label{fig:Egr1-logo4}
368
+    \end{subfigure}%
369
+\end{figure}
370
+
371
+
372
+\begin{figure}[htpb!]
373
+  \centering
374
+  \begin{subfigure}[b]{0.38\textwidth}
375
+    \includegraphics[width=\textwidth]{MotifDb-logo5}
376
+    \caption{Mmusculus-UniPROBE-Egr1.UP00007}
377
+    \label{fig:Egr1-logo5}
378
+    \end{subfigure}%
379
+  \begin{subfigure}[b]{0.38\textwidth}
380
+    \centering
381
+    \includegraphics[width=\textwidth]{MotifDb-logo6}
382
+    \caption{Dme-FFS-klu\_SOLEXA\_5\_FBgn0013469}
383
+    \label{fig:Egr1-logo6}
384
+    \end{subfigure}%
385
+\end{figure}
386
+
387
+\newpage
388
+
389
+\section{Exporting to the MEME Suite}
390
+Some users of this package may wish to export the data -- both matrices and metadata -- so that they may be used in
391
+other programs.  The MEME suite, among others, is broadly useful, continuously improved and well-regarded throughout the
392
+bioinformatics community.  The code below exports all of the MotifDb matrices as a text file in the MEME format, and all
393
+of the metadata as a tab-delimited text file.
394
+
395
+<<export>>=
396
+matrix.output.file = tempfile ()   # substitute your preferred filename here
397
+meme.text = export (MotifDb, matrix.output.file, 'meme')
398
+
399
+metadata.output.file = tempfile () # substitute your preferred filename here
400
+write.table (as.data.frame (values (MotifDb)), file=metadata.output.file, sep='\t', 
401
+             row.names=TRUE, col.names=TRUE, quote=FALSE)
402
+@ 
403
+
404
+\section{Future Work}
405
+This first version of MotifDb collects into one R package all of the best-known public domain protein-DNA binding matrices, with
406
+as much metadata as could be gleaned from the five providers.  However, not all of these matrices are equally supported by data
407
+and by no means are all are accompanied by complete metadata.
408
+
409
+With the passage of time our knowledge of protein-DNA binding sequence motifs will improve.  They will be derived from more 
410
+binding events, with more precision and specificity, and accompanied by more (and better understood) contextual detail.  Cooperative binding, 
411
+mentioned only in a few times in the current (July 2012) version of this package, will be well-represented.  Metadata will improve.
412
+Better assignment of binding domains to consensus categories will be especially useful when it is available.  Three-dimensional 
413
+models of specific proteins binding to specific DNA may someday become commonplace.
414
+
415
+
416
+\section{References}
417
+
418
+\begin{itemize}
419
+
420
+\item Portales-Casamar E, Thongjuea S, Kwon AT, Arenillas D, Zhao X, Valen E, Yusuf D, Lenhard B, Wasserman WW, Sandelin A. JASPAR 2010: the greatly expanded open-access database of transcription factor binding profiles. Nucleic Acids Res. 2010 Jan;38(Database issue):D105-10. Epub 2009 Nov 11.
421
+
422
+\item Robasky K, Bulyk ML. UniPROBE, update 2011: expanded content and search tools in the online database of protein-binding microarray data on protein-DNA interactions. Nucleic Acids Res. 2011 Jan;39(Database issue):D124-8. Epub 2010 Oct 30.
423
+
424
+\item Spivak AT, Stormo GD. ScerTF: a comprehensive database of benchmarked position weight matrices for Saccharomyces species. Nucleic Acids Res. 2012 Jan;40(Database issue):D162-8. Epub 2011 Dec 2.
425
+
426
+\item Xie Z, Hu S, Blackshaw S, Zhu H, Qian J. hPDI: a database of experimental human protein-DNA interactions. Bioinformatics. 2010 Jan 15;26(2):287-9. Epub 2009 Nov 9.
427
+
428
+\item Zhu LJ, et al. 2011. FlyFactorSurvey: a database of Drosophila transcription factor binding specificities determined using the bacterial one-hybrid system. Nucleic Acids Res. 2011 Jan;39(Database issue):D111-7. Epub 2010 Nov 19.
429
+
430
+
431
+\end{itemize}
432
+
433
+
434
+\end{document}
435
+
436
+
0 437
new file mode 100644
1 438
Binary files /dev/null and b/inst/extdata/ScerTF.RData differ
2 439
new file mode 100644
3 440
Binary files /dev/null and b/inst/extdata/flyFactorSurvey.RData differ
4 441
new file mode 100644
5 442
Binary files /dev/null and b/inst/extdata/hPDI.RData differ
6 443
new file mode 100644
7 444
Binary files /dev/null and b/inst/extdata/jaspar.RData differ
8 445
new file mode 100644
9 446
Binary files /dev/null and b/inst/extdata/uniprobe.RData differ
10 447
new file mode 100644
... ...
@@ -0,0 +1,85 @@
1
+default: build install
2
+
3
+help:
4
+	egrep "^#" makefile | sed "s/^#//"
5
+
6
+clean:
7
+	- rm ../../MotifDb_*tar.gz
8
+
9
+
10
+# --- quickbuild: no vignette
11
+#
12
+quickbuild:
13
+	(cd ../..; R CMD build --no-vignettes MotifDb)
14
+
15
+
16
+# --- build
17
+#
18
+build:
19
+	(cd ../..; R CMD build --no-vignettes MotifDb)
20
+
21
+# --- install
22
+#
23
+install:
24
+	(cd ../..; R CMD install MotifDb)
25
+
26
+# --- check
27
+#
28
+check: clean build install
29
+	(cd ../..; R CMD check --no-manual --no-vignettes --no-codoc --no-examples --no-manual  MotifDb)
30
+
31
+# --- checkfull
32
+#
33
+checkfull: 
34
+	(cd ../..; R CMD build MotifDb)
35
+	(cd ../..; R CMD check MotifDb)
36
+
37
+
38
+# --- vanillaTest
39
+# run all the unit tests, in a clean context
40
+#
41
+
42
+vanillaTest:  build install
43
+	- rm vanillaTest.out
44
+	R --vanilla < vanillaTest.R > vanillaTest.out 2>&1
45
+
46
+# --- vt
47
+# run all the unit tests, in a clean context
48
+#
49
+
50
+vt: vanillaTest
51
+
52
+
53
+# --- checkvig
54
+# check just the vignette
55
+#
56
+
57
+checkvig:
58
+	(cd ../..; R CMD check --no-manual --no-codoc --no-tests --no-examples MotifDb)
59
+
60
+
61
+# --- tangle
62
+# extract the R code from the vignette file
63
+#
64
+
65
+tangle:
66
+	(cd ../vignettes; R CMD Stangle MotifDb.Rnw)
67
+
68
+
69
+
70
+# --- sweave
71
+# creates MotifDb.tex, runs all embedded examples
72
+# run this before the pdf target
73
+#
74
+sweave: 
75
+	(cd ../vignettes; R CMD Sweave MotifDb.Rnw --pdf)
76
+
77
+# --- pdf
78
+# make and open MotifDb.pdf, the vignette 
79
+#
80
+
81
+pdf:  sweave
82
+	(cd ../vignettes; open MotifDb.pdf)
83
+
84
+
85
+
0 86
new file mode 100644
... ...
@@ -0,0 +1,304 @@
1
+Index: MotifList-class.R
2
+===================================================================
3
+--- MotifList-class.R	(revision 67138)
4
+@@ -4,10 +4,30 @@
5
+           prototype(elementType="matrix")
6
+           )
7
+ 
8
+-setGeneric ('add', signature='obj', function (obj, matrices, tbl.metadata) standardGeneric ('add'))
9
++setValidity("MotifList", function(object) {
10
++    msg <- NULL
11
++
12
++    ## what makes for a valid MotifList?
13
++    if (is.null(names(object)))
14
++        msg <- c(msg, "'names()' must not be NULL")
15
++    else if (any(duplicated(names(object))))
16
++        msg <- c(msg, "all 'names()' must be unique")
17
++    ## all(sapply(object, class) == "matrix") works for 0-length object
18
++    if (!all(sapply(object, class) == "matrix"))
19
++        msg <- c(msg, "all elements must be of class 'matrix'")
20
++    if (!all(sapply(object, nrow) == 4))
21
++        msg <- c(msg, "all elements must have 4 rows")
22
++    ok <- sapply(object, function(elt) all(colSums(elt) == 1))
23
++    if (!all(ok))
24
++        msg <- c(msg, "all elements must have colSums equal to 1")
25
++
26
++    if (is.null(msg)) TRUE else msg
27
++})
28
++
29
++setGeneric ('add', function (obj, matrices, tbl.metadata) standardGeneric ('add'),
30
++            signature='obj')
31
+ #------------------------------------------------------------------------------------------------------------------------
32
+ MotifList = function (matrices=list(),
33
+-                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
34
+                       providerNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
35
+                       providerIds=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
36
+                       geneSymbols=character(),              # ABF2, Rhox11
37
+@@ -22,27 +42,9 @@
38
+                       dataSources=character(),              # ScerTF, UniPROBE
39
+                       experimentTypes=character(),          # NA, protein-binding microarray
40
+                       pubmedIDs=character(),                # 19111667, 1858359
41
+-                      tfFamilies=character())               # NA, NA
42
++                      tfFamilies=character())
43
+ {  
44
+-  names (matrices) = names
45
+-  count = length (matrices)
46
+-  stopifnot (length (names) == count)
47
+-  stopifnot (length (providerNames) == count)
48
+-  stopifnot (length (providerIds) == count)
49
+-  stopifnot (length (geneSymbols) == count)
50
+-  stopifnot (length (geneIds) == count)
51
+-  stopifnot (length (geneIdTypes) == count)
52
+-  stopifnot (length (proteinIds) == count)
53
+-  stopifnot (length (proteinIdTypes) == count)
54
+-  stopifnot (length (sequenceCounts) == count)
55
+-  stopifnot (length (organisms) == count)
56
+-  stopifnot (length (bindingDomains) == count)
57
+-  stopifnot (length (bindingSequences) == count)
58
+-  stopifnot (length (dataSources) == count)
59
+-  stopifnot (length (experimentTypes) == count)
60
+-  stopifnot (length (pubmedIDs) == count)
61
+-  stopifnot (length (tfFamilies) == count)
62
+-
63
++  ## these are redundant with DataFrame validity method
64
+   elementMetadata = DataFrame (#name=names,
65
+                                providerName=providerNames,
66
+                                providerId=providerIds,
67
+@@ -63,45 +65,33 @@
68
+   
69
+ } # ctor
70
+ #------------------------------------------------------------------------------------------------------------------------
71
++## quiet vs. verbose
72
+ MotifDb = function (loadAllSources=TRUE, quiet=TRUE)
73
+ {
74
+   mdb <- MotifList ()
75
+   if (loadAllSources) {
76
+-    data.path = paste (system.file (package='MotifDb'), 'data', sep='/')
77
++    data.path = system.file(package="MotifDb", "data")
78
+     data.files = dir (data.path, full.names=TRUE)
79
+-    sources.to.exclude = c ('ScerTF.RData')   # pending minor revisions
80
+-    removers = as.integer (sapply (sources.to.exclude, function (source) grep (source, data.files)))
81
+-    if (length (removers) > 0)
82
+-      data.files = data.files [-removers]
83
+-  
84
+-    count = 0
85
+-    if (length (data.files) > 0)   
86
++    sources.to.exclude = "ScerTF.RData"   # pending minor revisions
87
++    data.files = data.files[!basename(data.files) %in% sources.to.exclude]
88
+       for (data.file in data.files) {
89
++        ## see globalVariables(tbl.md, matrices)
90
+         tbl.md = NA; matrices = NA;  # define these to keep 'check' happy.  they are loaded by 'load'
91
+-        variables = load (data.file)
92
+-        mdb = add (mdb, matrices, tbl.md)
93
+-        filename.tokens = strsplit (data.file, '\\/') [[1]]
94
+-        shortFilename = filename.tokens [length (filename.tokens)]
95
+-        if (!quiet) 
96
+-          print (noquote (sprintf ('added %s (%d) matrices, length now: %d', shortFilename, length (matrices), length (mdb))))
97
++        load (data.file)
98
++        mdb <- append(mdb, do.call("MotifList", c(matrices, tbl.md)))
99
++        if (!quiet)
100
++          message(sprintf ('added %s (%d) matrices, length now: %d',
101
++                           basename(data.file), length (matrices),
102
++                           length (mdb)))
103
+       } # for data.file
104
+   
105
+-    if (!quiet) {
106
++    if (!quiet)
107
+       print (table (mdb$dataSource))
108
+-      }
109
+     } # if loadAllSources
110
+ 
111
+-  return (mdb)
112
+-
113
++  mdb
114
+ } # MotifDb
115
+ #------------------------------------------------------------------------------------------------------------------------
116
+-setMethod ('length', signature = 'MotifList',
117
+-
118
+-  function (x) {
119
+-    return (length (x@listData))
120
+-    })
121
+-
122
+-#------------------------------------------------------------------------------------------------------------------------
123
+ setMethod ('subset', signature = 'MotifList',
124
+ 
125
+   function (x, subset, select, drop = FALSE, ...) {
126
+@@ -115,42 +105,26 @@
127
+         stop("'subset' must be coercible to logical")
128
+       i <- i & !is.na(i)
129
+       } # else
130
+-    return (x [i])
131
++    x [i]
132
+   })
133
+ 
134
+ #------------------------------------------------------------------------------------------------------------------------
135
++## 'add' is too general a name; implement as do.call + append, but
136
++## then not much added value so omit method?
137
+ setMethod ('add', signature = 'MotifList',
138
+ 
139
+   function (obj, matrices, tbl.metadata) { 
140
+ 
141
+-     full.names = names (matrices)   # full in this sense:  organism-source-providerName[-possibleUniquifier]
142
+-     size = length (matrices)
143
+-     empty.chars = rep (NA_character_, size)
144
+-     empty.ints = rep (NA_integer_, size)
145
+-     new.list = MotifList (matrices, 
146
+-                           names=full.names,
147
+-                           providerNames=tbl.metadata$providerName,                  # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
148
+-                           providerIds=tbl.metadata$providerId,                      # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
149
+-                           geneSymbols=tbl.metadata$geneSymbol,                      # ABF2, Rhox11n
150
+-                           geneIds=tbl.metadata$geneId,
151
+-                           geneIdTypes=tbl.metadata$geneIdType,
152
+-                           proteinIds=tbl.metadata$proteinId,
153
+-                           proteinIdTypes=tbl.metadata$proteinIdType,
154
+-                           sequenceCounts=tbl.metadata$sequenceCount,                # often NA
155
+-                           organisms=tbl.metadata$organism,                          # Scerevisiae, Mmusculus
156
+-                           bindingDomains=tbl.metadata$bindingDomain,                # NA, Homeo
157
+-                           bindingSequences=tbl.metadata$bindingSequence,    
158
+-                           dataSources=tbl.metadata$dataSource,                      # ScerTF, UniPROBE
159
+-                           experimentTypes=tbl.metadata$experimentType,              # NA, protein-binding microarray, SELEX, 
160
+-                           pubmedIDs=tbl.metadata$pubmedID,                          # 19111667, 1858359
161
+-                           tfFamilies=tbl.metadata$tfFamily)                         # NA
162
+-     obj@listData = c (obj@listData, new.list@listData)
163
+-     elementMetadata (obj) = rbind (elementMetadata (obj), elementMetadata (new.list))
164
+-     return (obj)
165
++     if (!is.list(matrices))            # allow solo matrix
166
++         matrices <- list(matrices)
167
++     ## 1:1 mapping between tbl.metadata (data.frame?) column names
168
++     ## and MotifDb elementMetadata
169
++     new.list <- do.call("MotifList", c(matrices, tbl.metadata))
170
++     append(obj, new.list)
171
+     })
172
+ 
173
+ #------------------------------------------------------------------------------------------------------------------------
174
+-setMethod("$", signature="MotifList", 
175
++setMethod("$", signature="MotifList",   # is this a good idea?
176
+ 
177
+   function(x, name) {
178
+     elementMetadata (x)[[name]]
179
+@@ -201,45 +175,38 @@
180
+ {
181
+   matrix.count = length (matrices)
182
+ 
183
+-    # incoming matrices have nucleotide rows, position columns.  meme format, however, requires position-rows, and nucleotide-columns
184
+-    # calculate the number of lines of text by counting columns
185
+-  total.transposed.matrix.rows = sum (as.integer (sapply (matrices, ncol)))
186
+-  predicted.line.count = 12 + (3 * length (matrices)) + total.transposed.matrix.rows
187
+-  s = vector ('character', predicted.line.count)
188
+-
189
++  ## incoming matrices have nucleotide rows, position columns.  meme
190
++  ## format, however, requires position-rows, and nucleotide-columns
191
++  ## calculate the number of lines of text by counting columns
192
+   #printf ("length of pre-allocated vector: %d", length (s))
193
+-  s [1] = 'MEME version 4'
194
+-  s [2] = ''
195
+-  s [3] = 'ALPHABET= ACGT'
196
+-  s [4] = ''
197
+-  s [5] = 'strands: + -'
198
+-  s [6] = ''
199
+-  s [7] = 'Background letter frequencies'
200
+-  s [8] = 'A 0.250 C 0.250 G 0.250 T 0.250 '
201
+-  s [9] = '' 
202
+ 
203
+-  index = 10
204
+-  for (name in names (matrices)) {
205
+-       # transpose the frequency matrix version of the incoming matrix, hence 'tfMat'
206
+-    tfMat = transformMatrixToMemeRepresentation (matrices [[name]])
207
+-       # meme output may be used by tomtom, which uses matrix names as part of image filenames. 
208
+-       # removed all file-system-unfriendly characters here
209
++  header = c(
210
++    'MEME version 4',
211
++    '',
212
++    'ALPHABET= ACGT',
213
++    '',
214
++    'strands: + -',
215
++    '',
216
++    'Background letter frequencies',
217
++    'A 0.250 C 0.250 G 0.250 T 0.250 ',
218
++    '')
219
++
220
++  tfStrings <- Map(function(name, tfMat) {
221
++    tfMat = transformMatrixToMemeRepresentation (tfMat)
222
++    ## meme output may be used by tomtom, which uses matrix names as
223
++    ## part of image filenames.  removed all file-system-unfriendly
224
++    ## characters here
225
+     fixed.name = gsub ('\\/', '_', name)
226
+-    s [index] = sprintf ('MOTIF %s', fixed.name)
227
+-    index = index + 1
228
+-    s [index] = sprintf ('letter-probability matrix: alength= 4 w= %d nsites= %d E=8.1e-020', nrow (tfMat), 45, 8.1e-020)
229
+-    index = index + 1
230
+-    for (r in 1:nrow (tfMat)) {
231
+-      new.row = sprintf (' %12.10f  %12.10f  %12.10f  %12.10f', tfMat [r,1], tfMat [r,2], tfMat [r,3], tfMat [r,4])
232
+-      s [index] = new.row
233
+-      index = index + 1
234
+-      }
235
+-    s [index] = ''
236
+-    index = index + 1
237
+-    } # for name
238
++    id <- sprintf ('MOTIF %s', fixed.name)
239
++    desc <-
240
++        sprintf ('letter-probability matrix: alength= 4 w= %d nsites= 45 E=8.1e-020',
241
++                 nrow (tfMat))
242
++    mat <- sprintf (' %12.10f  %12.10f  %12.10f  %12.10f',
243
++                    tfMat[,1], tfMat[,2], tfMat[,3], tfMat[,4])
244
++    c(id, desc, mat, "")
245
++  }, names(matricies), matricies)
246
+ 
247
+-  invisible (s)
248
+-
249
++  invisible(c(header, unlist(tfStrings)))
250
+ } # matrixToMemeText
251
+ #------------------------------------------------------------------------------------------------------------------------
252
+ # write to connection with specified format
253
+@@ -250,14 +217,11 @@
254
+ setMethod ('export',  signature=c(object='MotifList', con='character', format='character'),
255
+ 
256
+   function (object, con, format, ...) {
257
+-
258
+-    fmt = match.arg (tolower (format), c ('meme', 'transfac'))
259
+-    if (fmt %in% c ('meme', 'transfac')) {
260
+-      text = matrixToMemeText (object)
261
+-      }
262
++      ## do minimum work unique to this method, then dispatch to avoid
263
++      ## code duplication
264
+     con = file (con, 'w')
265
+-    cat (text, sep='\n', file=con)
266
+-    close (con)
267
++    on.exit(close(con))
268
++    export(object, con, format, ...)
269
+     })
270
+ 
271
+ #------------------------------------------------------------------------------------------------------------------------
272
+@@ -271,23 +235,16 @@
273
+   function (object, con, format, ...) {
274
+ 
275
+     fmt = match.arg (tolower (format), c ('meme', 'transfac'))
276
+-    if (fmt %in% c ('meme', 'transfac')) {
277
+-      text = matrixToMemeText (object)
278
+-      }
279
++    ## match.arg fails if !fmt %in% c('meme', 'transfac'), so no need
280
++    ## for test
281
++    ## let the user manage opened cons
282
++    if (!isOpen(con)) {
283
++        open(con)
284
++        on.exit(close(con))
285
++    }
286
+     cat (text, sep='\n', file=con)
287
+-    close (con)
288
+     })
289
+ 
290
+ #------------------------------------------------------------------------------------------------------------------------
291
+ # write to connection, using default format,  ??? for matrix list, tsv for metadata
292
+-setMethod ('export',  signature=c(object='MotifList',  con='missing', format='character'),
293
+-
294
+-  function (object, con, format,  ...) {
295
+-
296
+-    fmt = match.arg (tolower (format), c ('meme')) # , 'transfac'))
297
+-    if (fmt == 'meme') 
298
+-      text = matrixToMemeText (object)
299
+-    return (text)
300
+-    })
301
+-
302
+-#------------------------------------------------------------------------------------------------------------------------
303
++## already handled by rtracklayer?
0 304
new file mode 100644
... ...
@@ -0,0 +1,111 @@
1
+library (MotifDb)
2
+library (MotIV)
3
+library (seqLogo)
4
+library (RUnit)
5
+#-----------------------------------------------------------------------------------------
6
+demo.MotIV = function ()
7
+{
8
+  print ('--- demo.MotIV')
9
+  mdb = MotifDb (quiet=TRUE)
10
+  db.tmp = mdb@listData
11
+
12
+     # match motif 1 against the entire MotifDb  collection
13
+  motif.hits = motifMatch (db.tmp [1], database=db.tmp)
14
+
15
+     # the long way to extract the matrix name.  see MotIV.toTable below for more convenient way
16
+  checkEquals (motif.hits@bestMatch[[1]]@aligns[[1]]@TF@name, names (db.tmp)[1])
17
+
18
+     # jaspar, uniprobe and ScerTF contribute a total of 1035 matrices
19
+  checkTrue (length (mdb) >= 1035)
20
+
21
+  target = db.tmp [1035]
22
+  printf ('target matrix: %s', names (target))
23
+  motif.hits =  motifMatch (db.tmp [1035], database=db.tmp)
24
+  tbl.hits = MotIV.toTable (motif.hits)
25
+  checkEquals (names (db.tmp)[1035], tbl.hits [1, 'name'])   # first hit should be the target itself
26
+
27
+  return (tbl.hits)
28
+
29
+} # demo.MotIV
30
+#------------------------------------------------------------------------------------------------------------------------
31
+display.matrices= function ()
32
+{
33
+  mdb = MotifDb (quiet=TRUE)
34
+  printf ('--- %s', 'ScerTF-Scerevisiae-PDR8-badis')
35
+  print (subset (mdb, name=='ScerTF-Scerevisiae-PDR8-badis')[[1]])
36
+  printf ('--- %s', 'JASPAR_CORE-Scerevisiae-YDR520C-MA0422.1')
37
+  print (subset (mdb, name=='JASPAR_CORE-Scerevisiae-YDR520C-MA0422.1')[[1]])
38
+  printf ('--- %s', 'ScerTF-Scerevisiae-PDR8-badis')
39
+  print (subset (mdb, name=='ScerTF-Scerevisiae-PDR8-badis')[[1]])
40
+  printf ('--- %s', 'ScerTF-Scerevisiae-YLR278C-badis')
41
+  print (subset (mdb, name=='ScerTF-Scerevisiae-YLR278C-badis')[[1]])
42
+  printf ('--- %s', 'JASPAR_CORE-Scerevisiae-HAP1-MA0312.1')
43
+  print (subset (mdb, name=='JASPAR_CORE-Scerevisiae-HAP1-MA0312.1')[[1]])
44
+  printf ('--- %s', 'ScerTF-Scerevisiae-YKL222C-zhu')
45
+  print (subset (mdb, name=='ScerTF-Scerevisiae-YKL222C-zhu')[[1]])
46
+
47
+} # display.logos
48
+#------------------------------------------------------------------------------------------------------------------------
49
+display.logos = function ()
50
+{
51
+  mdb = MotifDb (quiet=TRUE)
52
+  par (mfrow=c (3,2))
53
+  seqLogo (subset (mdb, name=='ScerTF-Scerevisiae-PDR8-badis')[[1]])
54
+  #readline ()
55
+  seqLogo (subset (mdb, name=='JASPAR_CORE-Scerevisiae-YDR520C-MA0422.1')[[1]])
56
+  seqLogo (subset (mdb, name=='ScerTF-Scerevisiae-PDR8-badis')[[1]])
57
+  seqLogo (subset (mdb, name=='ScerTF-Scerevisiae-YLR278C-badis')[[1]])
58
+  seqLogo (subset (mdb, name=='JASPAR_CORE-Scerevisiae-HAP1-MA0312.1')[[1]])
59
+  seqLogo (subset (mdb, name=='ScerTF-Scerevisiae-YKL222C-zhu')[[1]])
60
+
61
+} # display.logos
62
+#------------------------------------------------------------------------------------------------------------------------
63
+MotIV.toTable = function (match)
64
+{
65
+  stopifnot (length (match@bestMatch) == 1)
66
+  alignments = match@bestMatch[[1]]@aligns
67
+
68
+  df = data.frame (stringsAsFactors=FALSE)
69
+  for (alignment in alignments) {
70
+    x = alignment
71
+    name = x@TF@name
72
+    eVal = x@evalue
73
+    sequence = x@sequence
74
+    match = x@match
75
+    strand = x@strand
76
+    df = rbind (df, data.frame (name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
77
+    } # for alignment
78
+
79
+  return (df)
80
+
81
+} # MotIV.toTable 
82
+#------------------------------------------------------------------------------------------------------------------------
83
+test.MotIV.toTable = function ()
84
+{
85
+  print ('--- test.MotIVtoTable')
86
+  mdb = MotifDb (quiet=TRUE)
87
+  test.hits = motifMatch (mdb[1]@listData, database=jaspar)
88
+  tbl.hits =  MotIV.toTable (test.hits)
89
+  checkEquals (dim (tbl.hits), c (5, 5))
90
+  checkEquals (colnames (tbl.hits), c ("name", "eVal", "sequence", "match", "strand"))
91
+
92
+} # test.MotIV.toTable 
93
+#------------------------------------------------------------------------------------------------------------------------
94
+demo.tomtom = function ()
95
+{
96
+  mdb = MotifDb (quiet=TRUE)
97
+  mdb.01 = mdb [1035]
98
+  max = length (mdb)
99
+
100
+  mdb.many = mdb [1:max]
101
+  export (mdb.01, 'mdb1.text', 'meme')
102
+  export (mdb.many, 'mdbMany.text', 'meme')
103
+     # find similarity of motif #1 to all the motifs in mdbMany
104
+  cmd = sprintf ('tomtom -no-ssc -oc ../tomtomTest -verbosity 3 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10 %s %s',
105
+                  'mdb1.text',  'mdbMany.text')
106
+  system (cmd)
107
+  cmd = 'open ../tomtomTest/tomtom.html'   # file.path, browseURL
108
+  system (cmd)
109
+
110
+} # demo.tomtom
111
+#------------------------------------------------------------------------------------------------------------------------
0 112
new file mode 100644
... ...
@@ -0,0 +1,304 @@
1
+Index: MotifList-class.R
2
+===================================================================
3
+--- MotifList-class.R	(revision 67138)
4
+@@ -4,10 +4,30 @@
5
+           prototype(elementType="matrix")
6
+           )
7
+ 
8
+-setGeneric ('add', signature='obj', function (obj, matrices, tbl.metadata) standardGeneric ('add'))
9
++setValidity("MotifList", function(object) {
10
++    msg <- NULL
11
++
12
++    ## what makes for a valid MotifList?
13
++    if (is.null(names(object)))
14
++        msg <- c(msg, "'names()' must not be NULL")
15
++    else if (any(duplicated(names(object))))
16
++        msg <- c(msg, "all 'names()' must be unique")
17
++    ## all(sapply(object, class) == "matrix") works for 0-length object
18
++    if (!all(sapply(object, class) == "matrix"))
19
++        msg <- c(msg, "all elements must be of class 'matrix'")
20
++    if (!all(sapply(object, nrow) == 4))
21
++        msg <- c(msg, "all elements must have 4 rows")
22
++    ok <- sapply(object, function(elt) all(colSums(elt) == 1))
23
++    if (!all(ok))
24
++        msg <- c(msg, "all elements must have colSums equal to 1")
25
++
26
++    if (is.null(msg)) TRUE else msg
27
++})
28
++
29
++setGeneric ('add', function (obj, matrices, tbl.metadata) standardGeneric ('add'),
30
++            signature='obj')
31
+ #------------------------------------------------------------------------------------------------------------------------
32
+ MotifList = function (matrices=list(),
33
+-                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
34
+                       providerNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
35
+                       providerIds=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
36
+                       geneSymbols=character(),              # ABF2, Rhox11
37
+@@ -22,27 +42,9 @@
38
+                       dataSources=character(),              # ScerTF, UniPROBE
39
+                       experimentTypes=character(),          # NA, protein-binding microarray
40
+                       pubmedIDs=character(),                # 19111667, 1858359
41
+-                      tfFamilies=character())               # NA, NA
42
++                      tfFamilies=character())
43
+ {  
44
+-  names (matrices) = names
45
+-  count = length (matrices)
46
+-  stopifnot (length (names) == count)
47
+-  stopifnot (length (providerNames) == count)
48
+-  stopifnot (length (providerIds) == count)
49
+-  stopifnot (length (geneSymbols) == count)
50
+-  stopifnot (length (geneIds) == count)
51
+-  stopifnot (length (geneIdTypes) == count)
52
+-  stopifnot (length (proteinIds) == count)
53
+-  stopifnot (length (proteinIdTypes) == count)
54
+-  stopifnot (length (sequenceCounts) == count)
55
+-  stopifnot (length (organisms) == count)
56
+-  stopifnot (length (bindingDomains) == count)
57
+-  stopifnot (length (bindingSequences) == count)
58
+-  stopifnot (length (dataSources) == count)
59
+-  stopifnot (length (experimentTypes) == count)
60
+-  stopifnot (length (pubmedIDs) == count)
61
+-  stopifnot (length (tfFamilies) == count)
62
+-
63
++  ## these are redundant with DataFrame validity method
64
+   elementMetadata = DataFrame (#name=names,
65
+                                providerName=providerNames,
66
+                                providerId=providerIds,
67
+@@ -63,45 +65,33 @@
68
+   
69
+ } # ctor
70
+ #------------------------------------------------------------------------------------------------------------------------
71
++## quiet vs. verbose
72
+ MotifDb = function (loadAllSources=TRUE, quiet=TRUE)
73
+ {
74
+   mdb <- MotifList ()
75
+   if (loadAllSources) {
76
+-    data.path = paste (system.file (package='MotifDb'), 'data', sep='/')
77
++    data.path = system.file(package="MotifDb", "data")
78
+     data.files = dir (data.path, full.names=TRUE)
79
+-    sources.to.exclude = c ('ScerTF.RData')   # pending minor revisions