Browse code

removed motifmatchr and its dependencies

paul-shannon authored on 02/04/2020 20:20:03
Showing 10 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,2 @@
1
+^doc$
2
+^Meta$
... ...
@@ -1,13 +1,13 @@
1 1
 Package: MotifDb
2 2
 Type: Package
3 3
 Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
-Version: 1.29.6
5
-Date: 2020-03-18
4
+Version: 1.29.8
5
+Date: 2020-04-02
6 6
 Author: Paul Shannon, Matt Richards
7 7
 Maintainer: Paul Shannon <pshannon@systemsbiology.org>
8 8
 Depends: R (>= 3.5.0), methods, BiocGenerics, S4Vectors, IRanges, GenomicRanges, Biostrings
9
-Suggests: RUnit
10
-Imports: rtracklayer, splitstackshape, TFBSTools, universalmotif, motifmatchr
9
+Suggests: RUnit, seqLogo, MotIV
10
+Imports: rtracklayer, splitstackshape
11 11
 Description: More than 9900 annotated position frequency matrices from 14 public sources, for multiple organisms.
12 12
 License: Artistic-2.0 | file LICENSE
13 13
 License_is_FOSS: no
... ...
@@ -4,7 +4,6 @@ exportMethods (
4 4
    export,
5 5
    show,
6 6
    query,
7
-   matchMotif,
8 7
    motifToGene,
9 8
    geneToMotif,
10 9
    associateTranscriptionFactors
... ...
@@ -14,11 +13,6 @@ export(
14 13
     MotifDb
15 14
 )
16 15
 
17
-import(motifmatchr)
18
-import(TFBSTools)
19
-importFrom(universalmotif, convert_motifs)
20
-
21
-
22 16
 import(BiocGenerics)
23 17
 import(S4Vectors)
24 18
 import(IRanges)
... ...
@@ -4,8 +4,8 @@ setGeneric('geneToMotif', signature='object', function(object, geneSymbols, sour
4 4
 setGeneric('associateTranscriptionFactors', signature='object',
5 5
            function(object, tbl.withMotifs,  source, expand.rows, motifColumnName="motifName")
6 6
               standardGeneric('associateTranscriptionFactors'))
7
-setGeneric('matchMotif', signature='object', function(object, motifs, genomeName, regions, pval.cutoff,
8
-                                                      fimoDataFrameStyle=FALSE) standardGeneric('matchMotif'))
7
+#setGeneric('matchMotif', signature='object', function(object, motifs, genomeName, regions, pval.cutoff,
8
+#                                                      fimoDataFrameStyle=FALSE) standardGeneric('matchMotif'))
9 9
 #------------------------------------------------------------------------------------------------------------------------
10 10
 setClass ('MotifList',
11 11
           contains='SimpleList',
... ...
@@ -64,29 +64,29 @@ MotifList = function (matrices=list(), tbl.metadata=data.frame ())
64 64
 
65 65
 } # ctor
66 66
 #-------------------------------------------------------------------------------
67
-setMethod('matchMotif', signature='MotifList',
68
-
69
-   function(object, motifs, genomeName, regions, pval.cutoff, fimoDataFrameStyle=FALSE){
70
-     motifs.pfmatrix <- lapply(motifs, function(motif) convert_motifs(motif, "TFBSTools-PFMatrix"))
71
-     motifs.pfmList <- do.call(PFMatrixList, motifs.pfmatrix)
72
-     gr.list <- matchMotifs(motifs.pfmList, regions, genome=genomeName, out="positions", p.cutoff=pval.cutoff)
73
-     result <- gr.list
74
-     if(fimoDataFrameStyle){
75
-        gr <- unlist(gr.list)
76
-        motif.names <- names(gr)
77
-        names(gr) <- NULL
78
-        tbl <- as.data.frame(gr)
79
-        tbl$motif_id <- motif.names
80
-        colnames(tbl)[1] <- "chrom"
81
-        tbl$chrom <- as.character(tbl$chrom)
82
-        colnames(tbl)[grep("score", colnames(tbl))] <- "mood.score"
83
-        new.order <- order(tbl$start, decreasing=FALSE)
84
-        tbl <- tbl[new.order,]
85
-        result <- tbl
86
-        }
87
-     return(result)
88
-     })
89
-
67
+# setMethod('matchMotif', signature='MotifList',
68
+#
69
+#    function(object, motifs, genomeName, regions, pval.cutoff, fimoDataFrameStyle=FALSE){
70
+#      motifs.pfmatrix <- lapply(motifs, function(motif) convert_motifs(motif, "TFBSTools-PFMatrix"))
71
+#      motifs.pfmList <- do.call(PFMatrixList, motifs.pfmatrix)
72
+#      gr.list <- motifmatchr::matchMotifs(motifs.pfmList, regions, genome=genomeName, out="positions", p.cutoff=pval.cutoff)
73
+#      result <- gr.list
74
+#      if(fimoDataFrameStyle){
75
+#         gr <- unlist(gr.list)
76
+#         motif.names <- names(gr)
77
+#         names(gr) <- NULL
78
+#         tbl <- as.data.frame(gr)
79
+#         tbl$motif_id <- motif.names
80
+#         colnames(tbl)[1] <- "chrom"
81
+#         tbl$chrom <- as.character(tbl$chrom)
82
+#         colnames(tbl)[grep("score", colnames(tbl))] <- "mood.score"
83
+#         new.order <- order(tbl$start, decreasing=FALSE)
84
+#         tbl <- tbl[new.order,]
85
+#         result <- tbl
86
+#         }
87
+#      return(result)
88
+#      })
89
+#
90 90
 #-------------------------------------------------------------------------------
91 91
 setMethod ('subset', signature = 'MotifList',
92 92
 
... ...
@@ -1,6 +1,6 @@
1 1
 quick: install
2 2
 
3
-all:  vig build check
3
+all:  install vig build check
4 4
 
5 5
 vig:
6 6
 	R -e "devtools::build_vignettes()"
... ...
@@ -36,8 +36,6 @@ runTests = function()
36 36
   test.export_memeFormatToFile()
37 37
   test.export_memeFormatToFileDuplication()
38 38
   test.export_memeFormatToFile_run_tomtom()
39
-  #test.MotIV.toTable()
40
-  #test.run_MotIV.motifMatch()
41 39
   test.flyFactorGeneSymbols()
42 40
   test.export_jasparFormatStdOut()
43 41
   test.export_jasparFormatToFile()
... ...
@@ -48,8 +46,6 @@ runTests = function()
48 46
   test.motifToGene()
49 47
   test.associateTranscriptionFactors()
50 48
 
51
-  test.match()
52
-
53 49
   test.hocomoco11.with.reliabilityScores()
54 50
 
55 51
 } # runTests
... ...
@@ -1102,69 +1098,71 @@ test.associateTranscriptionFactors <- function()
1102 1098
       # now some motif names
1103 1099
 } # test.associateTranscriptionFactors
1104 1100
 #------------------------------------------------------------------------------------------------------------------------
1105
-test.match <- function()
1106
-{
1107
-   printf("--- test.match")
1108
-   gr.region <- GRanges(seqnames="chr1", IRanges(start=47229520, end=47229560))
1109
-   motifs <- query(MotifDb, c("jaspar2018", "ZNF263"))
1110
-   checkEquals(length(motifs), 1)
1111
-   gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
1112
-   checkEquals(length(gr.match), 1)  # just one motif
1113
-   checkEquals(names(gr.match), names(motifs))
1114
-   checkEquals(length(gr.match[[1]]), 3)
1115
-
1116
-   tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
1117
-   checkEquals(dim(tbl.match), c(3, 7))
1118
-   checkTrue(all(tbl.match$motif == names(motifs)))
1119
-   checkEquals(class(tbl.match$chrom), "character")  # not a factor
1120
-
1121
-   motifs <- query(MotifDb, "ZNF263", c("jaspar2018", "swissregulon"))
1122
-   checkEquals(length(motifs), 2)
1123
-   gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
1124
-   checkEquals(names(gr.match), names(motifs))
1125
-   checkEquals(as.numeric(lapply(gr.match, length)), c(3, 1))
1126
-
1127
-   tbl.match <-matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
1128
-   checkEquals(dim(tbl.match), c(4, 7))
1129
-   checkEquals(length(unique(tbl.match$motif)), 2)
1130
-   checkEquals(unique(tbl.match$motif), names(motifs))
1131
-   checkEquals(colnames(tbl.match), c("chrom", "start", "end", "width", "strand", "mood.score", "motif_id"))
1132
-
1133
-
1134
-        #------------------------------------------------
1135
-        # now all jaspar2018 human motifs
1136
-        #------------------------------------------------
1137
-
1138
-   motifs <- query(MotifDb, c("jaspar2018", "hsapiens"))
1139
-   tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
1140
-   checkEquals(dim(tbl.match), c(7, 7))
1141
-   checkEquals(sort(unique(tbl.match$motif)),
1142
-               c("Hsapiens-jaspar2018-EWSR1-FLI1-MA0149.1", "Hsapiens-jaspar2018-ZNF263-MA0528.1"))
1143
-
1144
-        #-----------------------------------------------------
1145
-        # now all jaspar2018 human motifs, loosen the pValue
1146
-        #-----------------------------------------------------
1147
-
1148
-   motifs <- query(MotifDb, c("jaspar2018", "hsapiens"))
1149
-   tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-4, fimoDataFrameStyle=TRUE)
1150
-   checkTrue(nrow(tbl.match) > 15)
1151
-
1152
-   tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-3, fimoDataFrameStyle=TRUE)
1153
-   checkTrue(nrow(tbl.match) > 50)
1154
-
1155
-       #-------------------------------------------------------------
1156
-       # now all jaspar2018 and hocomoco human motifs across 10kb
1157
-       #------------------------------------------------------------
1158
-
1159
-   motifs <- query(MotifDb, "hsapiens", orStrings=c("jaspar2018", "hocomoco-core"))
1160
-   checkTrue(length(motifs) > 500)
1161
-   gr.region <- GRanges(seqnames="chr1", IRanges(start=47229000, end=47239000))
1162
-
1163
-   tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-7, fimoDataFrameStyle=TRUE)
1164
-   checkTrue(nrow(tbl.match) > 90 && nrow(tbl.match) < 110)
1165
-   checkEquals(order(tbl.match$start), seq_len(nrow(tbl.match)))
1166
-
1167
-} # test.match
1101
+# disabled (2 apr 2020) due to very large (~100?) dependencies introduced directly and indirectly
1102
+# via motifmatchr, TFBSTools, universalmotif
1103
+# test.match <- function()
1104
+# {
1105
+#    printf("--- test.match")
1106
+#    gr.region <- GRanges(seqnames="chr1", IRanges(start=47229520, end=47229560))
1107
+#    motifs <- query(MotifDb, c("jaspar2018", "ZNF263"))
1108
+#    checkEquals(length(motifs), 1)
1109
+#    gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
1110
+#    checkEquals(length(gr.match), 1)  # just one motif
1111
+#    checkEquals(names(gr.match), names(motifs))
1112
+#    checkEquals(length(gr.match[[1]]), 3)
1113
+#
1114
+#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
1115
+#    checkEquals(dim(tbl.match), c(3, 7))
1116
+#    checkTrue(all(tbl.match$motif == names(motifs)))
1117
+#    checkEquals(class(tbl.match$chrom), "character")  # not a factor
1118
+#
1119
+#    motifs <- query(MotifDb, "ZNF263", c("jaspar2018", "swissregulon"))
1120
+#    checkEquals(length(motifs), 2)
1121
+#    gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
1122
+#    checkEquals(names(gr.match), names(motifs))
1123
+#    checkEquals(as.numeric(lapply(gr.match, length)), c(3, 1))
1124
+#
1125
+#    tbl.match <-matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
1126
+#    checkEquals(dim(tbl.match), c(4, 7))
1127
+#    checkEquals(length(unique(tbl.match$motif)), 2)
1128
+#    checkEquals(unique(tbl.match$motif), names(motifs))
1129
+#    checkEquals(colnames(tbl.match), c("chrom", "start", "end", "width", "strand", "mood.score", "motif_id"))
1130
+#
1131
+#
1132
+#         #------------------------------------------------
1133
+#         # now all jaspar2018 human motifs
1134
+#         #------------------------------------------------
1135
+#
1136
+#    motifs <- query(MotifDb, c("jaspar2018", "hsapiens"))
1137
+#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
1138
+#    checkEquals(dim(tbl.match), c(7, 7))
1139
+#    checkEquals(sort(unique(tbl.match$motif)),
1140
+#                c("Hsapiens-jaspar2018-EWSR1-FLI1-MA0149.1", "Hsapiens-jaspar2018-ZNF263-MA0528.1"))
1141
+#
1142
+#         #-----------------------------------------------------
1143
+#         # now all jaspar2018 human motifs, loosen the pValue
1144
+#         #-----------------------------------------------------
1145
+#
1146
+#    motifs <- query(MotifDb, c("jaspar2018", "hsapiens"))
1147
+#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-4, fimoDataFrameStyle=TRUE)
1148
+#    checkTrue(nrow(tbl.match) > 15)
1149
+#
1150
+#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-3, fimoDataFrameStyle=TRUE)
1151
+#    checkTrue(nrow(tbl.match) > 50)
1152
+#
1153
+#        #-------------------------------------------------------------
1154
+#        # now all jaspar2018 and hocomoco human motifs across 10kb
1155
+#        #------------------------------------------------------------
1156
+#
1157
+#    motifs <- query(MotifDb, "hsapiens", orStrings=c("jaspar2018", "hocomoco-core"))
1158
+#    checkTrue(length(motifs) > 500)
1159
+#    gr.region <- GRanges(seqnames="chr1", IRanges(start=47229000, end=47239000))
1160
+#
1161
+#    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-7, fimoDataFrameStyle=TRUE)
1162
+#    checkTrue(nrow(tbl.match) > 90 && nrow(tbl.match) < 110)
1163
+#    checkEquals(order(tbl.match$start), seq_len(nrow(tbl.match)))
1164
+#
1165
+# } # test.match
1168 1166
 #------------------------------------------------------------------------------------------------------------------------
1169 1167
 findMotifsWithMutuallyExclusiveMappings <- function()
1170 1168
 {
1171 1169
deleted file mode 100644
... ...
@@ -1,45 +0,0 @@
1
-\name{matchMotif}
2
-\alias{matchMotif,MotifList-method}
3
-\alias{matchMotif}
4
-\title{matchMotif}
5
-\description{
6
-A very general search tool, returning all matrices whose metadata, in ANY
7
-column, is matched by the query string.
8
-}
9
-\usage{
10
-\S4method{matchMotif}{MotifList}(object, motifs, genomeName, regions, pval.cutoff, fimoDataFrameStyle=FALSE)
11
-}
12
-\arguments{
13
-  \item{object}{a \code{MotifList} object.}
14
-  \item{motifs}{a \code{MotifList} typically the result of a MotifDb query}
15
-  \item{genomeName}{a \code{character} string vector, e.g. "hg38" or "mm10"}
16
-  \item{regions}{a \code{GRange} specifies regions in which to try to match the motifs}
17
-  \item{pval.cutoff}{a \code{numeric} value, for example, 1e-5}
18
-  \item{fimoDataFrameStyle}{a \code{logical} value, default FALSE}
19
-
20
-}
21
-
22
-\value{
23
-A list of the matrices
24
-}
25
-\author{Paul Shannon}
26
-
27
-\examples{
28
-   motifs <- query(MotifDb, "ZNF263", c("jaspar2018", "swissregulon"))
29
-   gr.region <- GRanges(seqnames="chr1", IRanges(start=47229520, end=47229560))
30
-   gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
31
-   tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
32
-}
33
-\seealso{
34
-  MotifDb,
35
-  query,
36
-  subset,
37
-  export,
38
-  flyFactorSurvey,
39
-  hPDI,
40
-  jaspar,
41
-  ScerTF,
42
-  uniprobe
43
-}
44
-
45
-\keyword{utilities}
46 0
deleted file mode 100644
47 1
Binary files a/vignettes/MotifDb-egr1.pdf and /dev/null differ
... ...
@@ -272,119 +272,153 @@ seqLogo (as.list (egr1.motif)[[1]])
272 272
 \end{figure}
273 273
 
274 274
 \section{Motif Matching}
275
-We will look for the ten position frequency matrices which are the best match to JASPAR's mouse EGR1, using
276
-the MotIV package.  We actually request the top eleven hits from the entire MotifDb, since the first hit
277
-should be the target matrix itself, since that is of necessity found in the full MotifDb.
278
-
279
-<<motifmatch>>=
280
-egr1.hits <- motifMatch (as.list (egr1.motif) [1], as.list (MotifDb), top=11)
281
-# 'MotIV.toTable' -- defined above (and hidden) -- will become part of MotIV in the upcoming release
282
-tbl.hits <- MotIV.toTable (egr1.hits)
283
-print (tbl.hits)
284
-@
275
+Note: this section is disabled for now (2 April 2020) due to the
276
+breakage and perhaps end-of-life status of the Bioconductor package MotIV.
285 277
 
286
-The \emph{sequence} column in this table is the \emph{consensus sequence} -- with heterogeneity left out -- for the
287
-matrix it describes.
278
+An alternative Bioconductor package for motif matching is motifmatchr,
279
+which uses the C++ MOODS library.  Unfortunately, the inclusion of
280
+motifmatchr and supporting packages needed to render it interoperable
281
+with MotifDb requires the further inclusion of very many dependencies.
288 282
 
289
-\vspace{10 mm}
283
+For that reason, we present only non-executable code here:
290 284
 
291
-\textbf{\emph{Puzzling: the strand of the match reported above is opposite of what I expected, and opposite of what seqLogo displays.
292
-  This is a question for the MotIV developers.}}
285
+<<motifmatchr, eval=FALSE>>=
293 286
 
294
-\vspace{10 mm}
287
+ gr.region <- GRanges(seqnames="chr1", IRanges(start=47229520, end=47229560))
288
+ motifs <- query(MotifDb, c("jaspar2018", "ZNF263"))
295 289
 
296
-The six logos appear below, beginning with the logo of the query matrix, \emph{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}, including
297
-two other mouse matrices, and two zinc-finger fly matrices.  Examining the three mouse matrices and their metadata reveals that
298
-all three (geneSymbol differences aside) describe the same protein:
299
-<<three.mice.metadata>>=
300
-if (interactive ())
301
-  noquote (t (as.data.frame (subset (values (MotifDb), geneId=='13653'))))
302
-@
303
-Zinc finger protein domains are classified into many \emph{fold groups}; their respective cognate DNA sequence may classify similarly.
304
-That two fly matrices significantly match three reports of the mouse Egr1 motif suggests impressive conservation of this
305
-binding pattern, or convergent evolution.
306
-
307
-Let us look at the metadata for the first fly match, whose geneId is \textbf{FBgn0003499}:
308
-<<fly.Sr.metadata>>=
309
-noquote (t (as.data.frame (values (MotifDb)[grep ('FBgn0003499', values (MotifDb)$geneId),])))
310
-@ Note that the SANGER motif, based on 18 sequences, had a high fidelity match to mouse Egr1 (see above, 10e-12), but
311
-that the SOLEXA motif, based upon 2316 sequences, did not (in work not shown, it appears 22nd in the an expanded
312
-motifMatch hit list, with a eval of 10e-5).  It is possible that the SOLEXA motif is more accurate, and that a close
313
-examination of this case, including sequence logos, position frequency matrices, and the search parameters of
314
-motifMatch, will be instructive.  Repeating the search with \emph{tomtom} might also be illuminating -- either as
315
-confirmation of MotIV and the default parameterization we used, or as a correction to it.  Here we see the facilities
316
-for exploratory data analysis MotifDb provides, and the opportunities for data analysis which result.
317
-
318
-
319
-<<logo1, fig=TRUE, include=FALSE, echo=FALSE>>=
320
-  seqLogo (MotifDb [[tbl.hits$name[1]]])
321
-@
290
+ motifs.pfmatrix <- lapply(motifs,
291
+                           function(motif) convert_motifs(motif, "TFBSTools-PFMatrix"))
322 292
 
323
-<<logo2, fig=TRUE, include=FALSE, echo=FALSE>>=
324
-  seqLogo (MotifDb [[tbl.hits$name[2]]])
325
-@
293
+ motifs.pfmList <- do.call(PFMatrixList, motifs.pfmatrix)
294
+ gr.list <- motifmatchr::matchMotifs(motifs.pfmList, regions, genome=genomeName,
295
+                                     out="positions", p.cutoff=1e-5)
296
+ gr <- unlist(gr.list)
297
+ motif.names <- names(gr)
298
+ names(gr) <- NULL
299
+ tbl <- as.data.frame(gr)
300
+ tbl$motif_id <- motif.names
301
+ colnames(tbl)[1] <- "chrom"
302
+ tbl$chrom <- as.character(tbl$chrom)
303
+ colnames(tbl)[grep("score", colnames(tbl))] <- "mood.score"
304
+ new.order <- order(tbl$start, decreasing=FALSE)
305
+ tbl <- tbl[new.order,]
326 306
 
327
-<<logo3, fig=TRUE, include=FALSE, echo=FALSE>>=
328
-  seqLogo (MotifDb [[tbl.hits$name[3]]])
329 307
 @
330 308
 
331
-<<logo4, fig=TRUE, include=FALSE, echo=FALSE>>=
332
-  seqLogo (MotifDb [[tbl.hits$name[4]]])
333
-@
309
+%% We will look for the ten position frequency matrices which are the best match to JASPAR's mouse EGR1, using
310
+%% the MotIV package.  We actually request the top eleven hits from the entire MotifDb, since the first hit
311
+%% should be the target matrix itself, since that is of necessity found in the full MotifDb.
334 312
 
335
-<<logo5, fig=TRUE, include=FALSE, echo=FALSE>>=
336
-  seqLogo (MotifDb [[tbl.hits$name[5]]])
337
-@
338
-
339
-<<logo6, fig=TRUE, include=FALSE, echo=FALSE>>=
340
-  seqLogo (MotifDb [[tbl.hits$name[6]]])
341
-@
342
-
343
-\begin{figure}[htpb!]
344
-  \centering
345
-  \begin{subfigure}[b]{0.38\textwidth}
346
-    \includegraphics[width=\textwidth]{MotifDb-logo1}
347
-    \caption{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}
348
-    \label{fig:Egr1-MA0162.1}
349
-    \end{subfigure}%
350
-  \begin{subfigure}[b]{0.38\textwidth}
351
-    \includegraphics[width=\textwidth]{MotifDb-logo2}
352
-    \caption{Dme-FFS-sr\_SANGER\_5\_FBgn0003499\\(abbreviated)}
353
-    \label{fig:Egr1-logo2}
354
-    \end{subfigure}%
355
-\end{figure}
356
-
357
-\begin{figure}[htpb!]
358
-  \centering
359
-  \begin{subfigure}[b]{0.38\textwidth}
360
-    \includegraphics[width=\textwidth]{MotifDb-logo3}
361
-    \caption{Mmusculus-UniPROBE-Zif268.UP00400}
362
-    \label{fig:Egr1-logo3}
363
-    \end{subfigure}%
364
-  \begin{subfigure}[b]{0.38\textwidth}
365
-    \includegraphics[width=\textwidth]{MotifDb-logo4}
366
-    \caption{Dme-FFS-klu\_SANGER\_10\_FBgn0013469}
367
-    \label{fig:Egr1-logo4}
368
-    \end{subfigure}%
369
-\end{figure}
370
-
371
-
372
-\begin{figure}[htpb!]
373
-  \centering
374
-  \begin{subfigure}[b]{0.38\textwidth}
375
-    \includegraphics[width=\textwidth]{MotifDb-logo5}
376
-    \caption{Mmusculus-UniPROBE-Egr1.UP00007}
377
-    \label{fig:Egr1-logo5}
378
-    \end{subfigure}%
379
-  \begin{subfigure}[b]{0.38\textwidth}
380
-    \centering
381
-    \includegraphics[width=\textwidth]{MotifDb-logo6}
382
-    \caption{Dme-FFS-klu\_SOLEXA\_5\_FBgn0013469}
383
-    \label{fig:Egr1-logo6}
384
-    \end{subfigure}%
385
-\end{figure}
386
-
387
-\newpage
313
+%% <<motifmatch>>=
314
+%% egr1.hits <- motifMatch (as.list (egr1.motif) [1], as.list (MotifDb), top=11)
315
+%% # 'MotIV.toTable' -- defined above (and hidden) -- will become part of MotIV in the upcoming release
316
+%% tbl.hits <- MotIV.toTable (egr1.hits)
317
+%% print (tbl.hits)
318
+%% @
319
+%%
320
+%% The \emph{sequence} column in this table is the \emph{consensus sequence} -- with heterogeneity left out -- for the
321
+%% matrix it describes.
322
+%%
323
+%% \vspace{10 mm}
324
+%%
325
+%% \textbf{\emph{Puzzling: the strand of the match reported above is opposite of what I expected, and opposite of what seqLogo displays.
326
+%%   This is a question for the MotIV developers.}}
327
+%%
328
+%% \vspace{10 mm}
329
+%%
330
+%% The six logos appear below, beginning with the logo of the query matrix, \emph{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}, including
331
+%% two other mouse matrices, and two zinc-finger fly matrices.  Examining the three mouse matrices and their metadata reveals that
332
+%% all three (geneSymbol differences aside) describe the same protein:
333
+%% <<three.mice.metadata>>=
334
+%% if (interactive ())
335
+%%   noquote (t (as.data.frame (subset (values (MotifDb), geneId=='13653'))))
336
+%% @
337
+%% Zinc finger protein domains are classified into many \emph{fold groups}; their respective cognate DNA sequence may classify similarly.
338
+%% That two fly matrices significantly match three reports of the mouse Egr1 motif suggests impressive conservation of this
339
+%% binding pattern, or convergent evolution.
340
+%%
341
+%% Let us look at the metadata for the first fly match, whose geneId is \textbf{FBgn0003499}:
342
+%% <<fly.Sr.metadata>>=
343
+%% noquote (t (as.data.frame (values (MotifDb)[grep ('FBgn0003499', values (MotifDb)$geneId),])))
344
+%% @ Note that the SANGER motif, based on 18 sequences, had a high fidelity match to mouse Egr1 (see above, 10e-12), but
345
+%% that the SOLEXA motif, based upon 2316 sequences, did not (in work not shown, it appears 22nd in an expanded
346
+%% motifMatch hit list, with a eval of 10e-5).  It is possible that the SOLEXA motif is more accurate, and that a close
347
+%% examination of this case, including sequence logos, position frequency matrices, and the search parameters of
348
+%% motifMatch, will be instructive.  Repeating the search with \emph{tomtom} might also be illuminating -- either as
349
+%% confirmation of MotIV and the default parameterization we used, or as a correction to it.  Here we see the facilities
350
+%% for exploratory data analysis MotifDb provides, and the opportunities for data analysis which result.
351
+%%
352
+%%
353
+%% <<logo1, fig=TRUE, include=FALSE, echo=FALSE>>=
354
+%%   seqLogo (MotifDb [[tbl.hits$name[1]]])
355
+%% @
356
+%%
357
+%% <<logo2, fig=TRUE, include=FALSE, echo=FALSE>>=
358
+%%   seqLogo (MotifDb [[tbl.hits$name[2]]])
359
+%% @
360
+%%
361
+%% <<logo3, fig=TRUE, include=FALSE, echo=FALSE>>=
362
+%%   seqLogo (MotifDb [[tbl.hits$name[3]]])
363
+%% @
364
+%%
365
+%% <<logo4, fig=TRUE, include=FALSE, echo=FALSE>>=
366
+%%   seqLogo (MotifDb [[tbl.hits$name[4]]])
367
+%% @
368
+%%
369
+%% <<logo5, fig=TRUE, include=FALSE, echo=FALSE>>=
370
+%%   seqLogo (MotifDb [[tbl.hits$name[5]]])
371
+%% @
372
+%%
373
+%% <<logo6, fig=TRUE, include=FALSE, echo=FALSE>>=
374
+%%   seqLogo (MotifDb [[tbl.hits$name[6]]])
375
+%% @
376
+%%
377
+%% \begin{figure}[htpb!]
378
+%%   \centering
379
+%%   \begin{subfigure}[b]{0.38\textwidth}
380
+%%     \includegraphics[width=\textwidth]{MotifDb-logo1}
381
+%%     \caption{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}
382
+%%     \label{fig:Egr1-MA0162.1}
383
+%%     \end{subfigure}%
384
+%%   \begin{subfigure}[b]{0.38\textwidth}
385
+%%     \includegraphics[width=\textwidth]{MotifDb-logo2}
386
+%%     \caption{Dme-FFS-sr\_SANGER\_5\_FBgn0003499\\(abbreviated)}
387
+%%     \label{fig:Egr1-logo2}
388
+%%     \end{subfigure}%
389
+%% \end{figure}
390
+%%
391
+%% \begin{figure}[htpb!]
392
+%%   \centering
393
+%%   \begin{subfigure}[b]{0.38\textwidth}
394
+%%     \includegraphics[width=\textwidth]{MotifDb-logo3}
395
+%%     \caption{Mmusculus-UniPROBE-Zif268.UP00400}
396
+%%     \label{fig:Egr1-logo3}
397
+%%     \end{subfigure}%
398
+%%   \begin{subfigure}[b]{0.38\textwidth}
399
+%%     \includegraphics[width=\textwidth]{MotifDb-logo4}
400
+%%     \caption{Dme-FFS-klu\_SANGER\_10\_FBgn0013469}
401
+%%     \label{fig:Egr1-logo4}
402
+%%     \end{subfigure}%
403
+%% \end{figure}
404
+%%
405
+%%
406
+%% \begin{figure}[htpb!]
407
+%%   \centering
408
+%%   \begin{subfigure}[b]{0.38\textwidth}
409
+%%     \includegraphics[width=\textwidth]{MotifDb-logo5}
410
+%%     \caption{Mmusculus-UniPROBE-Egr1.UP00007}
411
+%%     \label{fig:Egr1-logo5}
412
+%%     \end{subfigure}%
413
+%%   \begin{subfigure}[b]{0.38\textwidth}
414
+%%     \centering
415
+%%     \includegraphics[width=\textwidth]{MotifDb-logo6}
416
+%%     \caption{Dme-FFS-klu\_SOLEXA\_5\_FBgn0013469}
417
+%%     \label{fig:Egr1-logo6}
418
+%%     \end{subfigure}%
419
+%% \end{figure}
420
+%%
421
+%% \newpage
388 422
 
389 423
 \section{Exporting to the MEME Suite}
390 424
 Some users of this package may wish to export the data -- both matrices and metadata -- so that they may be used in
391 425
deleted file mode 100644
... ...
@@ -1,1205 +0,0 @@
1
-\documentclass{article}
2
-%% %\VignetteIndexEntry{MotifDb Overview}
3
-%% %\VignettePackage{MotifDb}
4
-\usepackage[noae]{Sweave}
5
-\usepackage[left=0.5in,top=0.5in,right=0.5in,bottom=0.75in,nohead,nofoot]{geometry}
6
-\usepackage{hyperref}
7
-\usepackage[noae]{Sweave}
8
-\usepackage{color}
9
-\usepackage{graphicx}
10
-\usepackage{caption}
11
-\usepackage{subcaption}
12
-
13
-\definecolor{Blue}{rgb}{0,0,0.5}
14
-\definecolor{Green}{rgb}{0,0.5,0}
15
-
16
-\RecustomVerbatimEnvironment{Sinput}{Verbatim}{%
17
-  xleftmargin=1em,%
18
-  fontsize=\small,%
19
-  fontshape=sl,%
20
-  formatcom=\color{Blue}%
21
-  }
22
-\RecustomVerbatimEnvironment{Soutput}{Verbatim}{%
23
-  xleftmargin=0em,%
24
-  fontsize=\scriptsize,%
25
-  formatcom=\color{Blue}%
26
-  }
27
-\RecustomVerbatimEnvironment{Scode}{Verbatim}{xleftmargin=2em}
28
-
29
-
30
-
31
-\renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}}
32
-\fvset{listparameters={\setlength{\topsep}{6pt}}}
33
-% These determine the rules used to place floating objects like figures
34
-% They are only guides, but read the manual to see the effect of each.
35
-\renewcommand{\topfraction}{.99}
36
-\renewcommand{\bottomfraction}{.99}
37
-\renewcommand{\textfraction}{0.0}
38
-
39
-\title{MotifDb}
40
-\author{Paul Shannon}
41
-
42
-\begin{document}
43
-
44
-\maketitle
45
-\begin{abstract}
46
-Many kinds of biological activity are regulated by the binding of proteins to their cognate
47
-substrates.  Of particular interest is the sequence-specific binding of transcription factors to DNA, often in
48
-regulatory regions just upstream of the transcription start site of a gene.  These binding events play a pivotal
49
-role in regulating gene expression.  Sequence specificity among closely related binding sites is nearly always incomplete: some variety
50
-in the DNA sequence is routinely observed.  For this reason, these inexact binding sequence patterns are commonly
51
-described as \emph{motifs} represented numerically as frequency matrices, and visualized as sequence logos.  Despite their importance
52
-in current research, there has been until now no single, annotated, comprehensive collection of publicly available motifs.
53
-The current package provides such a collection, offering more than ten thousand annotated matrices from multiple organisms, within the
54
-context of the Bioconductor project.  The matrices can be filtered and selected on the basis of their metadata, used with other
55
-Bioconductor packages (MotIV for motif comparison, seqLogo for visualization) or easily exported for use with
56
-standard software and websites such as those provided by the MEME Suite\footnote{http://meme.sdsc.edu/meme/doc/meme.html}.
57
-\end{abstract}
58
-
59
-\tableofcontents
60
-
61
-\section{Introduction and Basic Operations}
62
-
63
-The first step is to load the necessary packages:
64
-
65
-\begin{Schunk}
66
-\begin{Sinput}
67
-> library (MotifDb)
68
-> library (MotIV)
69
-> library (seqLogo)
70
-\end{Sinput}
71
-\end{Schunk}
72
-
73
-
74
-%% MotifDb provides two kinds of loosely linked data:  position frequency matrices, and metadata about each matrix.  The matrix
75
-%% names, and the rownames of the metadata table, are identical, so it is easy to map back and forth between
76
-%% the two.  Some measure of convenience is gained by extracting these two kinds of data into separate variables,
77
-%% as we shall see.  The cost in extra memory should not be significant.
78
-%%
79
-%% <<all.matrices>>=
80
-%% matrices.all = as.list (MotifDb)
81
-%% metadata <- values (MotifDb)
82
-%% @
83
-There are more than ten thousand matrices, from twenty-one sources:
84
-\begin{Schunk}
85
-\begin{Sinput}
86
-> length (MotifDb)
87
-\end{Sinput}
88
-\begin{Soutput}
89
-[1] 10701
90
-\end{Soutput}
91
-\begin{Sinput}
92
-> sort (table (values (MotifDb)$dataSource), decreasing=TRUE)
93
-\end{Sinput}
94
-\begin{Soutput}
95
-        jaspar2018         jaspar2016        HOCOMOCOv10         cisbp_1.02 
96
-              1564               1209               1066                874 
97
-         jolma2013       SwissRegulon            stamlab    FlyFactorSurvey 
98
-               843                684                683                614 
99
-       JASPAR_2014        JASPAR_CORE               hPDI           UniPROBE 
100
-               592                459                437                380 
101
-             HOMER HOCOMOCOv11-full-D             ScerTF HOCOMOCOv11-core-A 
102
-               332                290                196                181 
103
-HOCOMOCOv11-core-C HOCOMOCOv11-core-B HOCOMOCOv11-full-A HOCOMOCOv11-full-B 
104
-               135                 84                 46                 19 
105
-HOCOMOCOv11-full-C 
106
-                13 
107
-\end{Soutput}
108
-\end{Schunk}
109
-And dozens of organisms (though the majority of the matrices come from just four):
110
-\begin{Schunk}
111
-\begin{Sinput}
112
-> sort (table (values (MotifDb)$organism), decreasing=TRUE)
113
-\end{Sinput}
114
-\begin{Soutput}
115
-                                                                     Hsapiens 
116
-                                                                         5384 
117
-                                                                    Mmusculus 
118
-                                                                         1411 
119
-                                                                Dmelanogaster 
120
-                                                                         1287 
121
-                                                                  Scerevisiae 
122
-                                                                         1051 
123
-                                                                    Athaliana 
124
-                                                                          803 
125
-                                                                     Celegans 
126
-                                                                           90 
127
-                                                                           NA 
128
-                                                                           40 
129
-                                                                  Rnorvegicus 
130
-                                                                           35 
131
-                                                                  Pfalciparum 
132
-                                                                           28 
133
-                                                                        Zmays 
134
-                                                                           27 
135
-                                                                   Vertebrata 
136
-                                                                           18 
137
-                                                                      Ncrassa 
138
-                                                                           15 
139
-                                                                     Psativum 
140
-                                                                           13 
141
-                                                                       Amajus 
142
-                                                                           12 
143
-                                                                  Ddiscoideum 
144
-                                                                            9 
145
-                                                                    Anidulans 
146
-                                                                            8 
147
-                                                                      Ggallus 
148
-                                                                            8 
149
-                                                                      Ppatens 
150
-                                                                            7 
151
-                                                                      Xlaevis 
152
-                                                                            7 
153
-                                               Mmusculus;Rnorvegicus;Hsapiens 
154
-                                                                            6 
155
-                                                                      Osativa 
156
-                                                                            5 
157
-                                                                     Hroretzi 
158
-                                                                            4 
159
-                                                                     Hvulgare 
160
-                                                                            4 
161
-                                                                   Ocuniculus 
162
-                                                                            4 
163
-                                                                     Phybrida 
164
-                                                                            4 
165
-                                                                      Rrattus 
166
-                                                                            4 
167
-                                                                    Taestivam 
168
-                                                                            4 
169
-                                                                       Drerio 
170
-                                                                            3 
171
-                                                                       Gallus 
172
-                                                                            3 
173
-                                                           Mmusculus;Hsapiens 
174
-                                                                            3 
175
-                                                                  Bdistachyon 
176
-                                                                            2 
177
-                                                                      Cparvum 
178
-                                                                            2 
179
-                                                                      Csativa 
180
-                                                                            2 
181
-                                                        Mmusculus;Rnorvegicus 
182
-                                                                            2 
183
-Mmusculus;Rnorvegicus;Xlaevis;Stropicalis;Ggallus;Hsapiens;Btaurus;Ocuniculus 
184
-                                                                            2 
185
-                                                                         Nsp. 
186
-                                                                            2 
187
-                                                                  Nsylvestris 
188
-                                                                            2 
189
-                                                                       Otauri 
190
-                                                                            2 
191
-                                                                Acarolinensis 
192
-                                                                            1 
193
-                                                                       Apisum 
194
-                                                                            1 
195
-                                                                     Aterreus 
196
-                                                                            1 
197
-                                                                   Gaculeatus 
198
-                                                                            1 
199
-                                                                  Hcapsulatum 
200
-                                                                            1 
201
-                                                                   Mdomestica 
202
-                                                                            1 
203
-                                                                   Mgallopavo 
204
-                                                                            1 
205
-                                                                     Mmurinus 
206
-                                                                            1 
207
-                                    Mmusculus;Rnorvegicus;Hsapiens;Ocuniculus 
208
-                                                                            1 
209
-                               Mmusculus;Rnorvegicus;Omykiss;Ggallus;Hsapiens 
210
-                                                                            1 
211
-                                        Mmusculus;Rrattus;Hsapiens;Ocuniculus 
212
-                                                                            1 
213
-                                                                  Mtruncatula 
214
-                                                                            1 
215
-                                                                     Ngruberi 
216
-                                                                            1 
217
-                                                                Nhaematococca 
218
-                                                                            1 
219
-                                                                   Nvectensis 
220
-                                                                            1 
221
-                                                                    Pcapensis 
222
-                                                                            1 
223
-                                                                    Ppygmaeus 
224
-                                                                            1 
225
-                                                                 Ptetraurelia 
226
-                                                                            1 
227
-                                                         Rnorvegicus;Hsapiens 
228
-                                                                            1 
229
-                                                                 Tthermophila 
230
-                                                                            1 
231
-                                                                    Vvinifera 
232
-                                                                            1 
233
-                                                                  Xtropicalis 
234
-                                                                            1 
235
-\end{Soutput}
236
-\end{Schunk}
237
-
238
-With these categories of metadata
239
-\begin{Schunk}
240
-\begin{Sinput}
241
-> colnames (values (MotifDb))
242
-\end{Sinput}
243
-\begin{Soutput}
244
- [1] "providerName"    "providerId"      "dataSource"      "geneSymbol"     
245
- [5] "geneId"          "geneIdType"      "proteinId"       "proteinIdType"  
246
- [9] "organism"        "sequenceCount"   "bindingSequence" "bindingDomain"  
247
-[13] "tfFamily"        "experimentType"  "pubmedID"       
248
-\end{Soutput}
249
-\end{Schunk}
250
-\section{Selection}
251
-
252
-There are three ways to extract subsets of interest from the MotifDb collection.  All three operate upon the MotifDb metadata,
253
-matching values in one or more of those fifteen attributes (listed just above), and returning the subset of MotifDb  which
254
-meet the specified criteria.  The three techniques:  \emph{query}, \emph{subset} and \emph{grep}
255
-
256
-\subsection{query}
257
-This is the simplest technique to use, and will suffice in many circumstances.  For example, if you want
258
-all of the human matrices:
259
-\begin{Schunk}
260
-\begin{Sinput}
261
-> query (MotifDb, 'hsapiens')
262
-\end{Sinput}
263
-\begin{Soutput}
264
-MotifDb object of length 5399
265
-| Created from downloaded public sources: 2013-Aug-30
266
-| 5399 position frequency matrices from 18 sources:
267
-|         cisbp_1.02:  313
268
-|        HOCOMOCOv10:  640
269
-| HOCOMOCOv11-core-A:  181
270
-| HOCOMOCOv11-core-B:   84
271
-| HOCOMOCOv11-core-C:  135
272
-| HOCOMOCOv11-full-A:   46
273
-| HOCOMOCOv11-full-B:   19
274
-| HOCOMOCOv11-full-C:   13
275
-| HOCOMOCOv11-full-D:  290
276
-|               hPDI:  437
277
-|        JASPAR_2014:  117
278
-|        JASPAR_CORE:   66
279
-|         jaspar2016:  442
280
-|         jaspar2018:  537
281
-|          jolma2013:  710
282
-|            stamlab:  683
283
-|       SwissRegulon:  684
284
-|           UniPROBE:    2
285
-| 8 organism/s
286
-|           Hsapiens: 5384
287
-| Mmusculus;Rnorvegicus;Hsapiens:    6
288
-| Mmusculus;Hsapiens:    3
289
-| Mmusculus;Rnorvegicus;Xlaevis;Stropicalis;Ggallus;Hsapiens;Btaurus;Ocuniculus:    2
290
-| Mmusculus;Rnorvegicus;Hsapiens;Ocuniculus:    1
291
-| Mmusculus;Rnorvegicus;Omykiss;Ggallus;Hsapiens:    1
292
-|              other:    2
293
-Hsapiens-cisbp_1.02-M1838_1.02 
294
-Hsapiens-cisbp_1.02-M1857_1.02 
295
-Hsapiens-cisbp_1.02-M1875_1.02 
296
-Hsapiens-cisbp_1.02-M1880_1.02 
297
-Hsapiens-cisbp_1.02-M1889_1.02 
298
-...
299
-Hsapiens-SwissRegulon-ZNF784.SwissRegulon 
300
-Hsapiens-SwissRegulon-ZNF8.SwissRegulon 
301
-Hsapiens-SwissRegulon-ZSCAN4.SwissRegulon 
302
-Hsapiens-UniPROBE-Sox4.UP00401 
303
-Hsapiens-UniPROBE-Oct_1.UP00399 
304
-\end{Soutput}
305
-\end{Schunk}
306
-If you want all matrices associated with \textbf{\emph{Sox}} transcription factors, regardless of dataSource or organism:
307
-\begin{Schunk}
308
-\begin{Sinput}
309
-> query (MotifDb, 'sox')
310
-\end{Sinput}
311
-\begin{Soutput}
312
-MotifDb object of length 196
313
-| Created from downloaded public sources: 2013-Aug-30
314
-| 196 position frequency matrices from 17 sources:
315
-|    FlyFactorSurvey:    2
316
-|        HOCOMOCOv10:   25
317
-| HOCOMOCOv11-core-A:    1
318
-| HOCOMOCOv11-core-B:    5
319
-| HOCOMOCOv11-core-C:    2
320
-| HOCOMOCOv11-full-A:    2
321
-| HOCOMOCOv11-full-B:    1
322
-| HOCOMOCOv11-full-D:    9
323
-|              HOMER:    9
324
-|               hPDI:    2
325
-|        JASPAR_2014:    8
326
-|        JASPAR_CORE:    5
327
-|         jaspar2016:   16
328
-|         jaspar2018:   19
329
-|          jolma2013:   56
330
-|       SwissRegulon:   19
331
-|           UniPROBE:   15
332
-| 7 organism/s
333
-|           Hsapiens:  115
334
-|          Mmusculus:   67
335
-|      Dmelanogaster:    2
336
-| Mmusculus;Rnorvegicus;Hsapiens:    1
337
-|        Rnorvegicus:    1
338
-|         Vertebrata:    1
339
-|              other:    9
340
-Dmelanogaster-FlyFactorSurvey-Sox14_SANGER_10_FBgn0005612 
341
-Dmelanogaster-FlyFactorSurvey-Sox15_SANGER_5_FBgn0005613 
342
-Hsapiens-HOCOMOCOv10-SOX10_HUMAN.H10MO.D 
343
-Hsapiens-HOCOMOCOv10-SOX11_HUMAN.H10MO.D 
344
-Hsapiens-HOCOMOCOv10-SOX13_HUMAN.H10MO.D 
345
-...
346
-Mmusculus-UniPROBE-Sox30.UP00023 
347
-Mmusculus-UniPROBE-Sox4.UP00062 
348
-Mmusculus-UniPROBE-Sox5.UP00091 
349
-Mmusculus-UniPROBE-Sox7.UP00034 
350
-Mmusculus-UniPROBE-Sox8.UP00051 
351
-\end{Soutput}
352
-\end{Schunk}
353
-For all yeast transcription factors with a homeo domain
354
-\begin{Schunk}
355
-\begin{Sinput}
356
-> query (query (MotifDb, 'cerevisiae'), 'homeo')
357
-\end{Sinput}
358
-\begin{Soutput}
359
-MotifDb object of length 32
360
-| Created from downloaded public sources: 2013-Aug-30
361
-| 32 position frequency matrices from 5 sources:
362
-|        JASPAR_2014:   10
363
-|        JASPAR_CORE:   10
364
-|         jaspar2016:    4
365
-|         jaspar2018:    4
366
-|           UniPROBE:    4
367
-| 1 organism/s
368
-|        Scerevisiae:   32
369
-Scerevisiae-JASPAR_CORE-CUP9-MA0288.1 
370
-Scerevisiae-JASPAR_CORE-HMRA2-MA0318.1 
371
-Scerevisiae-JASPAR_CORE-MATA1-MA0327.1 
372
-Scerevisiae-JASPAR_CORE-MATALPHA2-MA0328.1 
373
-Scerevisiae-JASPAR_CORE-PHO2-MA0356.1 
374
-...
375
-Scerevisiae-jaspar2018-TOS8-MA0408.1 
376
-Scerevisiae-UniPROBE-Cup9.UP00308 
377
-Scerevisiae-UniPROBE-Matalpha2.UP00307 
378
-Scerevisiae-UniPROBE-Pho2.UP00268 
379
-Scerevisiae-UniPROBE-Yox1.UP00274 
380
-\end{Soutput}
381
-\end{Schunk}
382
-The last example may inspire more confidence in the precision of the result than is justified, and for a couple
383
-of reasons.  First, the assignment of  protein binding domains to specific categories is, as of 2012, an ad hoc
384
-and incomplete process.  Second, the query command matches the supplied character string to \emph{all} metadata
385
-columns.  In this case, 'homeo' appears both in the \emph{bindingDomain} column and the \emph{tfFamily} column,
386
-and the above \emph{query} will return matches from both.
387
-Searching and filtering should always be accompanied by close scrutiny of the data, such as these commands
388
-illustrate:
389
-
390
-\begin{Schunk}
391
-\begin{Sinput}
392
-> unique (grep ('homeo', values(MotifDb)$bindingDomain, ignore.case=T, v=T))
393
-\end{Sinput}
394
-\begin{Soutput}
395
- [1] "Homeobox"                          "Hox9_act;Homeobox"                
396
- [3] "LIM;Homeobox"                      "PAX;Homeobox"                     
397
- [5] "OAR;Homeobox"                      "Pou;Homeobox"                     
398
- [7] "Distant similarity to homeodomain" "Homeo"                            
399
- [9] "Homeo, PAX"                        "Homeo, POU"                       
400
-\end{Soutput}
401
-\begin{Sinput}
402
-> unique (grep ('homeo', values(MotifDb)$tfFamily, ignore.case=T, v=T))
403
-\end{Sinput}
404
-\begin{Soutput}
405
- [1] "HOX-related factors{3.1.1}: CDX (Caudal type homeobox){3.1.1.9}"          
406
- [2] "HOX-related factors{3.1.1}: GBX (Gastrulation brain homeobox){3.1.1.11}"  
407
- [3] "TALE-type homeo domain factors{3.1.4}: IRX (Iroquois){3.1.4.1}"           
408
- [4] "TALE-type homeo domain factors{3.1.4}: MEIS{3.1.4.2}"                     
409
- [5] "Paired domain only{3.2.2}: PAX-1/9 (no homeo remnant){3.2.2.1}"           
410
- [6] "Paired domain only{3.2.2}: PAX-2-like factors (partial homeobox){3.2.2.2}"
411
- [7] "Paired plus homeo domain{3.2.1}: PAX-3/7{3.2.1.1}"                        
412
- [8] "Paired plus homeo domain{3.2.1}: PAX-4/6{3.2.1.2}"                        
413
- [9] "TALE-type homeo domain factors{3.1.4}: PBX{3.1.4.4}"                      
414
-[10] "TALE-type homeo domain factors{3.1.4}: PKNOX{3.1.4.5}"                    
415
-[11] "TALE-type homeo domain factors{3.1.4}: TGIF{3.1.4.6}"                     
416
-[12] "Homeo"                                                                    
417
-[13] "Homeo::Nuclear Factor I-CCAAT-binding"                                    
418
-[14] "Homeodomain"                                                              
419
-[15] "Paired plus homeo domain"                                                 
420
-[16] "TALE-type homeo domain factors"                                           
421
-[17] "homeodomain"                                                              
422
-\end{Soutput}
423
-\end{Schunk}
424
-\subsection{grep}
425
-This selection method (and the next, \emph{subset}) requires that you address metadata columns explicitly.  This is a little more
426
-work, but the requisite direct engagement with the metadata is worthwhile.  Repeating the 'query' examples from above,
427
-you can see how more knowledge of MotifDb metadata is required.
428
-\begin{Schunk}
429
-\begin{Sinput}
430
-> mdb.human <- MotifDb [grep ('Hsapiens', values (MotifDb)$organism)]
431
-> mdb.sox <- MotifDb [grep ('sox', values (MotifDb)$geneSymbol, ignore.case=TRUE)]
432
-> yeast.indices = grepl ('scere', values (MotifDb)$organism, ignore.case=TRUE)
433
-> homeo.indices.domain = grepl ('homeo', values (MotifDb)$bindingDomain, ignore.case=TRUE)
434
-> homeo.indices.family = grepl ('homeo', values (MotifDb)$tfFamily, ignore.case=TRUE)
435
-> yeast.homeo.indices = yeast.indices & (homeo.indices.domain | homeo.indices.family)
436
-> yeast.homeoDb = MotifDb [yeast.homeo.indices]
437
-\end{Sinput}
438
-\end{Schunk}
439
-
440
-An alternate and somewhat more compact approach:
441
-\begin{Schunk}
442
-\begin{Sinput}
443
-> yeast.homeo.indices <- with(values(MotifDb),
444
-+   grepl('scere', organism, ignore.case=TRUE) &
445
-+     (grepl('homeo', bindingDomain, ignore.case=TRUE) |
446
-+      grepl('homeo', tfFamily, ignore.case=TRUE)))
447
-> 
448
-\end{Sinput}
449
-\end{Schunk}
450
-\subsection{subset}
451
-MotifDb::subset emulates the R base data.frame \emph{subset} command, which is not unlike an SQL select function.
452
-Unfortunately -- and just like the R base subset function -- this MotifDb method cannot be used reliably  within a script:
453
-\emph{It is only reliable when called interactively.}  Here, with mixed success (as you will see), we use MotifDb::subset to
454
-reproduce the \emph{query} and \emph{grep} selections shown above.
455
-
456
-\begin{Schunk}
457
-\begin{Sinput}
458
-> if (interactive ())
459
-+   subset (MotifDb, organism=='Hsapiens')
460
-\end{Sinput}
461
-\end{Schunk}
462
-One can easily find all the 'sox' genes with the subset command, avoiding possible upper/lower case conflicts by passing
463
-the metadata's geneSymbol column through the function 'tolower':
464
-\begin{Schunk}
465
-\begin{Sinput}
466
-> if (interactive ())
467
-+   subset (MotifDb, tolower (geneSymbol) == 'sox4')
468
-\end{Sinput}
469
-\end{Schunk}
470
-Similarly, subset has limited application for a permissive 'homeo' search.
471
-But for the retrieval by explicitly specified search terms, subset works very well:
472
-\begin{Schunk}
473
-\begin{Sinput}
474
-> if (interactive ())
475
-+   subset (MotifDb, organism=='Scerevisiae' & bindingDomain=='Homeo')
476
-\end{Sinput}
477
-\end{Schunk}
478
-
479
-\subsection{The Egr1 Case Study}
480
-
481
-We now do a simple geneSymbol search, followed by an examination of the sub-MotifDb the search returns.  We are looking for all matrices
482
-associated with the well-known and highly conserved zinc-finger transcription factor, Egr1.
483
-There are seventeen of these in MotifDb, most of them from human or mouse, drawn from ten different data sources.
484
-
485
-\begin{Schunk}
486
-\begin{Sinput}
487
->   # subset is convenient:
488
-> if (interactive ())
489
-+   as.list (subset (MotifDb, tolower (geneSymbol) == 'egr1'))
490
->   # grep returns indices which allow for more flexibility
491
-> indices = grep ('egr1', values (MotifDb)$geneSymbol, ignore.case=TRUE)
492
-> length (indices)
493
-\end{Sinput}
494
-\begin{Soutput}
495
-[1] 17
496
-\end{Soutput}
497
-\end{Schunk}
498
-There are a variety of ways to examine and extract data from this object, a MotifList of length 17.
499
-\begin{Schunk}
500
-\begin{Sinput}
501
-> MotifDb [indices]
502
-\end{Sinput}
503
-\begin{Soutput}
504
-MotifDb object of length 17
505
-| Created from downloaded public sources: 2013-Aug-30
506
-| 17 position frequency matrices from 10 sources:
507
-|        HOCOMOCOv10:    3
508
-| HOCOMOCOv11-core-A:    1
509
-|              HOMER:    1
510
-|        JASPAR_2014:    1
511
-|        JASPAR_CORE:    1
512
-|         jaspar2016:    2
513
-|         jaspar2018:    3
514
-|          jolma2013:    3
515
-|       SwissRegulon:    1
516
-|           UniPROBE:    1
517
-| 3 organism/s
518
-|           Hsapiens:   10
519
-|          Mmusculus:    6
520
-|              other:    1
521
-Hsapiens-HOCOMOCOv10-EGR1_HUMAN.H10MO.A 
522
-Hsapiens-HOCOMOCOv10-EGR1_HUMAN.H10MO.S 
523
-Mmusculus-HOCOMOCOv10-EGR1_MOUSE.H10MO.A 
524
-Hsapiens-HOCOMOCOv11A-EGR1_HUMAN.H11MO.0.A 
525
-NA-HOMER-Egr1(Zf)/K562-Egr1-ChIP-Seq(GSE32465)/Homer 
526
-...
527
-Hsapiens-jolma2013-EGR1 
528
-Hsapiens-jolma2013-EGR1-2 
529
-Mmusculus-jolma2013-Egr1 
530
-Hsapiens-SwissRegulon-EGR1.SwissRegulon 
531
-Mmusculus-UniPROBE-Egr1.UP00007 
532
-\end{Soutput}
533
-\end{Schunk}
534
-
535
-Now view the matrices as a named list:
536
-\begin{Schunk}
537
-\begin{Sinput}
538
-> as.list (MotifDb [indices])
539
-\end{Sinput}
540
-\begin{Soutput}
541
-$`Hsapiens-HOCOMOCOv10-EGR1_HUMAN.H10MO.A`
542
-      1     2     3     4     5     6     7     8     9    10    11    12    13
543
-A 0.190 0.208 0.212 0.270 0.222 0.116 0.168 0.042 0.034 0.160 0.008 0.032 0.262
544
-C 0.192 0.206 0.144 0.140 0.074 0.082 0.484 0.042 0.008 0.006 0.000 0.038 0.452
545
-G 0.438 0.446 0.452 0.468 0.380 0.756 0.050 0.808 0.452 0.804 0.976 0.914 0.006
546
-T 0.180 0.140 0.192 0.122 0.324 0.046 0.298 0.108 0.506 0.030 0.016 0.016 0.280
547
-     14    15    16    17    18
548
-A 0.180 0.072 0.236 0.278 0.218
549
-C 0.012 0.012 0.092 0.098 0.184
550
-G 0.750 0.774 0.534 0.458 0.490
551
-T 0.058 0.142 0.138 0.166 0.108
552
-
553
-$`Hsapiens-HOCOMOCOv10-EGR1_HUMAN.H10MO.S`
554
-          1          2          3           4           5          6
555
-A 0.1515633 0.04398516 0.05988341 0.003709592 0.009538951 0.07578166
556
-C 0.1886592 0.06041335 0.82829889 0.001059883 0.013248543 0.01907790
557
-G 0.3184950 0.88288288 0.01854796 0.993640700 0.490726020 0.90196078
558
-T 0.3412825 0.01271860 0.09326974 0.001589825 0.486486486 0.00317965
559
-            7          8           9          10         11
560
-A 0.025437202 0.01218866 0.089030207 0.021727610 0.09062003
561
-C 0.065712772 0.01006889 0.748277689 0.007419184 0.06518283
562
-G 0.900900901 0.97774245 0.007419184 0.955484897 0.60943296
563
-T 0.007949126 0.00000000 0.155272920 0.015368309 0.23476418
564
-
565
-$`Mmusculus-HOCOMOCOv10-EGR1_MOUSE.H10MO.A`
566
-          1          2          3           4           5          6
567
-A 0.1515633 0.04398516 0.05988341 0.003709592 0.009538951 0.07578166
568
-C 0.1886592 0.06041335 0.82829889 0.001059883 0.013248543 0.01907790
569
-G 0.3184950 0.88288288 0.01854796 0.993640700 0.490726020 0.90196078
570
-T 0.3412825 0.01271860 0.09326974 0.001589825 0.486486486 0.00317965
571
-            7          8           9          10         11
572
-A 0.025437202 0.01218866 0.089030207 0.021727610 0.09062003
573
-C 0.065712772 0.01006889 0.748277689 0.007419184 0.06518283
574
-G 0.900900901 0.97774245 0.007419184 0.955484897 0.60943296
575
-T 0.007949126 0.00000000 0.155272920 0.015368309 0.23476418
576
-
577
-$`Hsapiens-HOCOMOCOv11A-EGR1_HUMAN.H11MO.0.A`
578
-      1     2     3     4     5     6     7     8     9    10    11    12    13
579
-A 0.232 0.320 0.278 0.236 0.112 0.236 0.056 0.032 0.092 0.016 0.024 0.220 0.088
580
-C 0.248 0.128 0.180 0.112 0.030 0.506 0.034 0.004 0.002 0.004 0.032 0.476 0.010
581
-G 0.410 0.318 0.460 0.354 0.808 0.046 0.870 0.410 0.892 0.958 0.930 0.004 0.842
582
-T 0.110 0.234 0.082 0.298 0.050 0.212 0.040 0.554 0.014 0.022 0.014 0.300 0.060
583
-     14    15    16    17
584
-A 0.110 0.338 0.312 0.264
585
-C 0.022 0.128 0.082 0.182
586
-G 0.696 0.406 0.456 0.436
587
-T 0.172 0.128 0.150 0.118
588
-
589
-$`NA-HOMER-Egr1(Zf)/K562-Egr1-ChIP-Seq(GSE32465)/Homer`
590
-      1     2     3     4     5     6     7     8     9    10
591
-A 0.128 0.078 0.154 0.001 0.001 0.027 0.001 0.001 0.153 0.034
592
-C 0.072 0.036 0.523 0.001 0.001 0.001 0.002 0.001 0.415 0.002
593
-G 0.142 0.882 0.023 0.997 0.282 0.971 0.973 0.997 0.010 0.940
594
-T 0.658 0.004 0.300 0.001 0.716 0.001 0.024 0.001 0.422 0.024
595
-
596
-$`Mmusculus-JASPAR_CORE-Egr1-MA0162.1`
597
-           1          2         3 4   5   6          7 8         9 10
598
-A 0.20000000 0.13333333 0.0000000 0 0.0 0.2 0.06666667 0 0.1333333  0
599
-C 0.26666667 0.06666667 0.8666667 0 0.0 0.0 0.00000000 0 0.6666667  0
600
-G 0.06666667 0.80000000 0.0000000 1 0.2 0.8 0.93333333 1 0.0000000  1
601
-T 0.46666667 0.00000000 0.1333333 0 0.8 0.0 0.00000000 0 0.2000000  0
602
-          11
603
-A 0.06666667
604
-C 0.00000000
605
-G 0.46666667
606
-T 0.46666667
607
-
608
-$`Hsapiens-JASPAR_2014-EGR1-MA0162.2`
609
-           1         2          3          4          5         6 7          8
610
-A 0.08958877 0.1228786 0.09464752 0.10892624 0.01901110 0.2375163 0 0.00000000
611
-C 0.46736292 0.5586651 0.49355418 0.85109334 0.94435379 0.0000000 1 0.96703655
612
-G 0.25155026 0.1108845 0.18358355 0.00000000 0.00000000 0.5580940 0 0.00000000
613
-T 0.19149804 0.2075718 0.22821475 0.03998042 0.03663512 0.2043897 0 0.03296345
614
-           9         10         11        12         13        14
615
-A 0.00000000 0.29797650 0.00000000 0.1932115 0.00000000 0.2468995
616
-C 0.82849217 0.68219648 0.97519582 0.0000000 0.80360640 0.4565111
617
-G 0.04985313 0.00000000 0.00000000 0.5384302 0.11586162 0.1560868
618
-T 0.12165470 0.01982702 0.02480418 0.2683584 0.08053198 0.1405026
619
-
620
-$`Mmusculus-jaspar2016-Egr1-MA0162.1`
621
-           1          2         3 4   5   6          7 8         9 10
622
-A 0.20000000 0.13333333 0.0000000 0 0.0 0.2 0.06666667 0 0.1333333  0
623
-C 0.26666667 0.06666667 0.8666667 0 0.0 0.0 0.00000000 0 0.6666667  0
624
-G 0.06666667 0.80000000 0.0000000 1 0.2 0.8 0.93333333 1 0.0000000  1
625
-T 0.46666667 0.00000000 0.1333333 0 0.8 0.0 0.00000000 0 0.2000000  0
626
-          11
627
-A 0.06666667
628
-C 0.00000000
629
-G 0.46666667
630
-T 0.46666667
631
-
632
-$`Hsapiens-jaspar2016-EGR1-MA0162.2`
633
-           1         2          3          4          5         6 7          8
634
-A 0.08958877 0.1228786 0.09464752 0.10892624 0.01901110 0.2375163 0 0.00000000
635
-C 0.46736292 0.5586651 0.49355418 0.85109334 0.94435379 0.0000000 1 0.96703655
636
-G 0.25155026 0.1108845 0.18358355 0.00000000 0.00000000 0.5580940 0 0.00000000
637
-T 0.19149804 0.2075718 0.22821475 0.03998042 0.03663512 0.2043897 0 0.03296345
638
-           9         10         11        12         13        14
639
-A 0.00000000 0.29797650 0.00000000 0.1932115 0.00000000 0.2468995
640
-C 0.82849217 0.68219648 0.97519582 0.0000000 0.80360640 0.4565111
641
-G 0.04985313 0.00000000 0.00000000 0.5384302 0.11586162 0.1560868
642
-T 0.12165470 0.01982702 0.02480418 0.2683584 0.08053198 0.1405026
643
-
644
-$`Mmusculus-jaspar2018-Egr1-MA0162.1`
645
-           1          2         3 4   5   6          7 8         9 10
646
-A 0.20000000 0.13333333 0.0000000 0 0.0 0.2 0.06666667 0 0.1333333  0
647
-C 0.26666667 0.06666667 0.8666667 0 0.0 0.0 0.00000000 0 0.6666667  0
648
-G 0.06666667 0.80000000 0.0000000 1 0.2 0.8 0.93333333 1 0.0000000  1
649
-T 0.46666667 0.00000000 0.1333333 0 0.8 0.0 0.00000000 0 0.2000000  0
650
-          11
651
-A 0.06666667
652
-C 0.00000000
653
-G 0.46666667
654
-T 0.46666667
655
-
656
-$`Hsapiens-jaspar2018-EGR1-MA0162.2`
657
-           1         2          3          4          5         6 7          8
658
-A 0.08958877 0.1228786 0.09464752 0.10892624 0.01901110 0.2375163 0 0.00000000
659
-C 0.46736292 0.5586651 0.49355418 0.85109334 0.94435379 0.0000000 1 0.96703655
660
-G 0.25155026 0.1108845 0.18358355 0.00000000 0.00000000 0.5580940 0 0.00000000
661
-T 0.19149804 0.2075718 0.22821475 0.03998042 0.03663512 0.2043897 0 0.03296345
662
-           9         10         11        12         13        14
663
-A 0.00000000 0.29797650 0.00000000 0.1932115 0.00000000 0.2468995
664
-C 0.82849217 0.68219648 0.97519582 0.0000000 0.80360640 0.4565111
665
-G 0.04985313 0.00000000 0.00000000 0.5384302 0.11586162 0.1560868
666
-T 0.12165470 0.01982702 0.02480418 0.2683584 0.08053198 0.1405026
667
-
668
-$`Hsapiens-jaspar2018-EGR1-MA0162.3`
669
-          1           2           3          4 5           6         7
670
-A 0.2722977 0.737507906 0.006723716 0.01834431 0 0.000000000 0.0000000
671
-C 0.2309510 0.249209361 0.987775061 0.00000000 1 0.992159228 0.9797136
672
-G 0.1139988 0.001897533 0.001833741 0.98165569 0 0.000000000 0.0000000
673
-T 0.3827525 0.011385199 0.003667482 0.00000000 0 0.007840772 0.0202864
674
-            8            9           10         11         12         13
675
-A 0.795439739 0.0000000000 0.0000000000 0.00000000 0.86166008 0.29390244
676
-C 0.200000000 0.9993943065 0.0000000000 0.99220156 0.01317523 0.27926829
677
-G 0.004560261 0.0000000000 0.9990732159 0.00000000 0.10540184 0.06341463
678
-T 0.000000000 0.0006056935 0.0009267841 0.00779844 0.01976285 0.36341463
679
-         14
680
-A 0.3035714
681
-C 0.1255952
682
-G 0.1077381
683
-T 0.4630952
684
-
685
-$`Hsapiens-jolma2013-EGR1`
686
-          1          2           3           4           5           6
687
-A 0.2494781 0.51390568 0.003223727 0.105202754 0.000000000 0.002604167
688
-C 0.2411273 0.39540508 0.969696970 0.005355777 0.980025773 0.992838542
689
-G 0.1539666 0.03627570 0.007736944 0.854246366 0.007731959 0.000000000
690
-T 0.3554280 0.05441354 0.019342360 0.035195103 0.012242268 0.004557292
691
-            7           8           9         10       11         12        13
692
-A 0.000000000 0.652638191 0.003253090 0.01906158 0.010000 0.68089431 0.2790573
693
-C 0.928214732 0.343592965 0.995445673 0.01136364 0.938125 0.06910569 0.2485270
694
-G 0.009363296 0.000000000 0.000000000 0.93181818 0.011875 0.14227642 0.1253348
695
-T 0.062421973 0.003768844 0.001301236 0.03775660 0.040000 0.10772358 0.3470809
696
-         14
697
-A 0.2673936
698
-C 0.1905504
699
-G 0.1396677
700
-T 0.4023884
701
-
702
-$`Hsapiens-jolma2013-EGR1-2`
703
-          1           2           3          4 5           6         7
704
-A 0.2722977 0.737507906 0.006723716 0.01834431 0 0.000000000 0.0000000
705
-C 0.2309510 0.249209361 0.987775061 0.00000000 1 0.992159228 0.9797136
706
-G 0.1139988 0.001897533 0.001833741 0.98165569 0 0.000000000 0.0000000
707
-T 0.3827525 0.011385199 0.003667482 0.00000000 0 0.007840772 0.0202864
708
-            8            9           10         11         12         13
709
-A 0.795439739 0.0000000000 0.0000000000 0.00000000 0.86166008 0.29390244
710
-C 0.200000000 0.9993943065 0.0000000000 0.99220156 0.01317523 0.27926829
711
-G 0.004560261 0.0000000000 0.9990732159 0.00000000 0.10540184 0.06341463
712
-T 0.000000000 0.0006056935 0.0009267841 0.00779844 0.01976285 0.36341463
713
-         14
714
-A 0.3035714
715
-C 0.1255952
716
-G 0.1077381
717
-T 0.4630952
718
-
719
-$`Mmusculus-jolma2013-Egr1`
720
-          1          2           3            4           5            6
721
-A 0.3231418 0.32278481 0.618181818 0.0000000000 0.075444498 0.0000000000
722
-C 0.3241961 0.30907173 0.366753247 0.9968454259 0.004324844 0.9994728519
723
-G 0.1133368 0.03691983 0.003636364 0.0005257624 0.911100432 0.0005271481
724
-T 0.2393253 0.33122363 0.011428571 0.0026288118 0.009130226 0.0000000000
725
-            7 8           9          10         11          12         13
726
-A 0.001578117 0 0.517114271 0.003149606 0.00422833 0.001579779 0.89181562
727
-C 0.997369805 1 0.481305951 0.995275591 0.16732105 0.998420221 0.05738476
728
-G 0.001052078 0 0.001579779 0.000000000 0.25581395 0.000000000 0.03621825
729
-T 0.000000000 0 0.000000000 0.001574803 0.57263667 0.000000000 0.01458137
730
-          14         15         16
731
-A 0.44251055 0.31170886 0.26213080
732
-C 0.32278481 0.19778481 0.31012658
733
-G 0.04957806 0.04272152 0.09651899
734
-T 0.18512658 0.44778481 0.33122363
735
-
736
-$`Hsapiens-SwissRegulon-EGR1.SwissRegulon`
737
-           1          2         3 4   5   6          7 8         9 10
738
-A 0.20000000 0.13333333 0.0000000 0 0.0 0.2 0.06666667 0 0.1333333  0
739
-C 0.26666667 0.06666667 0.8666667 0 0.0 0.0 0.00000000 0 0.6666667  0
740
-G 0.06666667 0.80000000 0.0000000 1 0.2 0.8 0.93333333 1 0.0000000  1
741
-T 0.46666667 0.00000000 0.1333333 0 0.8 0.0 0.00000000 0 0.2000000  0
742
-          11
743
-A 0.06666667
744
-C 0.00000000
745
-G 0.46666667
746
-T 0.46666667
747
-
748
-$`Mmusculus-UniPROBE-Egr1.UP00007`
749
-          1          2          3          4           5           6
750
-A 0.2115466 0.14198757 0.03260499 0.11512588 0.003516173 0.004715059
751
-C 0.2827083 0.72243721 0.87717185 0.07060553 0.990021152 0.982482238
752
-G 0.2034722 0.05485440 0.01243161 0.78128969 0.002264928 0.009896878
753
-T 0.3022730 0.08072082 0.07779155 0.03297890 0.004197748 0.002905824
754
-            7           8           9         10         11         12
755
-A 0.001626612 0.262351637 0.005889514 0.02289301 0.02303758 0.56763334
756
-C 0.975937323 0.731731673 0.985755764 0.09046006 0.85994854 0.05739392
757
-G 0.001661635 0.002729558 0.002081402 0.64932246 0.03791264 0.16679165
758
-T 0.020774430 0.003187133 0.006273319 0.23732447 0.07910124 0.20818108
759
-         13        14
760
-A 0.1765973 0.1830489
761
-C 0.3312648 0.1837744
762
-G 0.1253083 0.2267928
763
-T 0.3668295 0.4063840
764
-\end{Soutput}
765
-\end{Schunk}
766
-and finally, the metadata associated with these matrices, transposed, for easy reading and comparison:
767
-\begin{Schunk}
768
-\begin{Sinput}
769
-> noquote (t (as.data.frame (values (MotifDb [indices]))))
770
-\end{Sinput}
771
-\begin{Soutput}
772
-                Hsapiens-HOCOMOCOv10-EGR1_HUMAN.H10MO.A
773
-providerName    EGR1_HUMAN.H10MO.A                     
774
-providerId      EGR1_HUMAN.H10MO.A                     
775
-dataSource      HOCOMOCOv10                            
776
-geneSymbol      EGR1                                   
777
-geneId          <NA>                                   
778
-geneIdType      <NA>                                   
779
-proteinId       P18146                                 
780
-proteinIdType   UNIPROT                                
781
-organism        Hsapiens                               
782
-sequenceCount   500                                    
783
-bindingSequence <NA>                                   
784
-bindingDomain   <NA>                                   
785
-tfFamily        <NA>                                   
786
-experimentType  low- and high-throughput methods       
787
-pubmedID        26586801                               
788
-                Hsapiens-HOCOMOCOv10-EGR1_HUMAN.H10MO.S
789
-providerName    EGR1_HUMAN.H10MO.S                     
790
-providerId      EGR1_HUMAN.H10MO.S                     
791
-dataSource      HOCOMOCOv10                            
792
-geneSymbol      EGR1                                   
793
-geneId          <NA>                                   
794
-geneIdType      <NA>                                   
795
-proteinId       P18146                                 
796
-proteinIdType   UNIPROT                                
797
-organism        Hsapiens                               
798
-sequenceCount   1887                                   
799
-bindingSequence <NA>                                   
800
-bindingDomain   <NA>                                   
801
-tfFamily        <NA>                                   
802
-experimentType  low- and high-throughput methods       
803
-pubmedID        26586801                               
804
-                Mmusculus-HOCOMOCOv10-EGR1_MOUSE.H10MO.A
805
-providerName    EGR1_MOUSE.H10MO.A                      
806
-providerId      EGR1_MOUSE.H10MO.A                      
807
-dataSource      HOCOMOCOv10                             
808
-geneSymbol      EGR1                                    
809
-geneId          <NA>                                    
810
-geneIdType      <NA>                                    
811
-proteinId       P08046                                  
812
-proteinIdType   UNIPROT                                 
813
-organism        Mmusculus                               
814
-sequenceCount   1887                                    
815
-bindingSequence <NA>                                    
816
-bindingDomain   <NA>                                    
817
-tfFamily        <NA>                                    
818
-experimentType  low- and high-throughput methods        
819
-pubmedID        26586801                                
820
-                Hsapiens-HOCOMOCOv11A-EGR1_HUMAN.H11MO.0.A                            
821
-providerName    EGR1_HUMAN.H11MO.0.A                                                  
822
-providerId      EGR1_HUMAN.H11MO.0.A                                                  
823
-dataSource      HOCOMOCOv11-core-A                                                    
824
-geneSymbol      EGR1                                                                  
825
-geneId          1958                                                                  
826
-geneIdType      ENTREZ                                                                
827
-proteinId       EGR1_HUMAN                                                            
828
-proteinIdType   UNIPROT                                                               
829
-organism        Hsapiens                                                              
830
-sequenceCount   500                                                                   
831
-bindingSequence <NA>                                                                  
832
-bindingDomain   <NA>                                                                  
833
-tfFamily        Three-zinc finger Krüppel-related factors{2.3.1}: EGR factors{2.3.1.3}
834
-experimentType  ChIP-Seq                                                              
835
-pubmedID        23175603                                                              
836
-                NA-HOMER-Egr1(Zf)/K562-Egr1-ChIP-Seq(GSE32465)/Homer
837
-providerName    Egr1(Zf)/K562-Egr1-ChIP-Seq(GSE32465)/Homer         
838
-providerId      Egr1(Zf)/K562-Egr1-ChIP-Seq(GSE32465)/Homer         
839
-dataSource      HOMER                                               
840
-geneSymbol      Egr1                                                
841
-geneId          <NA>                                                
842
-geneIdType      <NA>                                                
843
-proteinId       ?query=Egr1(Zf)_K562-Egr1-ChIP-Seq(GSE32465)        
844
-proteinIdType   UNIPROT                                             
845
-organism        <NA>                                                
846
-sequenceCount   1                                                   
847
-bindingSequence <NA>                                                
848
-bindingDomain   <NA>                                                
849
-tfFamily        <NA>                                                
850
-experimentType  low- and high-throughput methods                    
851
-pubmedID        26586801                                            
852
-                Mmusculus-JASPAR_CORE-Egr1-MA0162.1
853
-providerName    Egr1                               
854
-providerId      MA0162.1                           
855
-dataSource      JASPAR_CORE                        
856
-geneSymbol      Egr1                               
857
-geneId          13653                              
858
-geneIdType      ENTREZ                             
859
-proteinId       P08046                             
860
-proteinIdType   UNIPROT                            
861
-organism        Mmusculus                          
862
-sequenceCount   15                                 
863
-bindingSequence <NA>                               
864
-bindingDomain   Zinc-coordinating                  
865
-tfFamily        BetaBetaAlpha-zinc finger          
866
-experimentType  bacterial 1-hybrid                 
867
-pubmedID        16041365                           
868
-                Hsapiens-JASPAR_2014-EGR1-MA0162.2
869
-providerName    EGR1                              
870
-providerId      MA0162.2                          
871
-dataSource      JASPAR_2014                       
872
-geneSymbol      EGR1                              
873
-geneId          1958                              
874
-geneIdType      ENTREZ                            
875
-proteinId       P18146                            
876
-proteinIdType   UNIPROT                           
877
-organism        Hsapiens                          
878
-sequenceCount   12256                             
879
-bindingSequence <NA>                              
880
-bindingDomain   Zinc-coordinating                 
881
-tfFamily        BetaBetaAlpha-zinc finger         
882
-experimentType  ChIP-seq                          
883
-pubmedID        16041365                          
884
-                Mmusculus-jaspar2016-Egr1-MA0162.1
885
-providerName    MA0162.1                          
886
-providerId      MA0162.1                          
887
-dataSource      jaspar2016                        
888
-geneSymbol      Egr1                              
889
-geneId          <NA>                              
890
-geneIdType      <NA>                              
891
-proteinId       P08046                            
892
-proteinIdType   UNIPROT                           
893
-organism        Mmusculus                         
894
-sequenceCount   15                                
895
-bindingSequence <NA>                              
896
-bindingDomain   <NA>                              
897
-tfFamily        BetaBetaAlpha-zinc finger         
898
-experimentType  bacterial 1-hybrid                
899
-pubmedID        24194598                          
900
-                Hsapiens-jaspar2016-EGR1-MA0162.2        
901
-providerName    MA0162.2                                 
902
-providerId      MA0162.2                                 
903
-dataSource      jaspar2016                               
904
-geneSymbol      EGR1                                     
905
-geneId          <NA>                                     
906
-geneIdType      <NA>                                     
907
-proteinId       P18146                                   
908
-proteinIdType   UNIPROT                                  
909
-organism        Hsapiens                                 
910
-sequenceCount   12256                                    
911
-bindingSequence <NA>                                     
912
-bindingDomain   <NA>                                     
913
-tfFamily        Three-zinc finger Krüppel-related factors
914
-experimentType  ChIP-seq                                 
915
-pubmedID        24194598                                 
916
-                Mmusculus-jaspar2018-Egr1-MA0162.1        
917
-providerName    MA0162.1                                  
918
-providerId      MA0162.1                                  
919
-dataSource      jaspar2018                                
920
-geneSymbol      Egr1                                      
921
-geneId          <NA>                                      
922
-geneIdType      <NA>                                      
923
-proteinId       P08046                                    
924
-proteinIdType   UNIPROT                                   
925
-organism        Mmusculus                                 
926
-sequenceCount   15                                        
927
-bindingSequence <NA>                                      
928
-bindingDomain   <NA>                                      
929
-tfFamily        Three-zinc finger Kruppel-related factors 
930
-experimentType  bacterial 1-hybrid                        
931
-pubmedID        16041365                                  
932
-                Hsapiens-jaspar2018-EGR1-MA0162.2         
933
-providerName    MA0162.2                                  
934
-providerId      MA0162.2                                  
935
-dataSource      jaspar2018                                
936
-geneSymbol      EGR1                                      
937
-geneId          <NA>                                      
938
-geneIdType      <NA>                                      
939
-proteinId       P18146                                    
940
-proteinIdType   UNIPROT                                   
941
-organism        Hsapiens                                  
942
-sequenceCount   12256                                     
943
-bindingSequence <NA>                                      
944
-bindingDomain   <NA>                                      
945
-tfFamily        Three-zinc finger Kruppel-related factors 
946
-experimentType  ChIP-seq                                  
947
-pubmedID        16041365                                  
948
-                Hsapiens-jaspar2018-EGR1-MA0162.3         
949
-providerName    MA0162.3                                  
950
-providerId      MA0162.3                                  
951
-dataSource      jaspar2018                                
952
-geneSymbol      EGR1                                      
953
-geneId          <NA>                                      
954
-geneIdType      <NA>                                      
955
-proteinId       P18146                                    
956
-proteinIdType   UNIPROT                                   
957
-organism        Hsapiens                                  
958
-sequenceCount   2158                                      
959
-bindingSequence <NA>                                      
960
-bindingDomain   <NA>                                      
961
-tfFamily        Three-zinc finger Kruppel-related factors 
962
-experimentType  HT-SELEX                                  
963
-pubmedID        16041365                                  
964
-                Hsapiens-jolma2013-EGR1 Hsapiens-jolma2013-EGR1-2
965
-providerName    Hsapiens-jolma2013-EGR1 Hsapiens-jolma2013-EGR1-2
966
-providerId      EGR1                    EGR1                     
967
-dataSource      jolma2013               jolma2013                
968
-geneSymbol      EGR1                    EGR1                     
969
-geneId          1958                    1958                     
970
-geneIdType      ENTREZ                  ENTREZ                   
971
-proteinId       <NA>                    <NA>                     
972
-proteinIdType   <NA>                    <NA>                     
973
-organism        Hsapiens                Hsapiens                 
974
-sequenceCount   1831                    1703                     
975
-bindingSequence NMCGCCCMCGCANN          NACGCCCACGCANN           
976
-bindingDomain   <NA>                    <NA>                     
977
-tfFamily        C2H2                    C2H2                     
978
-experimentType  SELEX                   SELEX                    
979
-pubmedID        23332764                23332764                 
980
-                Mmusculus-jolma2013-Egr1
981
-providerName    Mmusculus-jolma2013-Egr1
982
-providerId      Egr1                    
983
-dataSource      jolma2013               
984
-geneSymbol      Egr1                    
985
-geneId          1958                    
986
-geneIdType      ENTREZ                  
987
-proteinId       <NA>                    
988
-proteinIdType   <NA>                    
989
-organism        Mmusculus               
990
-sequenceCount   2013                    
991
-bindingSequence NNMCGCCCMCTCANNN        
992
-bindingDomain   <NA>                    
993
-tfFamily        C2H2                    
994
-experimentType  SELEX                   
995
-pubmedID        23332764                
996
-                Hsapiens-SwissRegulon-EGR1.SwissRegulon
997
-providerName    EGR1.SwissRegulon                      
998
-providerId      EGR1.SwissRegulon                      
999
-dataSource      SwissRegulon                           
1000
-geneSymbol      EGR1                                   
1001
-geneId          <NA>                                   
1002
-geneIdType      <NA>                                   
1003
-proteinId       <NA>                                   
1004
-proteinIdType   UNIPROT                                
1005
-organism        Hsapiens                               
1006
-sequenceCount   15                                     
1007
-bindingSequence <NA>                                   
1008
-bindingDomain   <NA>                                   
1009
-tfFamily        <NA>                                   
1010
-experimentType  low- and high-throughput methods       
1011
-pubmedID        19377474                               
1012
-                Mmusculus-UniPROBE-Egr1.UP00007
1013
-providerName    SCI09/Egr1_pwm_primary.txt     
1014
-providerId      UP00007                        
1015
-dataSource      UniPROBE                       
1016
-geneSymbol      Egr1                           
1017
-geneId          13653                          
1018
-geneIdType      ENTREZ                         
1019
-proteinId       P08046                         
1020
-proteinIdType   UNIPROT                        
1021
-organism        Mmusculus                      
1022
-sequenceCount   <NA>                           
1023
-bindingSequence <NA>                           
1024
-bindingDomain   ZnF_C2H2                       
1025
-tfFamily        <NA>                           
1026
-experimentType  protein binding microarray     
1027
-pubmedID        19443739                       
1028
-\end{Soutput}
1029
-\end{Schunk}
1030
-
1031
-We used the \emph{grep} function above to find rows in the metadata table whose \emph{geneSymbol} column includes the string 'Egr1'.
1032
-If you wish to identify matrices (and/or their attendant metadata) based upon a richer combination of criteria, for instance:
1033
-
1034
-\begin{enumerate}
1035
-  \item organism  (\emph{Mmusculus})
1036
-  \item gene symbol  (\emph{Egr1})
1037
-  \item data source  (\emph{JASPAR\_CORE})
1038
-\end{enumerate}
1039
-
1040
-the grep solution, while serviceable, becomes a little awkward:
1041
-\begin{Schunk}
1042
-\begin{Sinput}
1043
-> geneSymbol.rows = grep ('Egr1', values (MotifDb)$geneSymbol, ignore.case=TRUE)
1044
-> organism.rows = grep ('Mmusculus', values (MotifDb)$organism, ignore.case=TRUE)
1045
-> source.rows = grep ('JASPAR', values (MotifDb)$dataSource, ignore.case=TRUE)
1046
-> egr1.mouse.jaspar.rows = intersect (geneSymbol.rows,
1047
-+                            intersect (organism.rows, source.rows))
1048
-> print (egr1.mouse.jaspar.rows)
1049
-\end{Sinput}
1050
-\begin{Soutput}
1051
-[1] 4246 5400 6521
1052
-\end{Soutput}
1053
-\begin{Sinput}
1054
-> egr1.motif <- MotifDb [egr1.mouse.jaspar.rows]
1055
-\end{Sinput}
1056
-\end{Schunk}
1057
-
1058
-Far more concise, and fully reliable as an interactive command (though \emph{not} if used in a
1059
-script\footnote{See the help page of the base R command \emph{subset} for details.}), is the \emph{subset} command:
1060
-\begin{Schunk}
1061
-\begin{Sinput}
1062
-> if (interactive ()) {
1063
-+   egr1.motif <- subset (MotifDb, organism=='Mmusculus' &
1064
-+                         dataSource=='JASPAR_CORE' &
1065
-+                         geneSymbol=='Egr1')
1066
-+   }
1067
-\end{Sinput}
1068
-\end{Schunk}
1069
-Whichever method you use, this next chunk of code displays the matrix, and then the metadata for mouse JASPAR Egr1, the latter
1070
-textually-transformed for easy reading within the size constraints of this page.
1071
-\begin{Schunk}
1072
-\begin{Sinput}
1073
-> egr1.motif
1074
-\end{Sinput}
1075
-\begin{Soutput}
1076
-MotifDb object of length 3
1077
-| Created from downloaded public sources: 2013-Aug-30
1078
-| 3 position frequency matrices from 3 sources:
1079
-|        JASPAR_CORE:    1
1080
-|         jaspar2016:    1
1081
-|         jaspar2018:    1
1082
-| 1 organism/s
1083
-|          Mmusculus:    3
1084
-Mmusculus-JASPAR_CORE-Egr1-MA0162.1 
1085
-Mmusculus-jaspar2016-Egr1-MA0162.1 
1086
-Mmusculus-jaspar2018-Egr1-MA0162.1 
1087
-\end{Soutput}
1088
-\begin{Sinput}
1089
-> as.list (egr1.motif)
1090
-\end{Sinput}
1091
-\begin{Soutput}
1092
-$`Mmusculus-JASPAR_CORE-Egr1-MA0162.1`
1093
-           1          2         3 4   5   6          7 8         9 10
1094
-A 0.20000000 0.13333333 0.0000000 0 0.0 0.2 0.06666667 0 0.1333333  0
1095
-C 0.26666667 0.06666667 0.8666667 0 0.0 0.0 0.00000000 0 0.6666667  0
1096
-G 0.06666667 0.80000000 0.0000000 1 0.2 0.8 0.93333333 1 0.0000000  1
1097
-T 0.46666667 0.00000000 0.1333333 0 0.8 0.0 0.00000000 0 0.2000000  0
1098
-          11
1099
-A 0.06666667
1100
-C 0.00000000
1101
-G 0.46666667
1102
-T 0.46666667
1103
-
1104
-$`Mmusculus-jaspar2016-Egr1-MA0162.1`
1105
-           1          2         3 4   5   6          7 8         9 10
1106
-A 0.20000000 0.13333333 0.0000000 0 0.0 0.2 0.06666667 0 0.1333333  0
1107
-C 0.26666667 0.06666667 0.8666667 0 0.0 0.0 0.00000000 0 0.6666667  0
1108
-G 0.06666667 0.80000000 0.0000000 1 0.2 0.8 0.93333333 1 0.0000000  1
1109
-T 0.46666667 0.00000000 0.1333333 0 0.8 0.0 0.00000000 0 0.2000000  0
1110
-          11
1111
-A 0.06666667
1112
-C 0.00000000
1113
-G 0.46666667
1114
-T 0.46666667
1115
-
1116
-$`Mmusculus-jaspar2018-Egr1-MA0162.1`
1117
-           1          2         3 4   5   6          7 8         9 10
1118
-A 0.20000000 0.13333333 0.0000000 0 0.0 0.2 0.06666667 0 0.1333333  0
1119
-C 0.26666667 0.06666667 0.8666667 0 0.0 0.0 0.00000000 0 0.6666667  0
1120
-G 0.06666667 0.80000000 0.0000000 1 0.2 0.8 0.93333333 1 0.0000000  1
1121
-T 0.46666667 0.00000000 0.1333333 0 0.8 0.0 0.00000000 0 0.2000000  0
1122
-          11
1123
-A 0.06666667
1124
-C 0.00000000
1125
-G 0.46666667
1126
-T 0.46666667
1127
-\end{Soutput}
1128
-\begin{Sinput}
1129
-> noquote (t (as.data.frame (values (egr1.motif))))
1130
-\end{Sinput}
1131
-\begin{Soutput}
1132
-                Mmusculus-JASPAR_CORE-Egr1-MA0162.1
1133
-providerName    Egr1                               
1134
-providerId      MA0162.1                           
1135
-dataSource      JASPAR_CORE                        
1136
-geneSymbol      Egr1                               
1137
-geneId          13653                              
1138
-geneIdType      ENTREZ                             
1139
-proteinId       P08046                             
1140
-proteinIdType   UNIPROT                            
1141
-organism        Mmusculus                          
1142
-sequenceCount   15                                 
1143
-bindingSequence <NA>                               
1144
-bindingDomain   Zinc-coordinating                  
1145
-tfFamily        BetaBetaAlpha-zinc finger          
1146
-experimentType  bacterial 1-hybrid                 
1147
-pubmedID        16041365                           
1148
-                Mmusculus-jaspar2016-Egr1-MA0162.1
1149
-providerName    MA0162.1                          
1150
-providerId      MA0162.1                          
1151
-dataSource      jaspar2016                        
1152
-geneSymbol      Egr1                              
1153
-geneId          <NA>                              
1154
-geneIdType      <NA>                              
1155
-proteinId       P08046                            
1156
-proteinIdType   UNIPROT                           
1157
-organism        Mmusculus                         
1158
-sequenceCount   15                                
1159
-bindingSequence <NA>                              
1160
-bindingDomain   <NA>                              
1161
-tfFamily        BetaBetaAlpha-zinc finger         
1162
-experimentType  bacterial 1-hybrid                
1163
-pubmedID        24194598                          
1164
-                Mmusculus-jaspar2018-Egr1-MA0162.1        
1165
-providerName    MA0162.1                                  
1166
-providerId      MA0162.1                                  
1167
-dataSource      jaspar2018                                
1168
-geneSymbol      Egr1                                      
1169
-geneId          <NA>                                      
1170
-geneIdType      <NA>                                      
1171
-proteinId       P08046                                    
1172
-proteinIdType   UNIPROT                                   
1173
-organism        Mmusculus                                 
1174
-sequenceCount   15                                        
1175
-bindingSequence <NA>                                      
1176
-bindingDomain   <NA>                                      
1177
-tfFamily        Three-zinc finger Kruppel-related factors 
1178
-experimentType  bacterial 1-hybrid                        
1179
-pubmedID        16041365                                  
1180
-\end{Soutput}
1181
-\end{Schunk}
1182
-
1183
-
1184
-Next we use the Bioconductor \emph{seqLogo} package to display this motif.
1185
-
1186
-\begin{Schunk}
1187
-\begin{Sinput}
1188
-> seqLogo (as.list (egr1.motif)[[1]])
1189
-\end{Sinput}
1190
-\end{Schunk}
1191
-
1192
-\begin{figure}[htpb!]
1193
-  \centering
1194
-  \includegraphics[width=0.3\textwidth]{MotifDb-egr1}
1195
-  \caption{Mmusculus-JASPAR\_CORE-Egr1-MA0162.1}
1196
-\end{figure}
1197
-
1198
-\section{Motif Matching}
1199
-We will look for the ten position frequency matrices which are the best match to JASPAR's mouse EGR1, using
1200
-the MotIV package.  We actually request the top eleven hits from the entire MotifDb, because the first hit
1200
-should be the target matrix itself, which is necessarily present in the full MotifDb.
1202
-
1203
-\begin{Schunk}
1204
-\begin{Sinput}
1205
-> egr1.hits <- motifMatch (as.list (egr1.motif) [1], as.list (MotifDb), top=11)
1206 0
\ No newline at end of file