Browse code

motifToGene refactored, now suuports c('motifDb', tfClass') sources

paul-shannon authored on 18/05/2018 22:05:19
Showing4 changed files

... ...
@@ -1,8 +1,8 @@
1 1
 Package: MotifDb
2 2
 Type: Package
3 3
 Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
-Version: 1.21.4
5
-Date: 2018-05-10
4
+Version: 1.21.6
5
+Date: 2018-05-18
6 6
 Author: Paul Shannon, Matt Richards
7 7
 Maintainer: Paul Shannon <pshannon@systemsbiology.org>
8 8
 Depends: R (>= 2.15.0), methods, BiocGenerics, S4Vectors, IRanges, Biostrings
... ...
@@ -4,7 +4,6 @@ exportMethods (
4 4
    export,
5 5
    show,
6 6
    query,
7
-   query2,
8 7
    motifToGene,
9 8
    geneToMotif,
10 9
    associateTranscriptionFactors
... ...
@@ -1,5 +1,4 @@
1
-setGeneric('query', signature='object', function(object, queryString, ignore.case=TRUE) standardGeneric ('query'))
2
-setGeneric('query2', signature='object', function(object, andStrings, orStrings=c(), notStrings=c(), ignore.case=TRUE) standardGeneric ('query2'))
1
+setGeneric('query', signature='object', function(object, andStrings, orStrings=c(), notStrings=c(), ignore.case=TRUE) standardGeneric ('query'))
3 2
 setGeneric('motifToGene', signature='object', function(object, motifs, source) standardGeneric('motifToGene'))
4 3
 setGeneric('geneToMotif', signature='object', function(object, geneSymbols, source, ignore.case=FALSE) standardGeneric('geneToMotif'))
5 4
 setGeneric('associateTranscriptionFactors', signature='object',
... ...
@@ -271,17 +270,17 @@ setMethod('show', 'MotifList',
271 270
       }
272 271
     })
273 272
 #-------------------------------------------------------------------------------
274
-setMethod ('query', 'MotifList',
275
-
276
-   function (object, queryString, ignore.case=TRUE) {
277
-       indices = unique (as.integer (unlist (sapply (colnames (mcols (object)),
278
-                    function (colname)
279
-                       grep (queryString, mcols (object)[, colname],
280
-                             ignore.case=ignore.case)))))
281
-        object [indices]
282
-      })
273
+#setMethod ('query', 'MotifList',
274
+#
275
+#   function (object, queryString, ignore.case=TRUE) {
276
+#       indices = unique (as.integer (unlist (sapply (colnames (mcols (object)),
277
+#                    function (colname)
278
+#                       grep (queryString, mcols (object)[, colname],
279
+#                             ignore.case=ignore.case)))))
280
+#        object [indices]
281
+#      })
283 282
 #-------------------------------------------------------------------------------
284
-setMethod ('query2', 'MotifList',
283
+setMethod ('query', 'MotifList',
285 284
 
286 285
    function (object, andStrings, orStrings=c(), notStrings=c(), ignore.case=TRUE) {
287 286
       find.indices <- function(queryString)
... ...
@@ -384,37 +383,103 @@ matrixToJasparText <- function (matrices)
384 383
 } # matrixToJasparText
385 384
 #-------------------------------------------------------------------------------
386 385
 # returns a data.frame with motif, geneSymbol, source, pubmedID columns
386
+# setMethod ('oldMotifToGene', 'MotifList',
387
+#
388
+#    function (object, motifs, source) {
389
+#      source <- tolower(source)
390
+#      stopifnot(source %in% c("motifdb", "tfclass"))
391
+#      tbl <- data.frame()
392
+#      if(source %in% c("motifdb")){
393
+#         providerId <- NULL   # avoid R CMD check note
394
+#         tbl <- as.data.frame(subset(mcols(object), providerId %in% motifs))
395
+#         if(nrow(tbl) == 0)
396
+#            return(data.frame())
397
+#         tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")])
398
+#         colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID")
399
+#         tbl <- tbl[, c("motif", "geneSymbol", "dataSource", "organism", "pubmedID")]
400
+#         if(nrow(tbl) > 0)
401
+#            tbl$source <- "MotifDb"
402
+#         }
403
+#      if(source %in% c("tfclass")){
404
+#         motif <- NULL
405
+#         tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, motif %in% motifs)
406
+#         if(nrow(tbl) == 0)
407
+#            return(data.frame())
408
+#         tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")])
409
+#         tbl <- tbl[order(tbl$motif),]
410
+#         rownames(tbl) <- NULL
411
+#         colnames(tbl) <- c("motif", "geneSymbol", "pubmedID")
412
+#         if(nrow(tbl) > 0)
413
+#            tbl$source <- "TFClass"
414
+#         }
415
+#      tbl
416
+#      }) # oldMotifToGene
417
+#
418
+#-------------------------------------------------------------------------------
419
+# returns a data.frame with motif, geneSymbol, source, pubmedID columns
387 420
 setMethod ('motifToGene', 'MotifList',
388 421
 
389 422
    function (object, motifs, source) {
423
+      # for MotifDb, motif names come in a variety of forms, and our first step
424
+      # is to convert them all, if needed, into that which is found in
425
+      # the "providerId" column of the MotifDb metadata table.
426
+      #
427
+      # first check to see if the supplied motif name is actually a MotifDb
428
+      # matrix list name, e.g., Hsapiens-HOCOMOCOv10-IKZF1_HUMAN.H10MO.C
429
+      # when those cases are discovered, they are translated to the matrices
430
+      # providerId - which is our standard currency for lookup, using
431
+      # the mcols (the metadata, the annotation) which accompanies
432
+      # each pfm matrix
433
+
434
+     name.map <- as.list(motifs)
435
+     names(name.map) <- motifs
436
+     for(i in seq_len(length(motifs))){
437
+        x <-match(motifs[i], names(MotifDb));
438
+        if(!is.na(x)){
439
+           newValue <-  mcols(MotifDb[motifs[i]])$providerId
440
+           names(name.map)[i] <- newValue
441
+           motifs[i] <-newValue
442
+            }
443
+        } # for i
444
+     #browser()
390 445
      source <- tolower(source)
391
-     stopifnot(source %in% c("motifdb", "tfclass"))
392
-     tbl <- data.frame()
393
-     if(source %in% c("motifdb")){
446
+     stopifnot(all(source %in% c("motifdb", "tfclass")))
447
+     tbl.mdb <- data.frame()
448
+     if("motifdb" %in% source){
394 449
         providerId <- NULL   # avoid R CMD check note
395
-        tbl <- as.data.frame(subset(mcols(object), providerId %in% motifs))
396
-        if(nrow(tbl) == 0)
450
+        tbl.mdb <- as.data.frame(subset(mcols(object), providerId %in% motifs))
451
+        if(nrow(tbl.mdb) == 0)
397 452
            return(data.frame())
398
-        tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")])
399
-        colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID")
400
-        tbl <- tbl[, c("motif", "geneSymbol", "dataSource", "organism", "pubmedID")]
401
-        if(nrow(tbl) > 0)
402
-           tbl$source <- "MotifDb"
403
-        }
404
-     if(source %in% c("tfclass")){
453
+        tbl.mdb <- unique(tbl.mdb [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")])
454
+        colnames(tbl.mdb) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID")
455
+        tbl.mdb <- tbl.mdb[, c("motif", "geneSymbol", "dataSource", "organism", "pubmedID")]
456
+        if(nrow(tbl.mdb) > 0){
457
+           tbl.mdb$source <- "MotifDb"
458
+           tbl.mdb <- tbl.mdb[, c("motif", "geneSymbol", "pubmedID", "organism", "source")]
459
+           rownames(tbl.mdb) <- NULL
460
+           }
461
+        }  # motifDb
462
+     tbl.tfc <- data.frame()
463
+     if("tfclass" %in% source){
405 464
         motif <- NULL
406
-        tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, motif %in% motifs)
407
-        if(nrow(tbl) == 0)
465
+        tbl.tfc <- subset(object@manuallyCuratedGeneMotifAssociationTable, motif %in% motifs)
466
+        if(nrow(tbl.tfc) == 0)
408 467
            return(data.frame())
409
-        tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")])
410
-        tbl <- tbl[order(tbl$motif),]
411
-        rownames(tbl) <- NULL
412
-        colnames(tbl) <- c("motif", "geneSymbol", "pubmedID")
413
-        if(nrow(tbl) > 0)
414
-           tbl$source <- "TFClass"
468
+        tbl.tfc <- unique(tbl.tfc[, c("motif", "tf.gene", "pubmedID")])
469
+        tbl.tfc <- tbl.tfc[order(tbl.tfc$motif),]
470
+        rownames(tbl.tfc) <- NULL
471
+        colnames(tbl.tfc) <- c("motif", "geneSymbol", "pubmedID")
472
+        if(nrow(tbl.tfc) > 0){
473
+           tbl.tfc$source <- "TFClass"
474
+           tbl.tfc$organism <- "Hsapiens"
475
+           }
415 476
         }
416
-     tbl
417
-     })
477
+      tbl.out <- rbind(tbl.mdb, tbl.tfc)
478
+      if(length(name.map) > 0)
479
+         tbl.out$motif <- as.character(name.map[tbl.out$motif])
480
+      #browser()
481
+      tbl.out
482
+      })
418 483
 
419 484
 #-------------------------------------------------------------------------------
420 485
 # returns a data.frame with motif, geneSymbol, source, pubmedID columns
... ...
@@ -28,8 +28,8 @@ runTests = function ()
28 28
   test.allFullNames ()
29 29
   test.subset ()
30 30
   test.subsetWithVariables ()
31
-  test.query ()
32
-  test.query2()
31
+  test.queryOldStyle ()
32
+  test.query()
33 33
   test.transformMatrixToMemeRepresentation ()
34 34
   test.matrixToMemeText ()
35 35
   test.export_memeFormatStdOut ()
... ...
@@ -48,6 +48,7 @@ runTests = function ()
48 48
   test.motifToGene()
49 49
 
50 50
   test.associateTranscriptionFactors()
51
+
51 52
 } # runTests
52 53
 #------------------------------------------------------------------------------------------------------------------------
53 54
 test.emptyCtor = function ()
... ...
@@ -396,9 +397,10 @@ test.subsetWithVariables = function ()
396 397
 
397 398
 } # test.subsetWithVariables
398 399
 #------------------------------------------------------------------------------------------------------------------------
399
-test.query = function ()
400
+# "old style": just one query term allowed
401
+test.queryOldStyle = function ()
400 402
 {
401
-  print ('--- test.query')
403
+  print ('--- test.queryOldStyle')
402 404
   mdb = MotifDb
403 405
 
404 406
     # do queries on dataSource counts match those from a contingency table?
... ...
@@ -437,43 +439,43 @@ test.query = function ()
437 439
     # query uses base R's grep, in which
438 440
     #  x <- query(mdb, "ELK1,4_GABP{A,B1}.p3")
439 441
 
440
-} # test.query
442
+} # test.queryOldStyle
441 443
 #------------------------------------------------------------------------------------------------------------------------
442
-test.query2 <- function()
444
+test.query <- function()
443 445
 {
444
-  print ('--- test.query2')
446
+  print ('--- test.query')
445 447
   mdb = MotifDb
446 448
 
447 449
   ors <- c("MA0511.1", "MA0057.1")
448 450
   ands <- c("jaspar2018", "sapiens")
449 451
   nots <- "cisbp"
450
-  x <- query2(mdb, andStrings=ands, orStrings=ors)
452
+  x <- query(mdb, andStrings=ands, orStrings=ors)
451 453
   checkEquals(length(x), 2)
452 454
   checkEquals(sort(names(x)),
453 455
              c("Hsapiens-jaspar2018-MZF1(var.2)-MA0057.1", "Hsapiens-jaspar2018-RUNX2-MA0511.1"))
454 456
 
455
-  x <- query2(mdb, andStrings="MA0057.1")
457
+  x <- query(mdb, andStrings="MA0057.1")
456 458
   checkEquals(length(x), 15)
457 459
 
458
-  x <- query2(mdb, andStrings=c("MA0057.1", "cisbp"))
460
+  x <- query(mdb, andStrings=c("MA0057.1", "cisbp"))
459 461
   checkEquals(length(x), 11)
460 462
 
461
-  x <- query2(mdb, andStrings=c("MA0057.1"), notStrings="cisbp")
463
+  x <- query(mdb, andStrings=c("MA0057.1"), notStrings="cisbp")
462 464
   checkEquals(length(x), 4)
463 465
 
464
-  x <- query2(mdb, andStrings=c("MA0057.1"), notStrings=c("cisbp", "JASPAR_2014"))
466
+  x <- query(mdb, andStrings=c("MA0057.1"), notStrings=c("cisbp", "JASPAR_2014"))
465 467
   checkEquals(length(x), 3)
466 468
 
467
-  x <- query2(mdb, orStrings=c("mus", "sapiens"), andStrings="MA0057.1")
469
+  x <- query(mdb, orStrings=c("mus", "sapiens"), andStrings="MA0057.1")
468 470
   #checkEquals(sort(names(x)),
469 471
 
470 472
     # do queries on dataSource counts match those from a contingency table?
471 473
   sources.list = as.list (table (mcols(mdb)$dataSource))
472
-  checkEquals (length (query2 (mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey)
473
-  checkEquals (length (query2 (mdb, 'uniprobe')), sources.list$UniPROBE)
474
-  checkEquals (length (query2 (mdb, 'UniPROBE')), sources.list$UniPROBE)
474
+  checkEquals (length (query (mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey)
475
+  checkEquals (length (query (mdb, 'uniprobe')), sources.list$UniPROBE)
476
+  checkEquals (length (query (mdb, 'UniPROBE')), sources.list$UniPROBE)
475 477
 
476
-} # test.query2
478
+} # test.query
477 479
 #------------------------------------------------------------------------------------------------------------------------
478 480
 test.transformMatrixToMemeRepresentation = function ()
479 481
 {
... ...
@@ -881,25 +883,29 @@ test.geneToMotif.ignore.jasparSuffixes <- function()
881 883
 test.motifToGene <- function()
882 884
 {
883 885
    printf("--- test.motifToGene")
884
-   printf("Sys.getlocale: %s", Sys.getlocale())
886
+   #printf("Sys.getlocale: %s", Sys.getlocale())
887
+
888
+     # good test case of querying both sources,
889
+   motif <- "MA0099.2"
890
+   tbl <- motifToGene(MotifDb, motif, c("MotifDb", "TFclass"))
891
+      # [1]   mdb.genes: AP1, JUN::FOS, FOS::JUN, FOS::JUN
892
+      # [1]   tfc.genes: FOS, JUN
893
+   checkEquals(sort(unique(tbl$geneSymbol)), c("AP1", "FOS", "FOS::JUN", "JUN", "JUN::FOS"))
885 894
 
886 895
    motifs <- c("MA0592.2", "UP00022", "ELF1.SwissRegulon")
887 896
 
888 897
       # MotifDb mode uses the MotifDb metadata "providerId",
889 898
    tbl.mdb <- motifToGene(MotifDb, motifs, source="MotifDb")
890
-   checkEquals(dim(tbl.mdb), c(4, 6))
899
+   checkEquals(dim(tbl.mdb), c(4, 5))
891 900
    expected <- sort(c("MA0592.2", "MA0592.2", "ELF1.SwissRegulon", "UP00022"))
892 901
    actual <- sort(tbl.mdb$motif)
893 902
    checkEquals(actual, expected)
894 903
    checkEquals(sort(tbl.mdb$geneSymbol), sort(c("ELF1", "Esrra", "Esrra", "Zfp740")))
895
-   checkEquals(sort(tbl.mdb$dataSource), sort(c("jaspar2016", "jaspar2018", "SwissRegulon", "UniPROBE")))
896
-
897
-   checkEquals(sort(tbl.mdb$organism),   sort(c("Hsapiens", "Mmusculus", "Mmusculus", "Mmusculus")))
898 904
    checkEquals(tbl.mdb$source,     rep("MotifDb", 4))
899 905
 
900 906
       # TFClass mode uses  TF family classifcation
901 907
    tbl.tfClass <- motifToGene(MotifDb, motifs, source="TFClass")
902
-   checkEquals(dim(tbl.tfClass), c(9,4))
908
+   checkEquals(dim(tbl.tfClass), c(9,5))
903 909
    checkEquals(tbl.tfClass$motif, rep("MA0592.2", 9))
904 910
    checkEquals(sort(tbl.tfClass$gene), sort(c("AR", "ESR1", "ESR2", "ESRRA", "ESRRB", "ESRRG", "NR3C1", "NR3C2", "PGR")))
905 911
    checkEquals(tbl.tfClass$source,       rep("TFClass", 9))
... ...
@@ -910,19 +916,49 @@ test.motifToGene <- function()
910 916
    checkEquals(nrow(tbl), 0)
911 917
 
912 918
    tbl <- motifToGene(MotifDb, motifs, source="tfclass")
913
-   checkEquals(ncol(tbl), 4)
919
+   checkEquals(ncol(tbl), 5)
914 920
    checkTrue(nrow(tbl) > 80)
915 921
    checkTrue(nrow(tbl) < 100)
916 922
    checkTrue(all(motifs %in% tbl$motif))
917 923
 
918
-} # test.motifToGene
924
+   motifs <- c("Hsapiens-HOCOMOCOv10-IKZF1_HUMAN.H10MO.C",
925
+               "MA0099.2",
926
+               "Hsapiens-SwissRegulon-ARID3A.SwissRegulon",
927
+               "MA0592.2",
928
+               "MA0036913_1.02")
929
+
930
+   tbl <- motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))
931
+   checkTrue(all(motifs %in% tbl$motif))
932
+
933
+   motif <- "Mmusculus;Rnorvegicus;Hsapiens-jaspar2018-FOS::JUN-MA0099.2"
934
+   tbl <- motifToGene(MotifDb, motif, source="TFClass")
935
+   checkEquals(tbl$geneSymbol, c("FOS", "JUN"))
936
+
937
+   motifs <- c("Hsapiens-jaspar2016-RUNX1-MA0002.1",
938
+               "Hsapiens-jaspar2016-TFAP2A-MA0003.1",
939
+               "Hsapiens-jaspar2016-TFAP2A-MA0003.2",
940
+               "Hsapiens-jaspar2016-TFAP2A-MA0003.3",
941
+               "Hsapiens-jaspar2016-AR-MA0007.2",
942
+               "MA0872.1")
943
+   tbl <- motifToGene(MotifDb, motifs, source="Motifdb")
944
+   checkTrue(all(motifs %in% tbl$motif))
945
+   checkEquals(sort(unique(tbl$geneSymbol)), c("AR", "RUNX1", "TFAP2A", "TFAP2A(var.3)"))
946
+
947
+   tbl <- motifToGene(MotifDb, motifs, source="TFClass")
948
+   checkEquals(sort(unique(tbl$motif)), c("Hsapiens-jaspar2016-TFAP2A-MA0003.3", "MA0872.1"))
949
+   checkEquals(sort(unique(tbl$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))
950
+
951
+   tbl <- motifToGene(MotifDb, motifs, source="TFClass")
952
+
953
+   } # test.motifToGene
919 954
 #------------------------------------------------------------------------------------------------------------------------
920 955
 test.associateTranscriptionFactors <- function()
921 956
 {
922 957
    printf("--- test.associateTranscriptionFactors")
923 958
 
924 959
    mdb <- MotifDb
925
-   pfms <- query(query(mdb, "jaspar2016"), "sapiens")
960
+   pfms <- query(mdb, andStrings=c("sapiens", "jaspar2016"))
961
+      # query(mdb, "jaspar2016", "sapiens")
926 962
 
927 963
       # first check motifs with MotifDb-style long names, using MotifDb lookup, in the
928 964
       # metadata of MotifDb:
... ...
@@ -983,6 +1019,29 @@ test.associateTranscriptionFactors <- function()
983 1019
    tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
984 1020
    checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE)
985 1021
 
986
-
1022
+      # now some motif names
987 1023
 } # test.associateTranscriptionFactors
988 1024
 #------------------------------------------------------------------------------------------------------------------------
1025
+findMotifsWithMutuallyExclusiveMappings <- function()
1026
+{
1027
+   xtab <- as.data.frame(table(MotifDb@manuallyCuratedGeneMotifAssociationTable$motif))
1028
+   xtab <- xtab[order(xtab$Freq, decreasing=TRUE),]
1029
+   for(motif in xtab$Var1){
1030
+      mdb.genes <- toupper(motifToGene(MotifDb, motif, "motifdb")$geneSymbol)
1031
+      tfc.genes <- toupper(motifToGene(MotifDb, motif, "tfclass")$geneSymbol)
1032
+      if(length(mdb.genes) == 0) next
1033
+      if(length(tfc.genes) == 0) next
1034
+      if(length(intersect(mdb.genes, tfc.genes)) == 0){
1035
+         printf("------ %s", motif)
1036
+         printf("  mdb.genes: %s", paste(mdb.genes, collapse=", "))
1037
+         printf("  tfc.genes: %s", paste(tfc.genes, collapse=", "))
1038
+        } # if no intersection
1039
+      } # for motif
1040
+
1041
+   #  [1] ------ MA0099.2
1042
+   #  [1]   mdb.genes: AP1, JUN::FOS, FOS::JUN, FOS::JUN
1043
+   #  [1]   tfc.genes: FOS, JUN
1044
+
1045
+
1046
+} # findMotifsWithMutuallyExclusiveMappings
1047
+#------------------------------------------------------------------------------------------------------------------------