Browse code

geneToMotif, motifToGene, associateTranscriptionFactors tested

paul-shannon authored on 07/09/2017 23:28:23
Showing4 changed files

... ...
@@ -1,8 +1,8 @@
1 1
 Package: MotifDb
2 2
 Type: Package
3 3
 Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
-Version: 1.19.3
5
-Date: 2017-08-12
4
+Version: 1.19.6
5
+Date: 2017-09-07
6 6
 Author: Paul Shannon, Matt Richards
7 7
 Maintainer: Paul Shannon <pshannon@systemsbiology.org>
8 8
 Depends: R (>= 2.15.0), methods, BiocGenerics, S4Vectors, IRanges, Biostrings
... ...
@@ -4,8 +4,9 @@ exportMethods (
4 4
    export,
5 5
    show,
6 6
    query,
7
-   mapMotifToTranscriptionFactorGeneSymbol,
8
-   mapTranscriptionFactorGeneSymbolToMotif
7
+   motifToGene,
8
+   geneToMotif,
9
+   associateTranscriptionFactors
9 10
    )
10 11
 
11 12
 export(
... ...
@@ -18,3 +19,4 @@ import(IRanges)
18 19
 importFrom(rtracklayer, export)
19 20
 import(Biostrings)
20 21
 import(methods)
22
+importFrom(splitstackshape, expandRows)
... ...
@@ -1,9 +1,10 @@
1 1
 setGeneric('query', signature='object', function(object, queryString, ignore.case=TRUE)
2 2
               standardGeneric ('query'))
3
-setGeneric('mapMotifToTranscriptionFactorGeneSymbol', signature='object', function(object, motifs, mode)
4
-              standardGeneric('mapMotifToTranscriptionFactorGeneSymbol'))
5
-setGeneric('mapTranscriptionFactorGeneSymbolToMotif', signature='object', function(object, geneSymbols, mode)
6
-              standardGeneric('mapTranscriptionFactorGeneSymbolToMotif'))
3
+setGeneric('motifToGene', signature='object', function(object, motifs, source) standardGeneric('motifToGene'))
4
+setGeneric('geneToMotif', signature='object', function(object, geneSymbols, source) standardGeneric('geneToMotif'))
5
+setGeneric('associateTranscriptionFactors', signature='object',
6
+           function(object, tbl.withMotifs, motif.column.name, source, expand.rows)
7
+              standardGeneric('associateTranscriptionFactors'))
7 8
 #------------------------------------------------------------------------------------------------------------------------
8 9
 setClass ('MotifList',
9 10
           contains='SimpleList',
... ...
@@ -323,7 +324,7 @@ matrixToJasparText <- function (matrices)
323 324
       # For each line of the matrix, print the correct letter and the
324 325
       # matrix row surrounded by brackets
325 326
       motif.matrix <- matrices[name][[1]]
326
-	    
327
+
327 328
 
328 329
       for (r in 1:nrow(motif.matrix)) {
329 330
           s[index] <- sprintf("%s [ %s ]",
... ...
@@ -342,35 +343,105 @@ matrixToJasparText <- function (matrices)
342 343
 } # matrixToJasparText
343 344
 #-------------------------------------------------------------------------------
344 345
 # returns a data.frame with motif, geneSymbol, source, pubmedID columns
345
-setMethod ('mapMotifToTranscriptionFactorGeneSymbol', 'MotifList',
346
-
347
-   function (object, motifs, mode) {
348
-     stopifnot(mode %in% c("direct", "indirect", "both"))
349
-     if(mode %in% c("direct", "both")){
350
-        # result <- rbind(result, newRows
346
+setMethod ('motifToGene', 'MotifList',
347
+
348
+   function (object, motifs, source) {
349
+     stopifnot(source %in% c("MotifDb", "TFClass"))
350
+     tbl <- data.frame()
351
+     if(source %in% c("MotifDb")){
352
+        tbl <- as.data.frame(subset(mcols(object), providerId %in% motifs))
353
+        tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")])
354
+        colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID")
355
+        tbl
356
+        tbl <- tbl[, c("motif", "geneSymbol", "dataSource", "organism", "pubmedID")]
357
+        tbl$from <- "MotifDb"
351 358
         }
352
-     if(mode %in% c("indirect", "both")){
353
-        # draw on object@manuallyCuratedGeneMotifAssociationTable
354
-        # result <- rbind(result, newRows
359
+     if(source %in% c("TFClass")){
360
+        tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, motif %in% motifs)
361
+        tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")])
362
+        tbl <- tbl[order(tbl$motif),]
363
+        rownames(tbl) <- NULL
364
+        colnames(tbl) <- c("motif", "gene", "pubmedID")
365
+        tbl$from <- "TFClass"
355 366
         }
356
-     result
367
+     tbl
357 368
      })
358 369
 
359 370
 #-------------------------------------------------------------------------------
360 371
 # returns a data.frame with motif, geneSymbol, source, pubmedID columns
361
-setMethod ('mapTranscriptionFactorGeneSymbolToMotif', 'MotifList',
362
-
363
-   function (object, geneSymbols, mode) {
364
-     stopifnot(mode %in% c("direct", "indirect"))
365
-     result <- data.frame()
366
-     if(mode %in% c("direct", "both")){
367
-        # result <- rbind(result, newRows
372
+setMethod ('geneToMotif', 'MotifList',
373
+
374
+   function (object, geneSymbols, source) {
375
+     stopifnot(source %in% c("MotifDb", "TFClass"))
376
+     #browser()
377
+     extract.mdb <- function(gene){
378
+        tbl <- as.data.frame(subset(mcols(object), geneSymbol == gene))
379
+        tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")])
380
+        colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID")
381
+        tbl
368 382
         }
369
-     if(mode %in% c("indirect", "both")){
370
-        # draw on object@manuallyCuratedGeneMotifAssociationTable
371
-        # result <- rbind(result, newRows
383
+     if(source %in% c("MotifDb")){
384
+        tbls <- lapply(geneSymbols, extract.mdb)
385
+        result <- do.call(rbind, tbls)
386
+        result$from <- "MotifDb"
387
+        }
388
+     if(source %in% c("TFClass")){
389
+        tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, tf.gene %in% geneSymbols)
390
+        tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")])
391
+        tbl <- tbl[order(tbl$tf.gene),]
392
+        rownames(tbl) <- NULL
393
+        colnames(tbl) <- c("motif", "gene", "pubmedID")
394
+        result <- tbl[, c("gene", "motif", "pubmedID")]
395
+        result$from <- "TFClass"
372 396
         }
373 397
      result
374 398
      })
375 399
 
376 400
 #-------------------------------------------------------------------------------
401
+setMethod('associateTranscriptionFactors', 'MotifList',
402
+
403
+   function(object, tbl.withMotifs, motif.column.name, source, expand.rows){
404
+     stopifnot(source %in% c("MotifDb", "TFClass"))
405
+     tbl.out <- data.frame()
406
+     if(source %in% c("MotifDb")){
407
+        # "direct" means:  lookup up in the object metadata, expect one TF geneSymbol per matrix name
408
+        pfm.ids <- tbl.withMotifs[, motif.column.name]
409
+        matched.rows <- match(pfm.ids, names(as.list(object)))
410
+        #if(length(matched.rows) == nrow(tbl.withMotifs)) {
411
+        tbl.new <- mcols(object)[matched.rows, c("geneSymbol", "pubmedID")]
412
+        tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
413
+        tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
414
+        tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
415
+         #}
416
+        } # direct
417
+     if(source %in% c("TFClass")){
418
+        tbl.tfClass <- read.table(system.file(package="MotifDb", "extdata", "tfClass.tsv"), sep="\t", as.is=TRUE, header=TRUE)
419
+        motif.ids <- tbl.withMotifs[, motif.column.name]
420
+        geneSymbols <- lapply(motif.ids, function(id) paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif)], collapse=";"))
421
+        geneSymbols <- unlist(geneSymbols)
422
+        pubmedIds   <- lapply(motif.ids, function(id) unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif)]))
423
+        pubmedIds   <- as.character(pubmedIds)
424
+        pubmedIds   <- gsub("integer(0)", "", pubmedIds, fixed=TRUE)
425
+        tbl.new     <- data.frame(geneSymbol=geneSymbols, pubmedID=pubmedIds, stringsAsFactors=FALSE)
426
+        tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
427
+        tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
428
+        tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
429
+
430
+        if(expand.rows){
431
+           rows.with.na <- which(is.na(tbl.out$geneSymbol))
432
+           rows.with.geneSymbol <- setdiff(1:nrow(tbl.out), rows.with.na)
433
+           tbl.asIs <- tbl.out[rows.with.na,]
434
+           tbl.toExpand <- tbl.out[rows.with.geneSymbol,]
435
+           geneSymbols.split <- strsplit(tbl.toExpand$geneSymbol, ";")
436
+           counts <- unlist(lapply(geneSymbols.split, length))
437
+           geneSymbols.split.vec <- unlist(geneSymbols.split)
438
+           tbl.expanded <- splitstackshape::expandRows(tbl.toExpand, counts, count.is.col=FALSE, drop=FALSE)
439
+           stopifnot(length(geneSymbols.split.vec) == nrow(tbl.expanded))
440
+           tbl.expanded$geneSymbol <- geneSymbols.split.vec
441
+           tbl.out <- rbind(tbl.expanded, tbl.asIs)
442
+           }
443
+        } # indirect
444
+     tbl.out
445
+     })
446
+
447
+#-------------------------------------------------------------------------------
... ...
@@ -3,7 +3,7 @@ library (RUnit)
3 3
 library (MotIV)
4 4
 library (seqLogo)
5 5
 #------------------------------------------------------------------------------------------------------------------------
6
-run.tests = function ()
6
+runTests = function ()
7 7
 {
8 8
   test.emptyCtor ()
9 9
   test.nonEmptyCtor ()
... ...
@@ -36,10 +36,14 @@ run.tests = function ()
36 36
   test.MotIV.toTable ()
37 37
   test.run_MotIV.motifMatch()
38 38
   test.flyFactorGeneSymbols()
39
-  test.export_jasparFormatStdOut ()
40
-  test.export_jasparFormatToFile ()
39
+  test.export_jasparFormatStdOut()
40
+  test.export_jasparFormatToFile()
41 41
 
42
-} # run.tests
42
+  test.geneToMotif()
43
+  test.motifToGene()
44
+  test.associateTranscriptionFactors()
45
+
46
+} # runTests
43 47
 #------------------------------------------------------------------------------------------------------------------------
44 48
 test.emptyCtor = function ()
45 49
 {
... ...
@@ -743,26 +747,144 @@ test.export_jasparFormatToFile = function ()
743 747
 
744 748
 } # test.exportjasparFormatToFile
745 749
 #------------------------------------------------------------------------------------------------------------------------
746
-test.motifTfGeneMapping <- function()
750
+test.geneToMotif <- function()
747 751
 {
748
-   printf("--- test.motifTfGeneMapping")
752
+   printf("--- test.geneToMotif")
749 753
    mdb <- MotifDb
750 754
 
751
-   set.seed(17)
752
-   random.10 <- sample(seq_len(nrow(mcols(mdb))), 10)
753
-   random.10.b <- sample(seq_len(nrow(mcols(mdb))), 10)
755
+   genes <- c("FOS", "ATF5", "bogus")
756
+
757
+      # use  TFClass family classifcation
758
+   tbl.i <- geneToMotif(mdb, genes, source="TFClass")
759
+   checkEquals(tbl.i$gene,  c("ATF5", "FOS", "FOS"))
760
+   checkEquals(tbl.i$motif,  c("MA0833.1", "MA0099.2", "MA0476.1"))
761
+   checkEquals(tbl.i$from, rep("TFClass", 3))
762
+
763
+      # MotifDb mode uses the MotifDb metadata, pulled from many sources
764
+   tbl.d <- geneToMotif(mdb, genes, source="MotifDb")
765
+   checkEquals(dim(tbl.d), c(12, 6))
766
+   checkEquals(subset(tbl.d, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1")
767
+      # no recognizable (i.e., jaspar standard) motif name returned by MotifDb metadata
768
+      # MotifDb for ATF5
769
+      # todo: compare the MA0110596_1.02 matrix of cisp_1.02 to japar MA0833.1
770
+
771
+    # now try motifs to genes
754 772
 
755
-   motifs <- mcols(mdb)[random.10, "providerId"]
756
-   genes  <- mcols(mdb)[random.10, "geneSymbol"]
773
+} # test.geneToMotif
774
+#------------------------------------------------------------------------------------------------------------------------
775
+test.motifToGene <- function()
776
+{
777
+   printf("--- test.motifToGene")
778
+   mdb <- MotifDb
757 779
 
758
-   tbl.m2tf <- mapMotifToTranscriptionFactorGeneSymbol(mdb, motifs, mode="direct")
759
-   tbl.m2tf <- mapMotifToTranscriptionFactorGeneSymbol(mdb, motifs, mode="indirect")
780
+   motifs <- c("MA0592.2", "UP00022", "ELF1.SwissRegulon")
760 781
 
761
-   tbl.tf2m <- mapTranscriptionFactorGeneSymbolToMotif(mdb, genes, mode="direct")
762
-   tbl.tf2m <- mapTranscriptionFactorGeneSymbolToMotif(mdb, genes, mode="indirect")
782
+      # TFClass mode uses  TF family classifcation
783
+   tbl.d <- motifToGene(mdb, motifs, source="MotifDb")
784
+   checkEquals(dim(tbl.d), c(3, 6))
785
+   checkEquals(tbl.d$motif, c("MA0592.2", "ELF1.SwissRegulon", "UP00022"))
786
+   checkEquals(tbl.d$geneSymbol, c("Esrra", "ELF1", "Zfp740"))
787
+   checkEquals(tbl.d$dataSource, c("jaspar2016", "SwissRegulon", "UniPROBE"))
788
+   checkEquals(tbl.d$organism,   c("Mmusculus", "Hsapiens", "Mmusculus"))
789
+   checkEquals(tbl.d$from,       rep("MotifDb", 3))
790
+
791
+      # MotifDb mode uses the MotifDb metadata, pulled from many sources
792
+   tbl.i <- motifToGene(mdb, motifs, source="TFClass")
793
+   checkEquals(dim(tbl.i), c(9,4))
794
+   checkEquals(tbl.i$motif, rep("MA0592.2", 9))
795
+   checkEquals(sort(tbl.i$gene), c("AR", "ESR1", "ESR2", "ESRRA", "ESRRB", "ESRRG", "NR3C1", "NR3C2", "PGR"))
796
+   checkEquals(tbl.i$from,       rep("TFClass", 9))
797
+
798
+} # test.geneToMotif
799
+#------------------------------------------------------------------------------------------------------------------------
800
+test.associateTranscriptionFactors <- function()
801
+{
802
+   printf("--- test.associateTranscriptionFactors")
803
+
804
+   mdb <- MotifDb
805
+   pfms <- query(query(mdb, "jaspar2016"), "sapiens")
763 806
 
764
-      # check these 4 tables for agreement
807
+      # first check motifs with MotifDb-style long names, using MotifDb lookup, in the
808
+      # metadata of MotifDb:
809
+      #      "Hsapiens-jaspar2016-RUNX1-MA0002.1"  "Hsapiens-jaspar2016-TFAP2A-MA0003.1"
765 810
 
811
+   motif.names <- names(pfms[1:5])
812
+   tbl <- data.frame(motifLongName=motif.names, score=runif(5), stringsAsFactors=FALSE)
813
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motifLongName", source="MotifDb", expand.rows=FALSE)
814
+   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
815
+   checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno)))
766 816
 
767
-} # test.motifTfGeneMapping
817
+      # now add in a bogus motif name, one for which there cannot possibly be a TF
818
+
819
+   motif.names[3] <- "bogus"
820
+   tbl <- data.frame(motifLongName=motif.names, score=runif(5), stringsAsFactors=FALSE)
821
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motifLongName", source="MotifDb", expand.rows=FALSE)
822
+   checkTrue(is.na(tbl.anno$geneSymbol[3]))
823
+   checkTrue(is.na(tbl.anno$pubmedID[3]))
824
+
825
+      # now check motifs with short, traditional names, in "TFClass" mode, which uses
826
+      # the tfFamily
827
+      #      "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2"
828
+
829
+   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
830
+   tbl <- data.frame(motif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
831
+
832
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motif", source="TFClass", expand.rows=FALSE)
833
+   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
834
+
835
+      # now add in a bogus motif name, one for which there cannot possibly be a TF
836
+
837
+   motif.names <- names(pfms[1:5])
838
+   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
839
+   short.motif.names[3] <- "bogus"
840
+   tbl <- data.frame(motif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
841
+
842
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motif", source="TFClass", expand.rows=FALSE)
843
+   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
844
+
845
+} # test.associateTranscriptionFactors
846
+#------------------------------------------------------------------------------------------------------------------------
847
+#  MA0003.3 is mapped by TFClass to TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E   (pumbedID 23180794)
848
+#  use it as a first test of the "exapand.rows" argument to associateTranscriptionFactors method
849
+#  MA0017.1  is unmapped by TFClass
850
+#  MA0017.2  is mapped to NR2F1
851
+test.associateTranscriptionFactors_expandRows <- function()
852
+{
853
+   printf("--- test.assoicateTranscriptionFactors")
854
+
855
+   mdb <- MotifDb
856
+   motifs <- c("MA0003.3", "MA0017.1", "MA0017.2")
857
+   tbl <- data.frame(motif=motifs, stringsAsFactors=FALSE)
858
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif", source="TFClass", expand.rows=FALSE)
859
+   checkEquals(dim(tbl.anno), c(3,3))
860
+   checkEquals(tbl.anno$geneSymbol[c(1,3)], c("TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E", "NR2F1"))
861
+   checkTrue(is.na(tbl.anno$geneSymbol[2]))
862
+
863
+   tbl.annoX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif", source="TFClass", expand.rows=TRUE)
864
+   checkEquals(dim(tbl.annoX), c(7,3))
865
+   checkEquals(tbl.annoX$motif, c(rep("MA0003.3", 5), "MA0017.2", "MA0017.1"))
866
+   checkEquals(tbl.annoX$geneSymbol[1:6], c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E", "NR2F1"))
867
+   checkTrue(is.na(tbl.annoX$geneSymbol[7]))
868
+
869
+      # now a large scale test
870
+   set.seed(37)
871
+   mdb.human <- query(mdb, "sapiens")   # > 4000 human matrices
872
+   indices <- sample(1:length(mdb.human), size=250)
873
+   motif.long.names <- names(mdb.human)[indices]
874
+   motif.short.names <- mcols(mdb.human)[indices, "providerId"]
875
+   tbl <- data.frame(motif.short=motif.short.names, motif.long=motif.long.names, stringsAsFactors=FALSE)
876
+
877
+   tbl.anno.mdb <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.long", source="MotifDb", expand.rows=FALSE)
878
+   tbl.anno.mdbX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.long", source="MotifDb", expand.rows=TRUE)
879
+
880
+   tbl.anno.tfc  <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.short", source="TFClass", expand.rows=FALSE)
881
+   checkEquals(nrow(tbl.anno.tfc), length(motif.short.names))
882
+   checkTrue(length(grep(";", tbl.anno.tfc$geneSymbol)) > 20)
883
+
884
+   tbl.anno.tfcX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.short", source="TFClass", expand.rows=TRUE)
885
+   checkTrue(nrow(tbl.anno.tfcX) > nrow(tbl.anno.tfc))   # 250 vs 779
886
+   checkEquals(length(grep(";", tbl.anno.tfcX$geneSymbol)), 0)
887
+
888
+
889
+} # test.associateTranscriptionFactors_expandRows
768 890
 #------------------------------------------------------------------------------------------------------------------------