Browse code

associateTranscriptionFactors greatly simplified, builds upon motifToGene

paul-shannon authored on 18/05/2018 23:22:10
Showing3 changed files

... ...
@@ -1,14 +1,14 @@
1 1
 Package: MotifDb
2 2
 Type: Package
3 3
 Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
-Version: 1.21.6
4
+Version: 1.21.7
5 5
 Date: 2018-05-18
6 6
 Author: Paul Shannon, Matt Richards
7 7
 Maintainer: Paul Shannon <pshannon@systemsbiology.org>
8 8
 Depends: R (>= 2.15.0), methods, BiocGenerics, S4Vectors, IRanges, Biostrings
9 9
 Suggests: RUnit, seqLogo, MotIV
10 10
 Imports: rtracklayer, splitstackshape
11
-Description: More than 9000 annotated position frequency matrices from 14 public sources, for multiple organisms.
11
+Description: More than 9900 annotated position frequency matrices from 14 public sources, for multiple organisms.
12 12
 License: Artistic-2.0 | file LICENSE
13 13
 License_is_FOSS: no
14 14
 License_restricts_use: yes
... ...
@@ -475,6 +475,9 @@ setMethod ('motifToGene', 'MotifList',
475 475
            }
476 476
         }
477 477
       tbl.out <- rbind(tbl.mdb, tbl.tfc)
478
+      dups <- which(duplicated(tbl.out[, c("motif", "geneSymbol", "organism", "source")]))
479
+      if(length(dups) > 0)
480
+         tbl.out <- tbl.out[-dups,]
478 481
       if(length(name.map) > 0)
479 482
          tbl.out$motif <- as.character(name.map[tbl.out$motif])
480 483
       #browser()
... ...
@@ -525,53 +528,62 @@ setMethod ('geneToMotif', 'MotifList',
525 528
 #-------------------------------------------------------------------------------
526 529
 setMethod('associateTranscriptionFactors', 'MotifList',
527 530
 
528
-   function(object, tbl.withMotifs, source, expand.rows){
529
-     source <- tolower(source)
530
-     stopifnot(source %in% c("motifdb", "tfclass"))
531
-     tbl.out <- data.frame()
532
-     if(source %in% c("motifdb")){
533
-           # lookup up in the object metadata, expect one TF geneSymbol per matrix name
534
-        pfm.ids <- tbl.withMotifs[, "motifName"]
535
-        matched.rows <- match(pfm.ids, names(as.list(object)))
536
-        #if(length(matched.rows) == nrow(tbl.withMotifs)) {
537
-        tbl.new <- mcols(object)[matched.rows, c("geneSymbol", "pubmedID")]
538
-        tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
539
-        tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
540
-        tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
541
-        } # direct
542
-     if(source %in% c("tfclass")){
543
-        if(! "shortMotif" %in% colnames(tbl.withMotifs)){
544
-           stop("MotifDb::assoicateTranscriptionFactors needs a 'shortMotif' column with the TFClass source")
545
-           }
546
-        tbl.tfClass <- read.table(system.file(package="MotifDb", "extdata", "tfClass.tsv"), sep="\t", as.is=TRUE, header=TRUE)
547
-        motif.ids <- tbl.withMotifs[, "shortMotif"]
548
-        geneSymbols <- lapply(motif.ids, function(id)
549
-                                 paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif, fixed=TRUE)], collapse=";"))
550
-        geneSymbols <- unlist(geneSymbols)
551
-        pubmedIds   <- lapply(motif.ids, function(id)
552
-                                 unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif, fixed=TRUE)]))
553
-        pubmedIds   <- as.character(pubmedIds)
554
-        pubmedIds   <- gsub("integer(0)", "", pubmedIds, fixed=TRUE)
555
-        tbl.new     <- data.frame(geneSymbol=geneSymbols, pubmedID=pubmedIds, stringsAsFactors=FALSE)
556
-        tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
557
-        tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
558
-        tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
559
-
560
-        if(expand.rows){
561
-           rows.with.na <- which(is.na(tbl.out$geneSymbol))
562
-           rows.with.geneSymbol <- setdiff(1:nrow(tbl.out), rows.with.na)
563
-           tbl.asIs <- tbl.out[rows.with.na,]
564
-           tbl.toExpand <- tbl.out[rows.with.geneSymbol,]
565
-           geneSymbols.split <- strsplit(tbl.toExpand$geneSymbol, ";")
566
-           counts <- unlist(lapply(geneSymbols.split, length))
567
-           geneSymbols.split.vec <- unlist(geneSymbols.split)
568
-           tbl.expanded <- splitstackshape::expandRows(tbl.toExpand, counts, count.is.col=FALSE, drop=FALSE)
569
-           stopifnot(length(geneSymbols.split.vec) == nrow(tbl.expanded))
570
-           tbl.expanded$geneSymbol <- geneSymbols.split.vec
571
-           tbl.out <- rbind(tbl.expanded, tbl.asIs)
572
-           }
573
-        } # indirect
574
-     tbl.out
575
-     })
531
+     function(object, tbl.withMotifs, source, expand.rows){
532
+        stopifnot("motifName" %in% colnames(tbl.withMotifs))
533
+        tbl.tf <- motifToGene(object, tbl.withMotifs$motifName, source)
534
+        merge(tbl.withMotifs, tbl.tf, by.x="motifName", by.y="motif", all.x=TRUE)
535
+        })
576 536
 
577 537
 #-------------------------------------------------------------------------------
538
+# setMethod('associateTranscriptionFactors', 'MotifList',
539
+#
540
+#    function(object, tbl.withMotifs, source, expand.rows){
541
+#      source <- tolower(source)
542
+#      stopifnot(source %in% c("motifdb", "tfclass"))
543
+#      tbl.out <- data.frame()
544
+#      if(source %in% c("motifdb")){
545
+#            # lookup up in the object metadata, expect one TF geneSymbol per matrix name
546
+#         pfm.ids <- tbl.withMotifs[, "motifName"]
547
+#         matched.rows <- match(pfm.ids, names(as.list(object)))
548
+#         #if(length(matched.rows) == nrow(tbl.withMotifs)) {
549
+#         tbl.new <- mcols(object)[matched.rows, c("geneSymbol", "pubmedID")]
550
+#         tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
551
+#         tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
552
+#         tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
553
+#         } # direct
554
+#      if(source %in% c("tfclass")){
555
+#         if(! "shortMotif" %in% colnames(tbl.withMotifs)){
556
+#            stop("MotifDb::assoicateTranscriptionFactors needs a 'shortMotif' column with the TFClass source")
557
+#            }
558
+#         tbl.tfClass <- read.table(system.file(package="MotifDb", "extdata", "tfClass.tsv"), sep="\t", as.is=TRUE, header=TRUE)
559
+#         motif.ids <- tbl.withMotifs[, "shortMotif"]
560
+#         geneSymbols <- lapply(motif.ids, function(id)
561
+#                                  paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif, fixed=TRUE)], collapse=";"))
562
+#         geneSymbols <- unlist(geneSymbols)
563
+#         pubmedIds   <- lapply(motif.ids, function(id)
564
+#                                  unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif, fixed=TRUE)]))
565
+#         pubmedIds   <- as.character(pubmedIds)
566
+#         pubmedIds   <- gsub("integer(0)", "", pubmedIds, fixed=TRUE)
567
+#         tbl.new     <- data.frame(geneSymbol=geneSymbols, pubmedID=pubmedIds, stringsAsFactors=FALSE)
568
+#         tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
569
+#         tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
570
+#         tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
571
+#
572
+#         if(expand.rows){
573
+#            rows.with.na <- which(is.na(tbl.out$geneSymbol))
574
+#            rows.with.geneSymbol <- setdiff(1:nrow(tbl.out), rows.with.na)
575
+#            tbl.asIs <- tbl.out[rows.with.na,]
576
+#            tbl.toExpand <- tbl.out[rows.with.geneSymbol,]
577
+#            geneSymbols.split <- strsplit(tbl.toExpand$geneSymbol, ";")
578
+#            counts <- unlist(lapply(geneSymbols.split, length))
579
+#            geneSymbols.split.vec <- unlist(geneSymbols.split)
580
+#            tbl.expanded <- splitstackshape::expandRows(tbl.toExpand, counts, count.is.col=FALSE, drop=FALSE)
581
+#            stopifnot(length(geneSymbols.split.vec) == nrow(tbl.expanded))
582
+#            tbl.expanded$geneSymbol <- geneSymbols.split.vec
583
+#            tbl.out <- rbind(tbl.expanded, tbl.asIs)
584
+#            }
585
+#         } # indirect
586
+#      tbl.out
587
+#      })
588
+#
589
+#-------------------------------------------------------------------------------
... ...
@@ -896,12 +896,12 @@ test.motifToGene <- function()
896 896
 
897 897
       # MotifDb mode uses the MotifDb metadata "providerId",
898 898
    tbl.mdb <- motifToGene(MotifDb, motifs, source="MotifDb")
899
-   checkEquals(dim(tbl.mdb), c(4, 5))
900
-   expected <- sort(c("MA0592.2", "MA0592.2", "ELF1.SwissRegulon", "UP00022"))
899
+   checkEquals(dim(tbl.mdb), c(3, 5))
900
+   expected <- sort(c("MA0592.2", "ELF1.SwissRegulon", "UP00022"))
901 901
    actual <- sort(tbl.mdb$motif)
902 902
    checkEquals(actual, expected)
903
-   checkEquals(sort(tbl.mdb$geneSymbol), sort(c("ELF1", "Esrra", "Esrra", "Zfp740")))
904
-   checkEquals(tbl.mdb$source,     rep("MotifDb", 4))
903
+   checkEquals(sort(tbl.mdb$geneSymbol), sort(c("ELF1", "Esrra", "Zfp740")))
904
+   checkEquals(tbl.mdb$source,     rep("MotifDb", 3))
905 905
 
906 906
       # TFClass mode uses  TF family classifcation
907 907
    tbl.tfClass <- motifToGene(MotifDb, motifs, source="TFClass")
... ...
@@ -948,7 +948,10 @@ test.motifToGene <- function()
948 948
    checkEquals(sort(unique(tbl$motif)), c("Hsapiens-jaspar2016-TFAP2A-MA0003.3", "MA0872.1"))
949 949
    checkEquals(sort(unique(tbl$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))
950 950
 
951
-   tbl <- motifToGene(MotifDb, motifs, source="TFClass")
951
+   tbl <- motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))
952
+   checkTrue(all(motifs %in% tbl$motif))
953
+   checkEquals(sort(unique(tbl$geneSymbol)),
954
+                    c("AR", "RUNX1", "TFAP2A", "TFAP2A(var.3)", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))
952 955
 
953 956
    } # test.motifToGene
954 957
 #------------------------------------------------------------------------------------------------------------------------
... ...
@@ -958,66 +961,80 @@ test.associateTranscriptionFactors <- function()
958 961
 
959 962
    mdb <- MotifDb
960 963
    pfms <- query(mdb, andStrings=c("sapiens", "jaspar2016"))
961
-      # query(mdb, "jaspar2016", "sapiens")
962 964
 
965
+      # query(mdb, "jaspar2016", "sapiens")
963 966
       # first check motifs with MotifDb-style long names, using MotifDb lookup, in the
964 967
       # metadata of MotifDb:
965 968
       #      "Hsapiens-jaspar2016-RUNX1-MA0002.1"  "Hsapiens-jaspar2016-TFAP2A-MA0003.1"
966 969
 
967
-   motif.names <- names(pfms[1:5])
968
-   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
969
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
970
-   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
971
-   checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno)))
972
-   checkEquals(sort(tbl.anno$geneSymbol), sort(c("RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "AR")))
973
-
974
-      # now add in a bogus motif name, one for which there cannot possibly be a TF
975
-
976
-   motif.names[3] <- "bogus"
977
-   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
978
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
979
-   checkTrue(is.na(tbl.anno$geneSymbol[3]))
980
-   checkTrue(is.na(tbl.anno$pubmedID[3]))
981
-
982
-      # now check motifs with short, traditional names, in "TFClass" mode, which uses
983
-      # the tfFamily
984
-      #      "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2"
985
-
986
-   motif.names <- names(pfms[1:5])
987
-   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
988
-   tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
989
-
990
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
991
-   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
992
-
993
-      # TFClass only annotates MA0003.3, none of the others
994
-   checkTrue(all(is.na(tbl.anno$geneSymbol[-4])))
995
-   checkTrue(all(is.na(tbl.anno$pubmedID[-4])))
996
-   checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E")
997
-   checkEquals(tbl.anno$pubmedID[4],   "23180794")
998
-
999
-      # now ask for expandsion of the semicolon separated list
1000
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
1001
-   checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2))
1002
-   checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol))
1003
-
1004
-      # now add in a bogus motif name, one for which there cannot possibly be a TF
1005
-
1006
-   motif.names <- names(pfms[1:5])
1007
-   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
1008
-   short.motif.names[4] <- "bogus"
1009
-   tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
1010
-
1011
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
1012
-   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
1013
-      # after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA
1014
-   checkTrue(all(is.na(tbl.anno$geneSymbol)))
1015
-   checkTrue(all(is.na(tbl.anno$pubmedID)))
1016
-
1017
-      # now make sure that the absence of the  TFClass-specific "shortMotif" field is detected
1018
-   motif.names <- names(pfms[1:5])
1019
-   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
1020
-   checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE)
970
+   motif.names <- c(names(pfms[1:5]), "MA0872.1", "hocus.pocus")
971
+   tbl <- data.frame(motifName=motif.names, score=runif(7), stringsAsFactors=FALSE)
972
+
973
+   tbl.anno.mdb <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=TRUE)
974
+   checkEquals(nrow(tbl), nrow(tbl.anno.mdb))
975
+   checkTrue(is.na(tbl.anno.mdb$geneSymbol[grep("hocus.pocus", tbl.anno.mdb$motifName)]))
976
+   checkTrue(all(c("AR", "RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "TFAP2A(var.3)") %in% tbl.anno.mdb$geneSymbol))
977
+
978
+   tbl.anno.tfc <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
979
+   checkTrue(nrow(tbl) < nrow(tbl.anno.tfc))
980
+   checkEquals(sort(unique(tbl.anno.tfc$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))
981
+
982
+   tbl.anno.both <- associateTranscriptionFactors(mdb, tbl, source=c("MotifDb", "TFClass"), expand.rows=TRUE)
983
+   checkEquals(length(grep("MotifDb", tbl.anno.both$source)), 6)
984
+   checkEquals(length(grep("TFClass", tbl.anno.both$source)), 10)
985
+
986
+   #   checkEquals(dim(tbl.anno.mdb), c(nrow(tbl), ncol(tbl) + 4))
987
+   #   checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno.mdb)))
988
+   #   checkEquals(sort(tbl.anno.mdb$geneSymbol),
989
+   #               sort(c("AR", "RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "TFAP2A(var.3)")))
990
+   #
991
+   #      # now add in a bogus motif name, one for which there cannot possibly be a TF
992
+   #
993
+   #   motif.names[3] <- "bogus"
994
+   #   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
995
+   #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
996
+   #   checkTrue(is.na(tbl.anno$geneSymbol[3]))
997
+   #   checkTrue(is.na(tbl.anno$pubmedID[3]))
998
+   #
999
+   #      # now check motifs with short, traditional names, in "TFClass" mode, which uses
1000
+   #      # the tfFamily
1001
+   #      #      "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2"
1002
+   #
1003
+   #   motif.names <- names(pfms[1:5])
1004
+   #   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
1005
+   #   tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
1006
+   #
1007
+   #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
1008
+   #   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
1009
+   #
1010
+   #      # TFClass only annotates MA0003.3, none of the others
1011
+   #   checkTrue(all(is.na(tbl.anno$geneSymbol[-4])))
1012
+   #   checkTrue(all(is.na(tbl.anno$pubmedID[-4])))
1013
+   #   checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E")
1014
+   #   checkEquals(tbl.anno$pubmedID[4],   "23180794")
1015
+   #
1016
+   #      # now ask for expandsion of the semicolon separated list
1017
+   #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
1018
+   #   checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2))
1019
+   #   checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol))
1020
+   #
1021
+   #      # now add in a bogus motif name, one for which there cannot possibly be a TF
1022
+   #
1023
+   #   motif.names <- names(pfms[1:5])
1024
+   #   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
1025
+   #   short.motif.names[4] <- "bogus"
1026
+   #   tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
1027
+   #
1028
+   #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
1029
+   #   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
1030
+   #      # after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA
1031
+   #   checkTrue(all(is.na(tbl.anno$geneSymbol)))
1032
+   #   checkTrue(all(is.na(tbl.anno$pubmedID)))
1033
+   #
1034
+   #      # now make sure that the absence of the  TFClass-specific "shortMotif" field is detected
1035
+   #   motif.names <- names(pfms[1:5])
1036
+   #   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
1037
+   #   checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE)
1021 1038
 
1022 1039
       # now some motif names
1023 1040
 } # test.associateTranscriptionFactors