Browse code

now annotates to TFs properly for MotifDb and TFClass sources

paul-shannon authored on 12/09/2017 20:20:46
Showing 3 changed files

... ...
@@ -1,8 +1,8 @@
1 1
 Package: MotifDb
2 2
 Type: Package
3 3
 Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
-Version: 1.19.6
5
-Date: 2017-09-07
4
+Version: 1.19.7
5
+Date: 2017-09-12
6 6
 Author: Paul Shannon, Matt Richards
7 7
 Maintainer: Paul Shannon <pshannon@systemsbiology.org>
8 8
 Depends: R (>= 2.15.0), methods, BiocGenerics, S4Vectors, IRanges, Biostrings
... ...
@@ -3,8 +3,7 @@ setGeneric('query', signature='object', function(object, queryString, ignore.cas
3 3
 setGeneric('motifToGene', signature='object', function(object, motifs, source) standardGeneric('motifToGene'))
4 4
 setGeneric('geneToMotif', signature='object', function(object, geneSymbols, source) standardGeneric('geneToMotif'))
5 5
 setGeneric('associateTranscriptionFactors', signature='object',
6
-           function(object, tbl.withMotifs, motif.column.name, source, expand.rows)
7
-              standardGeneric('associateTranscriptionFactors'))
6
+           function(object, tbl.withMotifs,  source, expand.rows) standardGeneric('associateTranscriptionFactors'))
8 7
 #------------------------------------------------------------------------------------------------------------------------
9 8
 setClass ('MotifList',
10 9
           contains='SimpleList',
... ...
@@ -400,23 +399,25 @@ setMethod ('geneToMotif', 'MotifList',
400 399
 #-------------------------------------------------------------------------------
401 400
 setMethod('associateTranscriptionFactors', 'MotifList',
402 401
 
403
-   function(object, tbl.withMotifs, motif.column.name, source, expand.rows){
402
+   function(object, tbl.withMotifs, source, expand.rows){
404 403
      stopifnot(source %in% c("MotifDb", "TFClass"))
405 404
      tbl.out <- data.frame()
406 405
      if(source %in% c("MotifDb")){
407
-        # "direct" means:  lookup up in the object metadata, expect one TF geneSymbol per matrix name
408
-        pfm.ids <- tbl.withMotifs[, motif.column.name]
406
+           # lookup up in the object metadata, expect one TF geneSymbol per matrix name
407
+        pfm.ids <- tbl.withMotifs[, "motifName"]
409 408
         matched.rows <- match(pfm.ids, names(as.list(object)))
410 409
         #if(length(matched.rows) == nrow(tbl.withMotifs)) {
411 410
         tbl.new <- mcols(object)[matched.rows, c("geneSymbol", "pubmedID")]
412 411
         tbl.new$geneSymbol[nchar(tbl.new$geneSymbol)==0] <- NA
413 412
         tbl.new$pubmedID[nchar(tbl.new$pubmedID)==0] <- NA
414 413
         tbl.out <- as.data.frame(cbind(tbl.withMotifs, tbl.new))
415
-         #}
416 414
         } # direct
417 415
      if(source %in% c("TFClass")){
416
+        if(! "shortMotif" %in% colnames(tbl.withMotifs)){
417
+           stop("MotifDb::assoicateTranscriptionFactors needs a 'shortMotif' column with the TFClass source")
418
+           }
418 419
         tbl.tfClass <- read.table(system.file(package="MotifDb", "extdata", "tfClass.tsv"), sep="\t", as.is=TRUE, header=TRUE)
419
-        motif.ids <- tbl.withMotifs[, motif.column.name]
420
+        motif.ids <- tbl.withMotifs[, "shortMotif"]
420 421
         geneSymbols <- lapply(motif.ids, function(id) paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif)], collapse=";"))
421 422
         geneSymbols <- unlist(geneSymbols)
422 423
         pubmedIds   <- lapply(motif.ids, function(id) unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif)]))
... ...
@@ -809,16 +809,17 @@ test.associateTranscriptionFactors <- function()
809 809
       #      "Hsapiens-jaspar2016-RUNX1-MA0002.1"  "Hsapiens-jaspar2016-TFAP2A-MA0003.1"
810 810
 
811 811
    motif.names <- names(pfms[1:5])
812
-   tbl <- data.frame(motifLongName=motif.names, score=runif(5), stringsAsFactors=FALSE)
813
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motifLongName", source="MotifDb", expand.rows=FALSE)
812
+   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
813
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
814 814
    checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
815 815
    checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno)))
816
+   checkEquals(tbl.anno$geneSymbol, c("RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "AR"))
816 817
 
817 818
       # now add in a bogus motif name, one for which there cannot possibly be a TF
818 819
 
819 820
    motif.names[3] <- "bogus"
820
-   tbl <- data.frame(motifLongName=motif.names, score=runif(5), stringsAsFactors=FALSE)
821
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motifLongName", source="MotifDb", expand.rows=FALSE)
821
+   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
822
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
822 823
    checkTrue(is.na(tbl.anno$geneSymbol[3]))
823 824
    checkTrue(is.na(tbl.anno$pubmedID[3]))
824 825
 
... ...
@@ -826,65 +827,42 @@ test.associateTranscriptionFactors <- function()
826 827
       # the tfFamily
827 828
       #      "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2"
828 829
 
830
+   motif.names <- names(pfms[1:5])
829 831
    short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
830
-   tbl <- data.frame(motif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
832
+   tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
831 833
 
832
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motif", source="TFClass", expand.rows=FALSE)
834
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
833 835
    checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
834 836
 
837
+      # TFClass only annotates MA0003.3, none of the others
838
+   checkTrue(all(is.na(tbl.anno$geneSymbol[-4])))
839
+   checkTrue(all(is.na(tbl.anno$pubmedID[-4])))
840
+   checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E")
841
+   checkEquals(tbl.anno$pubmedID[4],   "23180794")
842
+
843
+      # now ask for expandsion of the semicolon separated list
844
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
845
+   checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2))
846
+   checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol))
847
+
835 848
       # now add in a bogus motif name, one for which there cannot possibly be a TF
836 849
 
837 850
    motif.names <- names(pfms[1:5])
838 851
    short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
839
-   short.motif.names[3] <- "bogus"
840
-   tbl <- data.frame(motif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
852
+   short.motif.names[4] <- "bogus"
853
+   tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
841 854
 
842
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, "motif", source="TFClass", expand.rows=FALSE)
855
+   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
843 856
    checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
857
+      # after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA
858
+   checkTrue(all(is.na(tbl.anno$geneSymbol)))
859
+   checkTrue(all(is.na(tbl.anno$pubmedID)))
844 860
 
845
-} # test.associateTranscriptionFactors
846
-#------------------------------------------------------------------------------------------------------------------------
847
-#  MA0003.3 is mapped by TFClass to TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E   (pumbedID 23180794)
848
-#  use it as a first test of the "exapand.rows" argument to associateTranscriptionFactors method
849
-#  MA0017.1  is unmapped by TFClass
850
-#  MA0017.2  is mapped to NR2F1
851
-test.associateTranscriptionFactors_expandRows <- function()
852
-{
853
-   printf("--- test.assoicateTranscriptionFactors")
861
+      # now make sure that the absence of the  TFClass-specific "shortMotif" field is detected
862
+   motif.names <- names(pfms[1:5])
863
+   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
864
+   checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE)
854 865
 
855
-   mdb <- MotifDb
856
-   motifs <- c("MA0003.3", "MA0017.1", "MA0017.2")
857
-   tbl <- data.frame(motif=motifs, stringsAsFactors=FALSE)
858
-   tbl.anno <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif", source="TFClass", expand.rows=FALSE)
859
-   checkEquals(dim(tbl.anno), c(3,3))
860
-   checkEquals(tbl.anno$geneSymbol[c(1,3)], c("TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E", "NR2F1"))
861
-   checkTrue(is.na(tbl.anno$geneSymbol[2]))
862
-
863
-   tbl.annoX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif", source="TFClass", expand.rows=TRUE)
864
-   checkEquals(dim(tbl.annoX), c(7,3))
865
-   checkEquals(tbl.annoX$motif, c(rep("MA0003.3", 5), "MA0017.2", "MA0017.1"))
866
-   checkEquals(tbl.annoX$geneSymbol[1:6], c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E", "NR2F1"))
867
-   checkTrue(is.na(tbl.annoX$geneSymbol[7]))
868
-
869
-      # now a large scale test
870
-   set.seed(37)
871
-   mdb.human <- query(mdb, "sapiens")   # > 4000 human matrices
872
-   indices <- sample(1:length(mdb.human), size=250)
873
-   motif.long.names <- names(mdb.human)[indices]
874
-   motif.short.names <- mcols(mdb.human)[indices, "providerId"]
875
-   tbl <- data.frame(motif.short=motif.short.names, motif.long=motif.long.names, stringsAsFactors=FALSE)
876
-
877
-   tbl.anno.mdb <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.long", source="MotifDb", expand.rows=FALSE)
878
-   tbl.anno.mdbX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.long", source="MotifDb", expand.rows=TRUE)
879
-
880
-   tbl.anno.tfc  <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.short", source="TFClass", expand.rows=FALSE)
881
-   checkEquals(nrow(tbl.anno.tfc), length(motif.short.names))
882
-   checkTrue(length(grep(";", tbl.anno.tfc$geneSymbol)) > 20)
883
-
884
-   tbl.anno.tfcX <- associateTranscriptionFactors(mdb, tbl, motif.column.name="motif.short", source="TFClass", expand.rows=TRUE)
885
-   checkTrue(nrow(tbl.anno.tfcX) > nrow(tbl.anno.tfc))   # 250 vs 779
886
-   checkEquals(length(grep(";", tbl.anno.tfcX$geneSymbol)), 0)
887
-
888
-
889
-} # test.associateTranscriptionFactors_expandRows
866
+
867
+} # test.associateTranscriptionFactors
890 868
 #------------------------------------------------------------------------------------------------------------------------