Browse code

motifToGene sidesteps regexes by grep fixed=TRUE

paul-shannon authored on 03/10/2017 18:13:41
Showing 3 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: MotifDb
2 2
 Type: Package
3 3
 Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
-Version: 1.19.10
4
+Version: 1.19.11
5 5
 Date: 2017-10-03
6 6
 Author: Paul Shannon, Matt Richards
7 7
 Maintainer: Paul Shannon <pshannon@systemsbiology.org>
... ...
@@ -353,14 +353,17 @@ setMethod ('motifToGene', 'MotifList',
353 353
      tbl <- data.frame()
354 354
      if(source %in% c("motifdb")){
355 355
         tbl <- as.data.frame(subset(mcols(object), providerId %in% motifs))
356
+        if(nrow(tbl) == 0)
357
+           return(data.frame())
356 358
         tbl <- unique(tbl [, c("geneSymbol", "providerId", "dataSource", "organism", "pubmedID")])
357 359
         colnames(tbl) <- c("geneSymbol", "motif", "dataSource", "organism", "pubmedID")
358
-        tbl
359 360
         tbl <- tbl[, c("motif", "geneSymbol", "dataSource", "organism", "pubmedID")]
360 361
         tbl$from <- "MotifDb"
361 362
         }
362 363
      if(source %in% c("tfclass")){
363 364
         tbl <- subset(object@manuallyCuratedGeneMotifAssociationTable, motif %in% motifs)
365
+        if(nrow(tbl) == 0)
366
+           return(data.frame())
364 367
         tbl <- unique(tbl[, c("motif", "tf.gene", "pubmedID")])
365 368
         tbl <- tbl[order(tbl$motif),]
366 369
         rownames(tbl) <- NULL
... ...
@@ -424,9 +427,11 @@ setMethod('associateTranscriptionFactors', 'MotifList',
424 427
            }
425 428
         tbl.tfClass <- read.table(system.file(package="MotifDb", "extdata", "tfClass.tsv"), sep="\t", as.is=TRUE, header=TRUE)
426 429
         motif.ids <- tbl.withMotifs[, "shortMotif"]
427
-        geneSymbols <- lapply(motif.ids, function(id) paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif)], collapse=";"))
430
+        geneSymbols <- lapply(motif.ids, function(id)
431
+                                 paste(tbl.tfClass$tf.gene[grep(id, tbl.tfClass$motif, fixed=TRUE)], collapse=";"))
428 432
         geneSymbols <- unlist(geneSymbols)
429
-        pubmedIds   <- lapply(motif.ids, function(id) unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif)]))
433
+        pubmedIds   <- lapply(motif.ids, function(id)
434
+                                 unique(tbl.tfClass$pubmedID[grep(id, tbl.tfClass$motif, fixed=TRUE)]))
430 435
         pubmedIds   <- as.character(pubmedIds)
431 436
         pubmedIds   <- gsub("integer(0)", "", pubmedIds, fixed=TRUE)
432 437
         tbl.new     <- data.frame(geneSymbol=geneSymbols, pubmedID=pubmedIds, stringsAsFactors=FALSE)
... ...
@@ -772,19 +772,19 @@ test.geneToMotif <- function()
772 772
       # MotifDb for ATF5
773 773
       # todo: compare the MA0110596_1.02 matrix of cisbp_1.02 to jaspar MA0833.1
774 774
 
775
-    # now try motifs to genes
776
-
777 775
 } # test.geneToMotif
778 776
 #------------------------------------------------------------------------------------------------------------------------
779 777
 test.motifToGene <- function()
780 778
 {
781 779
    printf("--- test.motifToGene")
782
-   mdb <- MotifDb
783 780
 
784 781
    motifs <- c("MA0592.2", "UP00022", "ELF1.SwissRegulon")
785 782
 
786
-      # TFClass mode uses  TF family classifcation
787
-   tbl.d <- motifToGene(mdb, motifs, source="MotifDb")
783
+   set.seed(31);
784
+   motifs.long <- names(MotifDb)[sample(1:length(MotifDb), 10)]
785
+
786
+      # MotifDb mode uses the MotifDb metadata, pulled from many sources
787
+   tbl.d <- motifToGene(MotifDb, motifs, source="MotifDb")
788 788
    checkEquals(dim(tbl.d), c(3, 6))
789 789
    checkEquals(tbl.d$motif, c("MA0592.2", "ELF1.SwissRegulon", "UP00022"))
790 790
    checkEquals(tbl.d$geneSymbol, c("Esrra", "ELF1", "Zfp740"))
... ...
@@ -792,14 +792,26 @@ test.motifToGene <- function()
792 792
    checkEquals(tbl.d$organism,   c("Mmusculus", "Hsapiens", "Mmusculus"))
793 793
    checkEquals(tbl.d$from,       rep("MotifDb", 3))
794 794
 
795
-      # MotifDb mode uses the MotifDb metadata, pulled from many sources
796
-   tbl.i <- motifToGene(mdb, motifs, source="TFClass")
795
+
796
+      # TFClass mode uses TF family classification
797
+   tbl.i <- motifToGene(MotifDb, motifs, source="TFClass")
797 798
    checkEquals(dim(tbl.i), c(9,4))
798 799
    checkEquals(tbl.i$motif, rep("MA0592.2", 9))
799 800
    checkEquals(sort(tbl.i$gene), c("AR", "ESR1", "ESR2", "ESRRA", "ESRRB", "ESRRG", "NR3C1", "NR3C2", "PGR"))
800 801
    checkEquals(tbl.i$from,       rep("TFClass", 9))
801 802
 
802
-} # test.geneToMotif
803
+     # test motifs with regex characters in them, or other characters neither letter nor number
804
+   motifs <- sort(c("DMAP1_NCOR{1,2}_SMARC.p2", "ELK1,4_GABP{A,B1}.p3", "SNAI1..3.p2", "EWSR1-FLI1.p2", "ETS1,2.p2"))
805
+   tbl <- motifToGene(MotifDb, motifs, source="MotifDb")
806
+   checkEquals(nrow(tbl), 0)
807
+
808
+   tbl <- motifToGene(MotifDb, motifs, source="tfclass")
809
+   checkEquals(ncol(tbl), 4)
810
+   checkTrue(nrow(tbl) > 80)
811
+   checkTrue(nrow(tbl) < 100)
812
+   checkTrue(all(motifs %in% tbl$motif))
813
+
814
+} # test.motifToGene
803 815
 #------------------------------------------------------------------------------------------------------------------------
804 816
 test.associateTranscriptionFactors <- function()
805 817
 {