Browse code

added test of the gene symbol bug reported by Robert Stojnic

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75491 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 10/04/2013 00:29:11
Showing 2 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: MotifDb
2 2
 Type: Package
3 3
 Title: An Annotated Collection of Protein-DNA Binding Sequence Motifs
4
-Version: 1.3.7
4
+Version: 1.3.8
5 5
 Date: 2013-04-09
6 6
 Author: Paul Shannon
7 7
 Maintainer: Paul Shannon <pshannon@fhcrc.org>
... ...
@@ -35,7 +35,7 @@ run.tests = function ()
35 35
   test.export_memeFormatToFile_run_tomtom ()
36 36
   test.run_MotIV ()
37 37
   test.MotIV.toTable ()
38
-  
38
+  test.flyFactorGeneSymbols()
39 39
 
40 40
 } # run.tests
41 41
 #------------------------------------------------------------------------------------------------------------------------
... ...
@@ -644,5 +644,47 @@ pwmMatch.toTable = function (motifMatch) {
644 644
    }
645 645
    names(df.list) <- names(motifMatch)
646 646
    return (df.list)
647
-}
648
-#------------------------------------------------------------------------------------------------------------------------
647
+
648
+} # pwmMatch.toTable
649
+#------------------------------------------------------------------------------
650
+# Robert Stojnic reports incorrect gene symbols for matrices obtained from
651
+# flyFactorSurvey.
652
+# the solution was to abandon the original strategy of extracting the
653
+# symbol from the matrix (and file) name.
654
+# now, the flybase importer ("inst/scripts/import/flyFactorSurvey/import.R")
655
+# uses FBgn id (which can be reliably extracted) and uses independent
656
+# data sources to learn the gene symbol.
657
+#
658
+# robert's email:
659
+#  I'm working on using MotifDb motifs in my PWMEnrich package and I
660
+#  have noticed that there is a slight problem with gene symbols for
661
+#  Drosophila. In particular, the gene symbols do not always correspond
662
+#  to the gene ID and are frequently mis-capitalized. In Drosophila z
663
+#  and Z are two different genes and capitalization does matter if
664
+#  someone is to use the gene symbols. Also, in some cases the symbols
665
+#  are missing hyphens or parenthesis. I have used the gene IDs and the
666
+#  Flybase annotation database to set the correct gene symbols for
667
+#  Drosophila, please find attached the result of my re-annotation.
668
+#
669
+#  looking at his correctedMotifDbDmel.csv 
670
+#
671
+#    head(read.table("correctedMotifDbDmel.csv", sep=",", header=TRUE, stringsAsFactors=FALSE))
672
+#                  providerName oldGeneSymbol newGeneSymbol
673
+#    1 ab_SANGER_10_FBgn0259750            Ab            ab
674
+#    2  ab_SOLEXA_5_FBgn0259750            Ab            ab
675
+#    3 Abd-A_FlyReg_FBgn0000014         Abd-a         abd-A
676
+#    4 Abd-B_FlyReg_FBgn0000015         Abd-b         Abd-B
677
+#    5    AbdA_Cell_FBgn0000014          Abda         abd-A
678
+#    6  AbdA_SOLEXA_FBgn0000014          Abda         abd-A
679
+#
680
+test.flyFactorGeneSymbols <- function()
681
+{
682
+    print ("--- test.flyFactorGeneSymbols")
683
+    mdb = MotifDb
684
+    checkEquals(mcols(query(mdb, "FBgn0259750"))$geneSymbol, c("ab", "ab"))
685
+    checkEquals(mcols(query(mdb, "FBgn0000014"))$geneSymbol, rep("abd-A", 3))
686
+    checkEquals(mcols(query(mdb, "FBgn0000015"))$geneSymbol, rep("Abd-B", 3))
687
+
688
+} # test.flyFactorGeneSymbols
689
+#-------------------------------------------------------------------------------
690
+