inst/unitTests/test_MotifDb.R
b70c1a6e
 library (MotifDb)
 library (RUnit)
000f7c78
 library (MotIV)
b70c1a6e
 library (seqLogo)
1f060cb8
 #----------------------------------------------------------------------------------------------------
 printf <- function(...) print(noquote(sprintf(...)))
 #----------------------------------------------------------------------------------------------------
49ebd9dc
 runTests = function ()
b70c1a6e
 {
   test.emptyCtor ()
   test.nonEmptyCtor ()
   test.MotifDb.normalMode ()
   test.MotifDb.emptyMode ()
   test.allMatricesAreNormalized ()
   test.providerNames ()
   test.geneSymbols ()
2f30a7ba
   test.geneIdsAndTypes ()
b70c1a6e
   test.proteinIds ()
   test.sequenceCount ()
   test.longNames ()
   test.organisms ()
   test.bindingDomains ()
   test.experimentTypes ()
   test.tfFamilies ()
   test.bindingSequences ()
   test.flyBindingDomains ()
   test.pubmedIDs ()
   test.allFullNames ()
   test.subset ()
   test.subsetWithVariables ()
   test.query ()
   test.transformMatrixToMemeRepresentation ()
   test.matrixToMemeText ()
   test.export_memeFormatStdOut ()
   test.export_memeFormatToFile ()
   test.export_memeFormatToFileDuplication ()
   test.export_memeFormatToFile_run_tomtom ()
000f7c78
   test.MotIV.toTable ()
   test.run_MotIV.motifMatch()
77e1c264
   test.flyFactorGeneSymbols()
49ebd9dc
   test.export_jasparFormatStdOut()
   test.export_jasparFormatToFile()
b70c1a6e
 
49ebd9dc
   test.geneToMotif()
   test.motifToGene()
   test.associateTranscriptionFactors()
 
 } # runTests
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
 test.emptyCtor = function ()
 {
   print ('--- test.emptyCtor')
   motif.list = MotifDb:::MotifList ()
2def1eec
   checkEquals (length (motif.list), 0)
b70c1a6e
 
 } # test.emptyCtor
 #------------------------------------------------------------------------------------------------------------------------
 test.nonEmptyCtor = function ()
 {
   print ('--- test.nonEmptyCtor')
   mtx = matrix (runif (20), nrow=4, ncol=5, byrow=T, dimnames=list(c ('A', 'C', 'G', 'T'), as.character (1:5)))
2def1eec
   mtx.normalized = apply (mtx, 2, function (colvector) colvector / sum (colvector))
b70c1a6e
   matrixList = list (mtx.normalized)
 
   tbl.md = data.frame (providerName='',
                        providerId='',
                        dataSource='',
                        geneSymbol='',
                        geneId='',
                        geneIdType='',
                        proteinId='',
                        proteinIdType='',
                        organism='',
                        sequenceCount='',
                        bindingSequence='',
                        bindingDomain='',
                        tfFamily='',
                        experimentType='',
2def1eec
                        pubmedID='',
b70c1a6e
                        stringsAsFactors=FALSE)
   names (matrixList) = 'test'
   rownames (tbl.md) = 'test'
   motif.list = MotifDb:::MotifList (matrixList, tbl.md)
2def1eec
   checkEquals (length (motif.list), 1)
b70c1a6e
 
 } # test.nonEmptyCtor
 #------------------------------------------------------------------------------------------------------------------------
 # 'normal' in that all included data sources are already loaded
 test.MotifDb.normalMode = function ()
 {
   print ('--- test.MotifDb.normalMode')
 
   mdb = MotifDb #  (quiet=TRUE)
     # (5 jun 2012)
     # JASPAR_CORE: 459
     # ScerTF: 196
     # UniPROBE: 380
     # FlyFactorSurvey: 614
     # hPDI: 437
   checkTrue (length (mdb) > 2080)
 
 } # test.MotifDb.normalMode
 #------------------------------------------------------------------------------------------------------------------------
2def1eec
 # this mode is not intended for users, but may see use in the future.
b70c1a6e
 test.MotifDb.emptyMode = function ()
 {
   print ('--- test.MotifDb.emptyMode')
 
   mdb = MotifDb:::.MotifDb (loadAllSources=FALSE, quiet=TRUE)
2def1eec
   checkTrue (length (mdb) == 0)
b70c1a6e
 
 } # test.MotifDb.emptyMode
 #------------------------------------------------------------------------------------------------------------------------
 # 3 jaspar matrices arrive with organism = NA.  find and fix.  make sure here.
 # NA-JASPAR_CORE-TBP-MA0108.2: JASPAR gives <NA> for speciesID
 # NA-JASPAR_CORE-HNF4A-MA0114.1: JASPAR gives <NA> for speciesID
 # NA-JASPAR_CORE-CEBPA-MA0102.2: JASPAR gives '-' for speciesID, website says 'vertebrates'
 
8ff73793
 # Many more NA's exist...need to fix these; here's a quick fix for now
fa5fbc94
 
b70c1a6e
 test.noNAorganisms = function ()
 
 {
   print ('--- test.noNAorganisms')
8ff73793
   #checkEquals (which (is.na (mcols(MotifDb)$organism)), integer (0))
1f060cb8
 
   # There's a fair number of NA organisms, mostly due to including the homer DB
   checkEquals(sum(is.na (mcols(MotifDb)$organism)), 366)
b70c1a6e
 
 } # test.noNAorganisms
 #------------------------------------------------------------------------------------------------------------------------
 test.allMatricesAreNormalized = function ()
 {
   print ('--- test.allMatricesAreNormalized')
   mdb = MotifDb# (quiet=TRUE)
   matrices = mdb@listData
6666b3f4
     # a lenient test required by "Cparvum-UniPROBE-Cgd2_3490.UP00395" and  "Hsapiens-UniPROBE-Sox4.UP00401"
     # for reasons not yet explored.  10e-8 should be be possible
a84b875e
   checkTrue(all(sapply(matrices, function (m) all (abs (colSums (m) - 1.0) < 0.02))))
2def1eec
 
b70c1a6e
 } # test.allMatricesAreNormalized
 #------------------------------------------------------------------------------------------------------------------------
 test.providerNames = function ()
 {
   print ('--- test.getProviderNames')
   mdb = MotifDb # ()
fe92bd53
   pn = mcols(mdb)$providerName
b70c1a6e
   checkEquals (length (which (is.na (pn))), 0)
   checkEquals (length (which (pn == '')), 0)
 
 } # test.providerNames
 #------------------------------------------------------------------------------------------------------------------------
 test.geneSymbols = function ()
 {
   print ('--- test.getGeneSymbols')
   mdb = MotifDb # ()
fe92bd53
   syms = mcols(mdb)$geneSymbol
6666b3f4
   checkEquals (length (which (is.na (syms))), 683)  # no symols yet for the dgf stamlab motifs
b70c1a6e
   checkEquals (length (which (syms == '')), 0)
 
 } # test.geneSymbols
 #------------------------------------------------------------------------------------------------------------------------
 test.geneIdsAndTypes = function ()
 {
   print ('--- test.getGeneIdsAndTypes')
17abb8b8
   mdb = MotifDb
dc17ee90
   tbl <- mcols(mdb)
   geneIds = tbl$geneId
   geneIdTypes = tbl$geneIdType
17abb8b8
   typeCounts = as.list (table (geneIdTypes))
dc17ee90
 
2f30a7ba
   checkTrue(typeCounts$ENTREZ > 2300)
   checkTrue(typeCounts$FLYBASE >= 45)
   checkTrue(typeCounts$SGD >= 600)
   checkTrue(nrow(subset(tbl, is.na(geneIdType))) > 2000)
2def1eec
 
b70c1a6e
   empty.count = length (which (geneIds == ''))
   checkEquals (empty.count, 0)
 
 
 } # test.geneIdsAndTypes
 #------------------------------------------------------------------------------------------------------------------------
 # make sure that all proteinIds have explicit values, either proper identifiers or NA
 # currently tested by looking for empty string assignments
 test.proteinIds = function ()
 {
   print ('--- test.proteinIds')
   mdb = MotifDb # (quiet=TRUE)
8ff73793
   NA.string.count <- sum(is.na(mcols(mdb)$proteinId))
 #  NA.string.count = length (grep ('NA', mcols(mdb)$proteinId))
 
   checkEquals(NA.string.count, 2514)
   # FIX THIS; Currently 2514 don't have protein IDs
   #checkEquals (NA.string.count, 0)
2def1eec
 
fe92bd53
   empty.count = length (which (mcols(mdb)$proteinId==""))
b70c1a6e
   if (empty.count > 0)
2def1eec
     browser ('test.proteinIds')
b70c1a6e
 
   checkEquals (empty.count, 0)
 
      # FlyFactorSurvey, as digested by me, had a blanket assigment of UNIPROT to all proteinIds
      # Herve' pointed out that this applied also to entries with no proteinId.
      # make sure this is fixed
 
8ff73793
   ### FIX THIS TOO! Currently have 913 entries with a proteinIdType and no proteinId
fe92bd53
   x = mcols(mdb)
8ff73793
   # checkEquals (nrow (subset (x, !is.na (proteinIdType) & is.na (proteinId))), 0)
2def1eec
 
b70c1a6e
 
 } # test.proteinIds
 #------------------------------------------------------------------------------------------------------------------------
 # only for UniPROBE do we not have sequence count.  might be possible to get them along with 'insertion sequences'
 test.sequenceCount = function ()
 {
   print ('--- test.sequenceCount')
   mdb = MotifDb # ()
fe92bd53
   x = mcols(mdb)
b70c1a6e
   if (interactive ()) {
     x.up = subset (x, dataSource == 'UniPROBE')
     checkTrue (all (is.na (x.up$sequenceCount)))
     }
   else {
     uniprobe.indices = which (x$dataSource == 'UniPROBE')
     checkTrue (all (is.na (x$sequenceCount [uniprobe.indices])))
     }
 
 } # test.sequenceCount
 #------------------------------------------------------------------------------------------------------------------------
 # make sure that a legitimate organism-dataSource-identifier is supplied for each matrix and as a rowname
 # of the corresponding DataFrame
 test.longNames = function ()
 {
   print ('--- test.longNames')
2def1eec
   mdb = MotifDb
b70c1a6e
   longNames = strsplit (names (mdb), '-')
   organisms = unique (sapply (longNames, '[', 1))
2def1eec
 
a84b875e
   dataSources = unique (lapply (longNames, '[', 2))
b70c1a6e
 
fe92bd53
   recognized.dataSources = unique (mcols(mdb)$dataSource)
   recognized.organisms = unique (mcols(mdb)$organism)
b70c1a6e
     # a few (3) matrices from JASPAR core have NA organism.  make this into a character
     # so that it can be matched up against the 'NA' extracted from longNames just above
   na.indices = which (is.na (recognized.organisms))
   if (length (na.indices) > 0)
a84b875e
      recognized.organisms [na.indices] = 'NA'
b70c1a6e
 
   checkTrue (all (organisms %in% recognized.organisms))
   checkTrue (all (dataSources %in% recognized.dataSources))
 
 } # test.longNames
 #------------------------------------------------------------------------------------------------------------------------
 # make sure that a legitimate organism is specified for each matrix
 test.organisms = function ()
 {
   print ('--- test.organisms')
   mdb = MotifDb # (quiet=TRUE)
fe92bd53
   organisms = mcols(mdb)$organism
b70c1a6e
 
      # jaspar_core has 3 NA speciesId: TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2)
      # their website shows these as vertebrates, which I map to 'Vertebrata'.  An organismID of '-'
fa5fbc94
   # gets the same treatment, matching website also.
 
8ff73793
   ### Note: this failing test is the same as the test.noNAorganisms test!
fa5fbc94
   # As in case of noNA, need to add organisms for these
8ff73793
   #checkEquals (which (is.na (mcols(MotifDb)$organism)), integer (0))
b70c1a6e
 
fe92bd53
   empty.count = length (which (mcols(mdb)$organism==""))
b70c1a6e
   checkEquals (empty.count, 0)
 
 } # test.organisms
 #------------------------------------------------------------------------------------------------------------------------
 test.bindingDomains = function ()
 {
   print ('--- test.bindingDomains')
   mdb = MotifDb # (quiet=TRUE)
fe92bd53
   checkTrue (length (unique (mcols(mdb)$bindingDomain)) > 1)
b70c1a6e
 
 } # test.bindingDomains
 #------------------------------------------------------------------------------------------------------------------------
 test.flyBindingDomains = function ()
 {
   print ('--- test.flyBindingDomains')
 
fe92bd53
   x = mcols(MotifDb)
b70c1a6e
   tmp = as.list (head (sort (table (subset (x, organism=='Dmelanogaster')$bindingDomain), decreasing=TRUE), n=3))
 
     # these counts will likely change with a fresh load of data from FlyFactorSurvey.
 
   checkEquals (tmp$Homeobox, 212)
   checkEquals (tmp[['zf-C2H2']], 160)
5a4345d9
   checkEquals (tmp[["Helix-Turn-Helix"]], 182)
fa5fbc94
   checkEquals (length (which (is.na (subset (x, organism=='Dmelanogaster')$bindingDomain))), 301) # lots of cisbp
b70c1a6e
 
 } # test.flyBindingDomains
 #------------------------------------------------------------------------------------------------------------------------
 test.experimentTypes = function ()
 {
   print ('--- test.experimentTypes')
   mdb = MotifDb # (quiet=TRUE)
fe92bd53
   x = mcols(mdb)
b70c1a6e
   checkTrue (length (unique (x$experimentType)) >= 18)
   checkEquals (length (which (x$experimentType=='')), 0)
 
 } # test.experimentTypes
 #------------------------------------------------------------------------------------------------------------------------
 test.tfFamilies = function ()
 {
   print ('--- test.tfFamilies')
   mdb = MotifDb # (quiet=TRUE)
fe92bd53
   checkTrue (length (unique (mcols(mdb)$tfFamily)) > 1)
b70c1a6e
 
 } # test.tfFamilies
 #------------------------------------------------------------------------------------------------------------------------
 test.bindingSequences = function ()
 {
   print ('--- test.bindingSequences')
   mdb = MotifDb # (quiet=TRUE)
fe92bd53
   checkTrue (length (unique (mcols(mdb)$bindingSequence)) > 1)
b70c1a6e
 
 } # test.tfFamilies
 #------------------------------------------------------------------------------------------------------------------------
 test.pubmedIDs = function ()
 {
   print ('--- test.pubmedIDs')
fe92bd53
   x = mcols(MotifDb) # (quiet=TRUE))
b70c1a6e
   checkTrue (length (unique (x$pubmedID)) >= 139)
   checkEquals (length (which (x$pubmedID == '')), 0)
 
 } # test.pubmedIDs
 #------------------------------------------------------------------------------------------------------------------------
 # every matrix, and every corresponding metadata table row, has a name formed thus:
 #
 #    dataSource-organism-nativeGeneName
 #
 #  where nativeGeneName is how the matrix is named in the dataSource from which it comes.
 #  thus:
 #          FlyFactorSurvey-Dmelanogaster-Abd.A_FlyReg_FBgn0000014
 #          UniPROBE-Scerevisiae-Asg1-UP00350
 #          ScerTF-Scerevisiae-ABF2-badis
 #          JASPAR_CORE-Rrattus-Ar-MA0007.1
2def1eec
 #
b70c1a6e
 test.allFullNames = function ()
 {
   print ('--- test.allFullNames')
   mdb = MotifDb # (quiet=TRUE)
   matrices = mdb@listData
   fullNames = names (matrices)
 
fe92bd53
   all.dataSources = unique (mcols(mdb)$dataSource)
b70c1a6e
   checkTrue (length (all.dataSources) >= 4)
2def1eec
 
b70c1a6e
   for (source in all.dataSources) {
      this.dataSource <<- source
      matrices.by.source = subset (mdb, dataSource==this.dataSource)
      matrix.name = names (matrices.by.source)[1]
         #  FlyFactorSurvey: Dmelanogaster-FlyFactorSurvey-ab_SANGER_10_FBgn0259750
         #             hPDI: Hsapiens-hPDI-ABCF2
         #      JASPAR_CORE: JASPAR_CORE-Athaliana-AGL3-MA0001.1
         #         UniPROBE: Hsapiens-UniPROBE-Sox4.UP00401
      checkTrue (grep (this.dataSource, matrix.name) == 1)
      #printf ('%20s: %s', source, matrix.name)
      }
 
   return (TRUE)
 
 } # test.allFullNames
 #------------------------------------------------------------------------------------------------------------------------
 test.subset = function ()
 {
   if (interactive ()) {
     print ('--- test.subset')
     mdb = MotifDb
     checkTrue ('geneSymbol' %in% colnames (elementMetadata (mdb)))
     mdb.sub = subset (mdb, geneSymbol=='ABCF2')
     checkEquals (length (mdb.sub), 1)
2def1eec
     } # if interactive
 
b70c1a6e
 } # test.subset
 #------------------------------------------------------------------------------------------------------------------------
 test.subsetWithVariables = function ()
 {
   if (interactive ()) {
     print ('--- test.subsetWithVariables')
2def1eec
 
b70c1a6e
     mdb = MotifDb # ()
     checkTrue ('geneSymbol' %in% colnames (elementMetadata (mdb)))
     target.gene <<- 'ABCF2'
     mdb.sub = subset (mdb, geneSymbol==target.gene)
     checkEquals (length (mdb.sub), 1)
2def1eec
     } # if interactive
 
b70c1a6e
 } # test.subsetWithVariables
 #------------------------------------------------------------------------------------------------------------------------
 test.query = function ()
 {
   print ('--- test.query')
   mdb = MotifDb
 
     # do queries on dataSource counts match those from a contingency table?
fe92bd53
   sources.list = as.list (table (mcols(mdb)$dataSource))
b70c1a6e
   checkEquals (length (query (mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey)
   checkEquals (length (query (mdb, 'uniprobe')), sources.list$UniPROBE)
   checkEquals (length (query (mdb, 'UniPROBE')), sources.list$UniPROBE)
 
     # gene symbols which begin with 'sox' are quite common.  can we them?
     # there are currently (19 jul 2012) 18, but since this may change, our test is approximate
 
8ff73793
   # Change on 8/1/2017: increase top limit of sox entries as they've expanded
b70c1a6e
   sox.entries = query (mdb, '^sox')
   checkTrue (length (sox.entries) > 10)
8ff73793
   checkTrue (length (sox.entries) < 200)
b70c1a6e
 
     # manual inspection reveals that some of these genes have names which are all capitalized.  test that.
   checkTrue (length (query (mdb, '^sox', ignore.case=TRUE)) > length (query (mdb, '^SOX', ignore.case=FALSE)))
 
     # make sure that queries can be stacked up, and that order of call does not affect the outcome
    uniprobe.sox.matrices = query (query (mdb, 'uniprobe'), '^sox')
    sox.uniprobe.matrices = query (query (mdb, '^sox'), 'uniprobe')
 
    checkEquals (length (uniprobe.sox.matrices), length (sox.uniprobe.matrices))
 
      # an approximate count check
   checkTrue (length (uniprobe.sox.matrices) > 10)
   checkTrue (length (uniprobe.sox.matrices) < 30)
 
fe92bd53
    checkEquals (unique (mcols(uniprobe.sox.matrices)$dataSource), 'UniPROBE')
    checkEquals (unique (mcols(sox.uniprobe.matrices)$dataSource), 'UniPROBE')
    gene.symbols = sort (unique (mcols(uniprobe.sox.matrices)$geneSymbol))
2def1eec
 
dae43ecd
     # query on a string (in this case, an oddly named motif) which contains
     # regular expression characters.  no solution to this yet.
     # query uses base R's grep, in which
     #  x <- query(mdb, "ELK1,4_GABP{A,B1}.p3")
 
b70c1a6e
 } # test.query
 #------------------------------------------------------------------------------------------------------------------------
 test.transformMatrixToMemeRepresentation = function ()
 {
   print ('--- test.transformMatrixToMemeRepresentation')
   mdb = MotifDb # ()
   matrix.agl3 = subset (mdb, dataSource=='JASPAR_CORE' & organism=='Athaliana' & geneSymbol=='AGL3')[[1]]
   checkEquals (dim (matrix.agl3), c (4, 10))
 
     # a transposed frequency is needed for meme.  we store normalized frequency matrices, to just transposition is needed
   tfm = MotifDb:::transformMatrixToMemeRepresentation (matrix.agl3)
   checkEquals (dim (tfm), c (10, 4))
   checkTrue (all (round (rowSums (tfm)) == 1.0))
 
 } # test.transformMatrixToMemeRepresentation
 #------------------------------------------------------------------------------------------------------------------------
 # translate a motif matrix from MotifList into text suitable for meme and tomtom input.
 # choose the top two from UniPROBE.  they reliably have high counts, and easily distinguished normalized frequencies
 test.matrixToMemeText = function ()
 {
   print ('--- test.matrixToMemeText')
   sox4 = subset (MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4')
   rtg3 = subset (MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3')
 
   text.sox4 = MotifDb:::matrixToMemeText (sox4)
      # check these with t (sox4 [[1]])
   line1.sox4  = " 0.2457457130  0.1950426500  0.2287887620  0.3304228750"
   line14.sox4 = " 0.2821643030  0.2286132160  0.1585395830  0.3306828990"
2def1eec
 
b70c1a6e
   checkEquals (length (text.sox4), 29)
   checkEquals (text.sox4 [1], "MEME version 4")
   checkEquals (text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401")
   checkEquals (grep (line1.sox4, text.sox4), 12)
   checkEquals (grep (line14.sox4, text.sox4), 25)
 
   text.rtg3 = MotifDb:::matrixToMemeText (rtg3)
   line1.rtg3  = " 0.3935122858  0.1453016447  0.3308830322  0.1303030373"
   line20.rtg3 = " 0.2490417648  0.3966478493  0.1083586569  0.2459517291"
   checkEquals (length (text.rtg3), 35)         # 4 trailing empty lines
   checkEquals (text.rtg3 [1], "MEME version 4")
   checkEquals (text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356")
   checkEquals (grep (line1.rtg3, text.rtg3), 12)
   checkEquals (grep (line20.rtg3, text.rtg3), 31)
 
     # now call with both matrices, and see if the right results are returned
   text.both = MotifDb:::matrixToMemeText (c (sox4, rtg3))
   checkEquals (text.both [1], "MEME version 4")
   checkEquals (grep (line1.sox4, text.both), 12)
   checkEquals (grep (line14.sox4, text.both), 25)
   checkEquals (grep (line1.rtg3, text.both),  29)
   checkEquals (grep (line20.rtg3, text.both), 48)
 
 } # test.matrixToMemeText
 #------------------------------------------------------------------------------------------------------------------------
 # translate a motif matrix from MotifList into text suitable for meme and tomtom input.
 # choose the top two from UniPROBE.  they reliably have high counts, and easily distinguished normalized frequencies
 #test.matrixToMemeText_mapVersion = function ()
 #{
 #  print ('--- test.matrixToMemeText_mapVersion')
 #  sox4 = subset (MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4')
 #  rtg3 = subset (MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3')
 #
 #  text.sox4 = MotifDb:::mapped.broken.matrixToMemeText (sox4)
 #  text.rtg3 = MotifDb:::mapped.broken.matrixToMemeText (rtg3)
 #  text.both = MotifDb:::mapped.broken.matrixToMemeText (c (sox4, rtg3))
 #
 #     # check these with t (sox4 [[1]])
 #  line1.sox4  = " 0.2457457130  0.1950426500  0.2287887620  0.3304228750"
 #  line14.sox4 = " 0.2821643030  0.2286132160  0.1585395830  0.3306828990"
 #
 #  checkEquals (length (text.sox4), 29)
 #  checkEquals (text.sox4 [1], "MEME version 4")
 #  checkEquals (text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401")
 #  checkEquals (grep (line1.sox4, text.sox4), 12)
 #  checkEquals (grep (line14.sox4, text.sox4), 25)
 #
 #  line1.rtg3  = " 0.3935122858  0.1453016447  0.3308830322  0.1303030373"
 #  line20.rtg3 = " 0.2490417648  0.3966478493  0.1083586569  0.2459517291"
 #  checkEquals (length (text.rtg3), 35)         # 4 trailing empty lines
 #  checkEquals (text.rtg3 [1], "MEME version 4")
 #  checkEquals (text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356")
 #  checkEquals (grep (line1.rtg3, text.rtg3), 12)
 #  checkEquals (grep (line20.rtg3, text.rtg3), 31)
 #
 #    # now call with both matrices, and see if the right results are returned
 #  checkEquals (text.both [1], "MEME version 4")
 #  checkEquals (grep (line1.sox4, text.both), 12)
 #  checkEquals (grep (line14.sox4, text.both), 25)
 #  checkEquals (grep (line1.rtg3, text.both),  29)
 #  checkEquals (grep (line20.rtg3, text.both), 48)
 #
 #} # test.matrixToMemeText_mapVersion
 #------------------------------------------------------------------------------------------------------------------------
 test.export_memeFormatStdOut = function ()
 {
   print ('--- test.export_memeFormatStdOut')
   mdb = MotifDb # ()
   mdb.chicken = subset (mdb, organism=='Gallus')
5a4345d9
   checkEquals (length (mdb.chicken), 3)
b70c1a6e
     # text is cat-ed to stdout, so not avaialable here to check.
     # but just like print, export also returns the text invisibly.
     # so that CAN be checked.
2def1eec
 
b70c1a6e
   meme.text = export (mdb.chicken, format='meme')
   checkEquals (length (meme.text), 1)   # just one long string
   checkTrue (is.character (meme.text))
   checkTrue (nchar (meme.text) > 800)   # 1002 as of (10 aug 2012)
   return (TRUE)
 
 } # test.exportMemeFormatToStdOut
 #------------------------------------------------------------------------------------------------------------------------
 test.export_memeFormatToFile = function ()
 {
   print ('--- test.export_memeFormatToFile')
   mdb = MotifDb # ()
   mdb.chicken = subset (mdb, organism=='Gallus')
5a4345d9
   checkEquals (length (mdb.chicken), 3)
b70c1a6e
   output.file = tempfile ()
   meme.text = export (mdb.chicken, output.file, 'meme')
   retrieved = scan (output.file, what=character (0), sep='\n', quiet=TRUE)
   invisible (retrieved)
 
 } # test.exportMemeFormatToFile
 #------------------------------------------------------------------------------------------------------------------------
 test.export_memeFormatToFileDuplication = function ()
 {
   print ('--- test.export_memeFormatToFileDuplication')
   mdb = MotifDb # ()
   mdb.mouse = subset (mdb, organism=='Mmusculus')
fa5fbc94
   checkEquals (length (mdb.mouse), 1251)
b70c1a6e
   output.file = 'mouse.txt' # tempfile ()
   max = 3
   meme.text = export (mdb.mouse [1:max], output.file, 'meme')
4ff3698f
   retrieved = scan (output.file, what=character (0), sep='\n', quiet=TRUE)
b70c1a6e
   invisible (retrieved)
 
 } # test.exportMemeFormatToFileDuplication
 #------------------------------------------------------------------------------------------------------------------------
 test.export_memeFormatToFile_run_tomtom = function (max=50)
 {
   if (interactive ()) {
     print ('--- test.export_memeFormatToFile_run_tomtom')
     mdb = MotifDb
     sox4.mouse = subset (mdb, organism=='Mmusculus' & geneSymbol=='Sox4')
     all.human = subset (mdb, organism=='Hsapiens')
 
     tomtom.tmp.dir = tempfile ()
     print (tomtom.tmp.dir)
     stopifnot (dir.create (tomtom.tmp.dir))
     sox4.file.path = file.path (tomtom.tmp.dir, 'sox4.mouse.text')
     all.human.file.path = file.path (tomtom.tmp.dir,'all.human.text')
     export (sox4.mouse, sox4.file.path, 'meme')
     export (all.human, all.human.file.path, 'meme')
 
        # find similarity of motif #1 to all the motifs in mdbMany
 
a84b875e
        # cannot rely upon tomtom being present
 
     #cmd = sprintf ('tomtom -no-ssc -oc %s -verbosity 3 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10 %s %s',
     #                tomtom.tmp.dir, sox4.file.path, all.human.file.path)
     #system (cmd)
     #cmd = sprintf ('open %s/tomtom.html', tomtom.tmp.dir)
     #system (cmd)
b70c1a6e
     } # if interactive
 
 } # test.export_memeFormatToFile_run_tomtom
 #------------------------------------------------------------------------------------------------------------------------
000f7c78
 # MotIV::motifMatch fails with MotIV_1.25.0.  will look into this in September, 2015, pshannon
 test.run_MotIV.motifMatch = function ()
b70c1a6e
 {
   library (MotIV)
000f7c78
   print ('--- test.run_MotIV.motifMatch')
9ad58ff7
   mdb <- MotifDb # ()
b70c1a6e
 
9ad58ff7
   db.tmp <- mdb@listData
b70c1a6e
 
      # match motif 1 against the entire MotifDb  collection
9ad58ff7
   motif.hits <- motifMatch (db.tmp [1], database<-db.tmp)
b70c1a6e
      # the long way to extract the matrix name.  see MotIV.toTable below for more convenient way
   checkEquals (motif.hits@bestMatch[[1]]@aligns[[1]]@TF@name, names (db.tmp)[1])
 
000f7c78
      # match the last motif against all
   last <- length(db.tmp)
9ad58ff7
      # MotIV:motifMatch works differently on linux and macos.  by asking for 50 matches,
      # the search target (db.tmp[last]) is sure to be in the hit list.
   motif.hits <-  motifMatch (db.tmp [last], database=db.tmp, top=50)
   tbl.hits <- MotIV.toTable (motif.hits)
000f7c78
     # the 5 hits return should include the one we tried to match, but the MotIV search strategy
     # may not place it first
   checkTrue(names(db.tmp[last]) %in% tbl.hits$name)
b70c1a6e
   invisible (tbl.hits)
2def1eec
 
000f7c78
 } # test.run_MotIV.motifMatch
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
 MotIV.toTable = function (match)
 {
   stopifnot (length (match@bestMatch) >= 1)
   alignments = match@bestMatch[[1]]@aligns
 
   df = data.frame (stringsAsFactors=FALSE)
   for (alignment in alignments) {
     x = alignment
     name = x@TF@name
     eVal = x@evalue
     sequence = x@sequence
     match = x@match
     strand = x@strand
     df = rbind (df, data.frame (name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
     } # for alignment
 
   return (df)
 
2def1eec
 } # MotIV.toTable
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
 test.MotIV.toTable = function ()
 {
   print ('--- test.MotIVtoTable')
   mdb = MotifDb # ()
   test.hits = motifMatch (mdb[1]@listData, database=jaspar)
   tbl.hits =  MotIV.toTable (test.hits)
   checkEquals (dim (tbl.hits), c (5, 5))
07b8ea8b
   checkEquals (sort(colnames (tbl.hits)), sort(c("name", "eVal", "sequence", "match", "strand")))
b70c1a6e
 
2def1eec
 } # test.MotIV.toTable
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
 pwmMatch.toTable = function (motifMatch) {
    if (length (motifMatch@bestMatch) == 0)
    return (NA)
 
    df.list = vector("list", length(motifMatch@bestMatch))
    for (k in seq(length(motifMatch@bestMatch)))
    {
        alignments = motifMatch@bestMatch[[k]]@aligns
        df = data.frame (stringsAsFactors=FALSE)
        for (alignment in alignments) {
            x = alignment
            name = x@TF@name
            eVal = x@evalue
            sequence = x@sequence
            match = x@match
            strand = x@strand
            df = rbind (df, data.frame (name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
        } # for alignment
        df.list[[k]]=df
    }
    names(df.list) <- names(motifMatch)
    return (df.list)
77e1c264
 
 } # pwmMatch.toTable
 #------------------------------------------------------------------------------
 # Robert Stojnic reports incorrect gene symbols for matrices obtained from
 # flyFactorSurvey.
 # the solution was to abandon the original strategy of extracting the
 # symbol from the matrix (and file) name.
 # now, the flybase importer ("inst/scripts/import/flyFactorSurvey/import.R")
 # uses FBgn id (which can be reliably extracted) and uses indpendent
 # data sources to learn the gene symbol.
 #
 # robert's email:
 #  I'm working on using MotifDb motifs in my PWMEnrich package and I
 #  have noticed that there is a slight problem with gene symbols for
 #  Drosophila. In particular, the gene symbols do not always correspond
 #  to the gene ID and are frequently mis-capitalized. In Drosophila z
 #  and Z are two different genes and capitalization does matter if
 #  someone is to use the gene symbols. Also, in some cases the symbols
 #  are missing hyphens or parenthesis. I have used the gene IDs and the
 #  Flybase annotation database to set the correct gene symbols for
 #  Drosophila, please find attached the result of my re-annotation.
 #
2def1eec
 #  looking at his correctedMotifDbDmel.csv
77e1c264
 #
 #    head(read.table("correctedMotifDbDmel.csv", sep=",", header=TRUE, stringsAsFactors=FALSE))
 #                  providerName oldGeneSymbol newGeneSymbol
 #    1 ab_SANGER_10_FBgn0259750            Ab            ab
 #    2  ab_SOLEXA_5_FBgn0259750            Ab            ab
 #    3 Abd-A_FlyReg_FBgn0000014         Abd-a         abd-A
 #    4 Abd-B_FlyReg_FBgn0000015         Abd-b         Abd-B
 #    5    AbdA_Cell_FBgn0000014          Abda         abd-A
 #    6  AbdA_SOLEXA_FBgn0000014          Abda         abd-A
 #
 test.flyFactorGeneSymbols <- function()
 {
     print ("--- test.flyFactorGeneSymbols")
     mdb = MotifDb
07b8ea8b
     checkEquals(sort(mcols(query(mdb, "FBgn0259750"))$geneSymbol),
                 sort(c("FBgn0259750", "FBgn0259750")))
77e1c264
     checkEquals(mcols(query(mdb, "FBgn0000014"))$geneSymbol, rep("abd-A", 3))
     checkEquals(mcols(query(mdb, "FBgn0000015"))$geneSymbol, rep("Abd-B", 3))
 
 } # test.flyFactorGeneSymbols
 #-------------------------------------------------------------------------------
8c9d39f9
 test.export_jasparFormatStdOut = function ()
 {
   print ('--- test.export_jasparFormatStdOut')
   mdb = MotifDb # ()
   mdb.chicken = subset (mdb, organism=='Gallus')
   checkEquals (length (mdb.chicken), 3)
     # text is cat-ed to stdout, so not avaialable here to check.
     # but just like print, export also returns the text invisibly.
     # so that CAN be checked.
2def1eec
 
8c9d39f9
   jaspar.text = export (mdb.chicken, format='jaspar')
   checkEquals (length (jaspar.text), 1)   # just one long string
   checkTrue (is.character (jaspar.text))
   checkTrue (nchar (jaspar.text) > 800)   # 1002 as of (10 aug 2012)
   return (TRUE)
 
 } # test.exportjasparFormatToStdOut
 #------------------------------------------------------------------------------------------------------------------------
 test.export_jasparFormatToFile = function ()
 {
   print ('--- test.export_jasparFormatToFile')
   mdb = MotifDb # ()
   mdb.chicken = subset (mdb, organism=='Gallus')
   checkEquals (length (mdb.chicken), 3)
   output.file = tempfile ()
   jaspar.text = export (mdb.chicken, output.file, 'jaspar')
   retrieved = scan (output.file, what=character (0), sep='\n', quiet=TRUE)
   invisible (retrieved)
77e1c264
 
8c9d39f9
 } # test.exportjasparFormatToFile
2def1eec
 #------------------------------------------------------------------------------------------------------------------------
49ebd9dc
 test.geneToMotif <- function()
2def1eec
 {
49ebd9dc
    printf("--- test.geneToMotif")
2def1eec
    mdb <- MotifDb
 
49ebd9dc
    genes <- c("FOS", "ATF5", "bogus")
 
       # use  TFClass family classifcation
4acf0d78
    tbl.tfClass <- geneToMotif(mdb, genes, source="TfClaSS")   # intentional mis-capitalization
07b8ea8b
    checkEquals(sort(tbl.tfClass$gene),  sort(c("ATF5", "FOS", "FOS")))
    checkEquals(sort(tbl.tfClass$motif),  sort(c("MA0833.1", "MA0099.2", "MA0476.1")))
4acf0d78
    checkEquals(tbl.tfClass$source, rep("TFClass", 3))
49ebd9dc
 
       # MotifDb mode uses the MotifDb metadata, pulled from many sources
4acf0d78
    tbl.mdb <- geneToMotif(mdb, genes, source="mOtifdb")     # intentional mis-capitalization
    checkEquals(dim(tbl.mdb), c(12, 6))
    checkEquals(subset(tbl.mdb, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1")
49ebd9dc
       # no recognizable (i.e., jaspar standard) motif name returned by MotifDb metadata
       # MotifDb for ATF5
       # todo: compare the MA0110596_1.02 matrix of cisp_1.02 to japar MA0833.1
 
 } # test.geneToMotif
 #------------------------------------------------------------------------------------------------------------------------
 test.motifToGene <- function()
 {
    printf("--- test.motifToGene")
e90f5267
    printf("Sys.getlocale: %s", Sys.getlocale())
2def1eec
 
49ebd9dc
    motifs <- c("MA0592.2", "UP00022", "ELF1.SwissRegulon")
2def1eec
 
0aa3bd26
       # MotifDb mode uses the MotifDb metadata "providerId",
    tbl.mdb <- motifToGene(MotifDb, motifs, source="MotifDb")
4acf0d78
    checkEquals(dim(tbl.mdb), c(3, 6))
1ae931f9
    expected <- sort(c("MA0592.2", "ELF1.SwissRegulon", "UP00022"))
    actual <- sort(tbl.mdb$motif)
    checkEquals(actual, expected)
07b8ea8b
    checkEquals(sort(tbl.mdb$geneSymbol), sort(c("Esrra", "ELF1", "Zfp740")))
    checkEquals(sort(tbl.mdb$dataSource), sort(c("jaspar2016", "SwissRegulon", "UniPROBE")))
    checkEquals(sort(tbl.mdb$organism),   sort(c("Mmusculus", "Hsapiens", "Mmusculus")))
    checkEquals(sort(tbl.mdb$source),     rep("MotifDb", 3))
d030e74b
 
       # TFClass mode uses  TF family classifcation
0aa3bd26
    tbl.tfClass <- motifToGene(MotifDb, motifs, source="TFClass")
4acf0d78
    checkEquals(dim(tbl.tfClass), c(9,4))
    checkEquals(tbl.tfClass$motif, rep("MA0592.2", 9))
07b8ea8b
    checkEquals(sort(tbl.tfClass$gene), sort(c("AR", "ESR1", "ESR2", "ESRRA", "ESRRB", "ESRRG", "NR3C1", "NR3C2", "PGR")))
4acf0d78
    checkEquals(tbl.tfClass$source,       rep("TFClass", 9))
49ebd9dc
 
d030e74b
      # test motifs with regex characters in them, or other characters neither letter nor number
    motifs <- sort(c("DMAP1_NCOR{1,2}_SMARC.p2", "ELK1,4_GABP{A,B1}.p3", "SNAI1..3.p2", "EWSR1-FLI1.p2", "ETS1,2.p2"))
    tbl <- motifToGene(MotifDb, motifs, source="MotifDb")
    checkEquals(nrow(tbl), 0)
 
    tbl <- motifToGene(MotifDb, motifs, source="tfclass")
    checkEquals(ncol(tbl), 4)
    checkTrue(nrow(tbl) > 80)
    checkTrue(nrow(tbl) < 100)
    checkTrue(all(motifs %in% tbl$motif))
 
 } # test.motifToGene
49ebd9dc
 #------------------------------------------------------------------------------------------------------------------------
 test.associateTranscriptionFactors <- function()
 {
    printf("--- test.associateTranscriptionFactors")
 
    mdb <- MotifDb
    pfms <- query(query(mdb, "jaspar2016"), "sapiens")
2def1eec
 
49ebd9dc
       # first check motifs with MotifDb-style long names, using MotifDb lookup, in the
       # metadata of MotifDb:
       #      "Hsapiens-jaspar2016-RUNX1-MA0002.1"  "Hsapiens-jaspar2016-TFAP2A-MA0003.1"
2def1eec
 
49ebd9dc
    motif.names <- names(pfms[1:5])
094b985a
    tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
    tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
49ebd9dc
    checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
    checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno)))
07b8ea8b
    checkEquals(sort(tbl.anno$geneSymbol), sort(c("RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "AR")))
2def1eec
 
49ebd9dc
       # now add in a bogus motif name, one for which there cannot possibly be a TF
 
    motif.names[3] <- "bogus"
094b985a
    tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
    tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
49ebd9dc
    checkTrue(is.na(tbl.anno$geneSymbol[3]))
    checkTrue(is.na(tbl.anno$pubmedID[3]))
 
       # now check motifs with short, traditional names, in "TFClass" mode, which uses
       # the tfFamily
       #      "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2"
 
094b985a
    motif.names <- names(pfms[1:5])
49ebd9dc
    short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
094b985a
    tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
49ebd9dc
 
094b985a
    tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
49ebd9dc
    checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
 
094b985a
       # TFClass only annotates MA0003.3, none of the others
    checkTrue(all(is.na(tbl.anno$geneSymbol[-4])))
    checkTrue(all(is.na(tbl.anno$pubmedID[-4])))
    checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E")
    checkEquals(tbl.anno$pubmedID[4],   "23180794")
 
       # now ask for expandsion of the semicolon separated list
    tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
    checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2))
    checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol))
 
49ebd9dc
       # now add in a bogus motif name, one for which there cannot possibly be a TF
 
    motif.names <- names(pfms[1:5])
    short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
094b985a
    short.motif.names[4] <- "bogus"
    tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
49ebd9dc
 
094b985a
    tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
49ebd9dc
    checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
094b985a
       # after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA
    checkTrue(all(is.na(tbl.anno$geneSymbol)))
    checkTrue(all(is.na(tbl.anno$pubmedID)))
49ebd9dc
 
094b985a
       # now make sure that the absence of the  TFClass-specific "shortMotif" field is detected
    motif.names <- names(pfms[1:5])
    tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
    checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE)
49ebd9dc
 
094b985a
 
 } # test.associateTranscriptionFactors
2def1eec
 #------------------------------------------------------------------------------------------------------------------------