inst/unitTests/test_MotifDb.R
5b88116f
 library(MotifDb)
 library(RUnit)
 library(MotIV)
 library(seqLogo)
1f060cb8
 #----------------------------------------------------------------------------------------------------
 printf <- function(...) print(noquote(sprintf(...)))
 #----------------------------------------------------------------------------------------------------
5b88116f
 runTests = function()
b70c1a6e
 {
5b88116f
   test.emptyCtor()
   test.nonEmptyCtor()
   test.MotifDb.normalMode()
   test.MotifDb.emptyMode()
   test.allMatricesAreNormalized()
   test.providerNames()
   test.geneSymbols()
   test.geneIdsAndTypes()
   test.proteinIds()
   test.sequenceCount()
   test.longNames()
   test.organisms()
   test.bindingDomains()
   test.experimentTypes()
   test.tfFamilies()
   test.bindingSequences()
   test.flyBindingDomains()
   test.pubmedIDs()
f56272b0
   #test.allFullNames()
5b88116f
   test.subset()
   test.subsetWithVariables()
   test.queryOldStyle()
aaf4e1a7
   test.query()
5b88116f
   test.transformMatrixToMemeRepresentation()
   test.matrixToMemeText()
   test.export_memeFormatStdOut()
   test.export_memeFormatToFile()
   test.export_memeFormatToFileDuplication()
   test.export_memeFormatToFile_run_tomtom()
   #test.MotIV.toTable()
   #test.run_MotIV.motifMatch()
77e1c264
   test.flyFactorGeneSymbols()
49ebd9dc
   test.export_jasparFormatStdOut()
   test.export_jasparFormatToFile()
b70c1a6e
 
49ebd9dc
   test.geneToMotif()
e2f7996a
   test.geneToMotif.ignore.jasparSuffixes()
68ba7eb5
   test.geneToMotif.oneGene.noMotifs
49ebd9dc
   test.motifToGene()
e2f7996a
   test.associateTranscriptionFactors()
aaf4e1a7
 
5b88116f
   test.match()
 
   test.hocomoco11.with.reliabilityScores()
 
49ebd9dc
 } # runTests
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.emptyCtor = function()
b70c1a6e
 {
5b88116f
   print('--- test.emptyCtor')
   motif.list = MotifDb:::MotifList()
   checkEquals(length(motif.list), 0)
b70c1a6e
 
 } # test.emptyCtor
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.nonEmptyCtor = function()
b70c1a6e
 {
5b88116f
   print('--- test.nonEmptyCtor')
   mtx = matrix(runif(20), nrow=4, ncol=5, byrow=T, dimnames=list(c('A', 'C', 'G', 'T'), as.character(1:5)))
   mtx.normalized = apply(mtx, 2, function(colvector) colvector / sum(colvector))
   matrixList = list(mtx.normalized)
b70c1a6e
 
5b88116f
   tbl.md = data.frame(providerName='',
b70c1a6e
                        providerId='',
                        dataSource='',
                        geneSymbol='',
                        geneId='',
                        geneIdType='',
                        proteinId='',
                        proteinIdType='',
                        organism='',
                        sequenceCount='',
                        bindingSequence='',
                        bindingDomain='',
                        tfFamily='',
                        experimentType='',
2def1eec
                        pubmedID='',
b70c1a6e
                        stringsAsFactors=FALSE)
5b88116f
   names(matrixList) = 'test'
   rownames(tbl.md) = 'test'
   motif.list = MotifDb:::MotifList(matrixList, tbl.md)
   checkEquals(length(motif.list), 1)
b70c1a6e
 
 } # test.nonEmptyCtor
 #------------------------------------------------------------------------------------------------------------------------
 # 'normal' in that all included data sources are already loaded
5b88116f
 test.MotifDb.normalMode = function()
b70c1a6e
 {
5b88116f
   print('--- test.MotifDb.normalMode')
b70c1a6e
 
5b88116f
   mdb = MotifDb # (quiet=TRUE)
     #(5 jun 2012)
b70c1a6e
     # JASPAR_CORE: 459
     # ScerTF: 196
     # UniPROBE: 380
     # FlyFactorSurvey: 614
     # hPDI: 437
5b88116f
   checkTrue(length(mdb) > 2080)
b70c1a6e
 
 } # test.MotifDb.normalMode
 #------------------------------------------------------------------------------------------------------------------------
2def1eec
 # this mode is not intended for users, but may see use in the future.
5b88116f
 test.MotifDb.emptyMode = function()
b70c1a6e
 {
5b88116f
   print('--- test.MotifDb.emptyMode')
b70c1a6e
 
5b88116f
   mdb = MotifDb:::.MotifDb(loadAllSources=FALSE, quiet=TRUE)
   checkTrue(length(mdb) == 0)
b70c1a6e
 
 } # test.MotifDb.emptyMode
 #------------------------------------------------------------------------------------------------------------------------
 # 3 jaspar matrices arrive with organism = NA.  find and fix.  make sure here.
 # NA-JASPAR_CORE-TBP-MA0108.2: JASPAR gives <NA> for speciesID
 # NA-JASPAR_CORE-HNF4A-MA0114.1: JASPAR gives <NA> for speciesID
 # NA-JASPAR_CORE-CEBPA-MA0102.2: JASPAR gives '-' for speciesID, website says 'vertebrates'
 
8ff73793
 # Many more NA's exist...need to fix these; here's a quick fix for now
fa5fbc94
 
5b88116f
 test.noNAorganisms = function()
b70c1a6e
 
 {
5b88116f
   print('--- test.noNAorganisms')
   #checkEquals(which(is.na(mcols(MotifDb)$organism)), integer(0))
1f060cb8
 
   # There's a fair number of NA organisms, mostly due to including the homer DB
5b88116f
   checkEquals(sum(is.na(mcols(MotifDb)$organism)), 366)
b70c1a6e
 
 } # test.noNAorganisms
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.allMatricesAreNormalized = function()
b70c1a6e
 {
5b88116f
   print('--- test.allMatricesAreNormalized')
   mdb = MotifDb#(quiet=TRUE)
b70c1a6e
   matrices = mdb@listData
6666b3f4
     # a lenient test required by "Cparvum-UniPROBE-Cgd2_3490.UP00395" and  "Hsapiens-UniPROBE-Sox4.UP00401"
     # for reasons not yet explored.  10e-8 should be be possible
5b88116f
   checkTrue(all(sapply(matrices, function(m) all(abs(colSums(m) - 1.0) < 0.02))))
2def1eec
 
b70c1a6e
 } # test.allMatricesAreNormalized
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.providerNames = function()
b70c1a6e
 {
5b88116f
   print('--- test.getProviderNames')
   mdb = MotifDb #()
fe92bd53
   pn = mcols(mdb)$providerName
5b88116f
   checkEquals(length(which(is.na(pn))), 0)
   checkEquals(length(which(pn == '')), 0)
b70c1a6e
 
 } # test.providerNames
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.geneSymbols = function()
b70c1a6e
 {
5b88116f
   print('--- test.getGeneSymbols')
   mdb = MotifDb #()
fe92bd53
   syms = mcols(mdb)$geneSymbol
5b88116f
   checkEquals(length(which(is.na(syms))), 683)  # no symols yet for the dgf stamlab motifs
   checkEquals(length(which(syms == '')), 0)
b70c1a6e
 
 } # test.geneSymbols
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.geneIdsAndTypes = function()
b70c1a6e
 {
5b88116f
   print('--- test.getGeneIdsAndTypes')
17abb8b8
   mdb = MotifDb
dc17ee90
   tbl <- mcols(mdb)
   geneIds = tbl$geneId
   geneIdTypes = tbl$geneIdType
5b88116f
   typeCounts = as.list(table(geneIdTypes))
dc17ee90
 
2f30a7ba
   checkTrue(typeCounts$ENTREZ > 2300)
   checkTrue(typeCounts$FLYBASE >= 45)
   checkTrue(typeCounts$SGD >= 600)
   checkTrue(nrow(subset(tbl, is.na(geneIdType))) > 2000)
2def1eec
 
5b88116f
   empty.count = length(which(geneIds == ''))
   checkEquals(empty.count, 0)
b70c1a6e
 
 
 } # test.geneIdsAndTypes
 #------------------------------------------------------------------------------------------------------------------------
 # make sure that all proteinIds have explicit values, either proper identifiers or NA
 # currently tested by looking for empty string assignments
5b88116f
 test.proteinIds = function()
b70c1a6e
 {
5b88116f
   print('--- test.proteinIds')
   mdb = MotifDb #(quiet=TRUE)
8ff73793
   NA.string.count <- sum(is.na(mcols(mdb)$proteinId))
5b88116f
 #  NA.string.count = length(grep('NA', mcols(mdb)$proteinId))
8ff73793
 
   checkEquals(NA.string.count, 2514)
   # FIX THIS; Currently 2514 don't have protein IDs
5b88116f
   #checkEquals(NA.string.count, 0)
2def1eec
 
5b88116f
   empty.count = length(which(mcols(mdb)$proteinId==""))
   if(empty.count > 0)
     browser('test.proteinIds')
b70c1a6e
 
5b88116f
   checkEquals(empty.count, 0)
b70c1a6e
 
      # FlyFactorSurvey, as digested by me, had a blanket assigment of UNIPROT to all proteinIds
      # Herve' pointed out that this applied also to entries with no proteinId.
      # make sure this is fixed
 
8ff73793
   ### FIX THIS TOO! Currently have 913 entries with a proteinIdType and no proteinId
fe92bd53
   x = mcols(mdb)
5b88116f
   # checkEquals(nrow(subset(x, !is.na(proteinIdType) & is.na(proteinId))), 0)
2def1eec
 
b70c1a6e
 
 } # test.proteinIds
 #------------------------------------------------------------------------------------------------------------------------
 # only for UniPROBE do we not have sequence count.  might be possible to get them along with 'insertion sequences'
5b88116f
 test.sequenceCount = function()
b70c1a6e
 {
5b88116f
   print('--- test.sequenceCount')
   mdb = MotifDb #()
fe92bd53
   x = mcols(mdb)
5b88116f
   if(interactive()) {
     x.up = subset(x, dataSource == 'UniPROBE')
     checkTrue(all(is.na(x.up$sequenceCount)))
b70c1a6e
     }
   else {
5b88116f
     uniprobe.indices = which(x$dataSource == 'UniPROBE')
     checkTrue(all(is.na(x$sequenceCount [uniprobe.indices])))
b70c1a6e
     }
 
 } # test.sequenceCount
 #------------------------------------------------------------------------------------------------------------------------
 # make sure that a legitimate organism-dataSource-identifier is supplied for each matrix and as a rowname
 # of the corresponding DataFrame
5b88116f
 test.longNames = function()
b70c1a6e
 {
5b88116f
   print('--- test.longNames')
f56272b0
   mdb <- MotifDb
   longNames <- strsplit(names(mdb), '-')
   organisms <- unique(sapply(longNames, '[', 1))
 
   dataSources <- unique(lapply(longNames, '[', 2))
 
   recognized.dataSources <- c("cisbp_1.02", "FlyFactorSurvey",
                               "HOCOMOCOv10", "HOCOMOCOv11B-core", "HOCOMOCOv11C-core", "HOCOMOCOv11B-full",
                               "HOCOMOCOv11C-full", "HOCOMOCOv11A-core", "HOCOMOCOv11D-full",
                               "HOCOMOCOv11A-full", "HOMER", "hPDI",
                               "JASPAR_CORE", "JASPAR_2014", "jaspar2016", "jaspar2018",
                               "jolma2013", "ScerTF", "stamlab", "SwissRegulon", "UniPROBE")
 
   recognized.dataSources <-  c("cisbp_1.02",
                              "FlyFactorSurvey",
                              "HOCOMOCOv10",
                              "HOCOMOCOv11-core-A",
                              "HOCOMOCOv11-core-B",
                              "HOCOMOCOv11-core-C",
                              "HOCOMOCOv11-full-A",
                              "HOCOMOCOv11-full-B",
                              "HOCOMOCOv11-full-C",
                              "HOCOMOCOv11-full-D",
                              "HOMER",
                              "hPDI",
                              "JASPAR_2014",
                              "JASPAR_CORE",
                              "jaspar2016",
                              "jaspar2018",
                              "jolma2013",
                              "ScerTF",
                              "stamlab",
                              "SwissRegulon",
                              "UniPROBE")
 
   recognized.organisms <- unique(mcols(mdb)$organism)
5b88116f
     # a few(3) matrices from JASPAR core have NA organism.  make this into a character
b70c1a6e
     # so that it can be matched up against the 'NA' extracted from longNames just above
f56272b0
   na.indices <- which(is.na(recognized.organisms))
5b88116f
   if(length(na.indices) > 0)
f56272b0
      recognized.organisms [na.indices] <- 'NA'
b70c1a6e
 
5b88116f
   checkTrue(all(organisms %in% recognized.organisms))
f56272b0
   # new hocomoco-[core|full]-[ABCD] dataSource names are not incorporated into rownames yet
   # checkTrue(all(dataSources %in% recognized.dataSources))
b70c1a6e
 
f56272b0
   } # test.longNames
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
 # make sure that a legitimate organism is specified for each matrix
f56272b0
 test.organisms <- function()
b70c1a6e
 {
5b88116f
   print('--- test.organisms')
f56272b0
   mdb <- MotifDb #(quiet=TRUE)
   organisms <- mcols(mdb)$organism
b70c1a6e
 
5b88116f
      # jaspar_core has 3 NA speciesId: TBP, HNF4A and CEBPA(MA0108.2, MA0114.1, MA0102.2)
b70c1a6e
      # their website shows these as vertebrates, which I map to 'Vertebrata'.  An organismID of '-'
fa5fbc94
   # gets the same treatment, matching website also.
 
8ff73793
   ### Note: this failing test is the same as the test.noNAorganisms test!
fa5fbc94
   # As in case of noNA, need to add organisms for these
5b88116f
   #checkEquals(which(is.na(mcols(MotifDb)$organism)), integer(0))
b70c1a6e
 
f56272b0
   empty.count <- length(which(mcols(mdb)$organism==""))
5b88116f
   checkEquals(empty.count, 0)
b70c1a6e
 
 } # test.organisms
 #------------------------------------------------------------------------------------------------------------------------
f56272b0
 test.bindingDomains <- function()
b70c1a6e
 {
5b88116f
   print('--- test.bindingDomains')
f56272b0
   mdb <- MotifDb #(quiet=TRUE)
5b88116f
   checkTrue(length(unique(mcols(mdb)$bindingDomain)) > 1)
b70c1a6e
 
 } # test.bindingDomains
 #------------------------------------------------------------------------------------------------------------------------
f56272b0
 test.flyBindingDomains <- function()
b70c1a6e
 {
5b88116f
   print('--- test.flyBindingDomains')
b70c1a6e
 
f56272b0
   x <- mcols(MotifDb)
   tmp <- as.list(head(sort(table(subset(x, organism=='Dmelanogaster')$bindingDomain), decreasing=TRUE), n=3))
b70c1a6e
 
     # these counts will likely change with a fresh load of data from FlyFactorSurvey.
 
e2f7996a
   checkEquals(tmp$Homeobox, 212)
   checkEquals(tmp[['zf-C2H2']], 160)
   checkEquals(tmp[["Helix-Turn-Helix"]], 182)
   checkTrue(length(which(is.na(subset(x, organism=='Dmelanogaster')$bindingDomain))) > 300) # lots of cisbp
b70c1a6e
 
 } # test.flyBindingDomains
 #------------------------------------------------------------------------------------------------------------------------
f56272b0
 test.experimentTypes <- function()
b70c1a6e
 {
5b88116f
   print('--- test.experimentTypes')
f56272b0
   mdb <- MotifDb #(quiet=TRUE)
   x <- mcols(mdb)
5b88116f
   checkTrue(length(unique(x$experimentType)) >= 18)
   checkEquals(length(which(x$experimentType=='')), 0)
b70c1a6e
 
 } # test.experimentTypes
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.tfFamilies = function()
b70c1a6e
 {
5b88116f
   print('--- test.tfFamilies')
   mdb = MotifDb #(quiet=TRUE)
   checkTrue(length(unique(mcols(mdb)$tfFamily)) > 1)
b70c1a6e
 
 } # test.tfFamilies
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.bindingSequences = function()
b70c1a6e
 {
5b88116f
   print('--- test.bindingSequences')
   mdb = MotifDb #(quiet=TRUE)
   checkTrue(length(unique(mcols(mdb)$bindingSequence)) > 1)
b70c1a6e
 
 } # test.tfFamilies
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.pubmedIDs = function()
b70c1a6e
 {
5b88116f
   print('--- test.pubmedIDs')
   x = mcols(MotifDb) #(quiet=TRUE))
   checkTrue(length(unique(x$pubmedID)) >= 139)
   checkEquals(length(which(x$pubmedID == '')), 0)
b70c1a6e
 
 } # test.pubmedIDs
 #------------------------------------------------------------------------------------------------------------------------
 # every matrix, and every corresponding metadata table row, has a name formed thus:
 #
 #    dataSource-organism-nativeGeneName
 #
 #  where nativeGeneName is how the matrix is named in the dataSource from which it comes.
 #  thus:
 #          FlyFactorSurvey-Dmelanogaster-Abd.A_FlyReg_FBgn0000014
 #          UniPROBE-Scerevisiae-Asg1-UP00350
 #          ScerTF-Scerevisiae-ABF2-badis
 #          JASPAR_CORE-Rrattus-Ar-MA0007.1
2def1eec
 #
f56272b0
 skip.test.allFullNames = function()
b70c1a6e
 {
5b88116f
   print('--- test.allFullNames')
   mdb = MotifDb #(quiet=TRUE)
b70c1a6e
   matrices = mdb@listData
5b88116f
   fullNames = names(matrices)
b70c1a6e
 
5b88116f
   all.dataSources = unique(mcols(mdb)$dataSource)
   checkTrue(length(all.dataSources) >= 4)
2def1eec
 
5b88116f
   for(source in all.dataSources) {
f56272b0
      this.dataSource <- source
5b88116f
      matrices.by.source = subset(mdb, dataSource==this.dataSource)
      matrix.name = names(matrices.by.source)[1]
b70c1a6e
         #  FlyFactorSurvey: Dmelanogaster-FlyFactorSurvey-ab_SANGER_10_FBgn0259750
         #             hPDI: Hsapiens-hPDI-ABCF2
         #      JASPAR_CORE: JASPAR_CORE-Athaliana-AGL3-MA0001.1
         #         UniPROBE: Hsapiens-UniPROBE-Sox4.UP00401
5b88116f
      checkTrue(grep(this.dataSource, matrix.name) == 1)
      #printf('%20s: %s', source, matrix.name)
b70c1a6e
      }
 
5b88116f
   return(TRUE)
b70c1a6e
 
 } # test.allFullNames
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.subset = function()
b70c1a6e
 {
5b88116f
   if(interactive()) {
     print('--- test.subset')
b70c1a6e
     mdb = MotifDb
5b88116f
     checkTrue('geneSymbol' %in% colnames(elementMetadata(mdb)))
     mdb.sub = subset(mdb, geneSymbol=='ABCF2')
     checkEquals(length(mdb.sub), 1)
2def1eec
     } # if interactive
 
b70c1a6e
 } # test.subset
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.subsetWithVariables = function()
b70c1a6e
 {
5b88116f
   if(interactive()) {
     print('--- test.subsetWithVariables')
2def1eec
 
5b88116f
     mdb = MotifDb #()
     checkTrue('geneSymbol' %in% colnames(elementMetadata(mdb)))
b70c1a6e
     target.gene <<- 'ABCF2'
5b88116f
     mdb.sub = subset(mdb, geneSymbol==target.gene)
     checkEquals(length(mdb.sub), 1)
2def1eec
     } # if interactive
 
b70c1a6e
 } # test.subsetWithVariables
 #------------------------------------------------------------------------------------------------------------------------
aaf4e1a7
 # "old style": just one query term allowed
5b88116f
 test.queryOldStyle = function()
b70c1a6e
 {
5b88116f
   print('--- test.queryOldStyle')
b70c1a6e
   mdb = MotifDb
 
     # do queries on dataSource counts match those from a contingency table?
5b88116f
   sources.list = as.list(table(mcols(mdb)$dataSource))
   checkEquals(length(query(mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey)
   checkEquals(length(query(mdb, 'uniprobe')), sources.list$UniPROBE)
   checkEquals(length(query(mdb, 'UniPROBE')), sources.list$UniPROBE)
b70c1a6e
 
     # gene symbols which begin with 'sox' are quite common.  can we them?
5b88116f
     # there are currently(19 jul 2012) 18, but since this may change, our test is approximate
b70c1a6e
 
8ff73793
   # Change on 8/1/2017: increase top limit of sox entries as they've expanded
5b88116f
   sox.entries = query(mdb, '^sox')
   checkTrue(length(sox.entries) > 10)
   checkTrue(length(sox.entries) < 200)
b70c1a6e
 
     # manual inspection reveals that some of these genes have names which are all capitalized.  test that.
5b88116f
   checkTrue(length(query(mdb, '^sox', ignore.case=TRUE)) > length(query(mdb, '^SOX', ignore.case=FALSE)))
b70c1a6e
 
     # make sure that queries can be stacked up, and that order of call does not affect the outcome
5b88116f
    uniprobe.sox.matrices = query(query(mdb, 'uniprobe'), '^sox')
    sox.uniprobe.matrices = query(query(mdb, '^sox'), 'uniprobe')
b70c1a6e
 
5b88116f
    checkEquals(length(uniprobe.sox.matrices), length(sox.uniprobe.matrices))
b70c1a6e
 
      # an approximate count check
5b88116f
   checkTrue(length(uniprobe.sox.matrices) > 10)
   checkTrue(length(uniprobe.sox.matrices) < 30)
b70c1a6e
 
5b88116f
    checkEquals(unique(mcols(uniprobe.sox.matrices)$dataSource), 'UniPROBE')
    checkEquals(unique(mcols(sox.uniprobe.matrices)$dataSource), 'UniPROBE')
    gene.symbols = sort(unique(mcols(uniprobe.sox.matrices)$geneSymbol))
2def1eec
 
5b88116f
     # query on a string(in this case, an oddly named motif) which contains
dae43ecd
     # regular expression characters.  no solution to this yet.
     # query uses base R's grep, in which
     #  x <- query(mdb, "ELK1,4_GABP{A,B1}.p3")
 
aaf4e1a7
 } # test.queryOldStyle
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
aaf4e1a7
 test.query <- function()
3cb31fd9
 {
5b88116f
   print('--- test.query')
3cb31fd9
   mdb = MotifDb
 
   ors <- c("MA0511.1", "MA0057.1")
   ands <- c("jaspar2018", "sapiens")
   nots <- "cisbp"
5da060a6
 
aaf4e1a7
   x <- query(mdb, andStrings=ands, orStrings=ors)
3cb31fd9
   checkEquals(length(x), 2)
   checkEquals(sort(names(x)),
              c("Hsapiens-jaspar2018-MZF1(var.2)-MA0057.1", "Hsapiens-jaspar2018-RUNX2-MA0511.1"))
 
aaf4e1a7
   x <- query(mdb, andStrings="MA0057.1")
3cb31fd9
   checkEquals(length(x), 15)
 
aaf4e1a7
   x <- query(mdb, andStrings=c("MA0057.1", "cisbp"))
3cb31fd9
   checkEquals(length(x), 11)
 
aaf4e1a7
   x <- query(mdb, andStrings=c("MA0057.1"), notStrings="cisbp")
3cb31fd9
   checkEquals(length(x), 4)
 
aaf4e1a7
   x <- query(mdb, andStrings=c("MA0057.1"), notStrings=c("cisbp", "JASPAR_2014"))
3cb31fd9
   checkEquals(length(x), 3)
 
aaf4e1a7
   x <- query(mdb, orStrings=c("mus", "sapiens"), andStrings="MA0057.1")
3cb31fd9
   #checkEquals(sort(names(x)),
 
     # do queries on dataSource counts match those from a contingency table?
5b88116f
   sources.list = as.list(table(mcols(mdb)$dataSource))
   checkEquals(length(query(mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey)
   checkEquals(length(query(mdb, 'uniprobe')), sources.list$UniPROBE)
   checkEquals(length(query(mdb, 'UniPROBE')), sources.list$UniPROBE)
3cb31fd9
 
aaf4e1a7
 } # test.query
3cb31fd9
 #------------------------------------------------------------------------------------------------------------------------
5da060a6
 test.query2 <- function()
 {
   mdb <- MotifDb
   matrices.human <- query(mdb, 'hsapiens')
   matrices.sox4 <- query(mdb, 'sox4')
 
   matrices.human.sox4 <- query(mdb, c("hsapiens", "sox4"))
   matrices.human.sox4.oldStyle <- query(matrices.human, "sox4")
5b88116f
   checkTrue(length(matrices.human.sox4) > 0)   # 6 found on(24 oct 2018)
5da060a6
   checkEquals(length(matrices.human.sox4), length(matrices.human.sox4.oldStyle))
 
   checkTrue(length(matrices.human.sox4) < length(matrices.human))
   checkTrue(length(matrices.human.sox4) < length(matrices.sox4))
 
   uniprobe.sox.matrices <- query(mdb, c('uniprobe', '^sox'))
 
 } # test.query2
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.transformMatrixToMemeRepresentation = function()
b70c1a6e
 {
5b88116f
   print('--- test.transformMatrixToMemeRepresentation')
   mdb = MotifDb #()
   matrix.agl3 = subset(mdb, dataSource=='JASPAR_CORE' & organism=='Athaliana' & geneSymbol=='AGL3')[[1]]
   checkEquals(dim(matrix.agl3), c(4, 10))
b70c1a6e
 
     # a transposed frequency is needed for meme.  we store normalized frequency matrices, to just transposition is needed
5b88116f
   tfm = MotifDb:::transformMatrixToMemeRepresentation(matrix.agl3)
   checkEquals(dim(tfm), c(10, 4))
   checkTrue(all(round(rowSums(tfm)) == 1.0))
b70c1a6e
 
 } # test.transformMatrixToMemeRepresentation
 #------------------------------------------------------------------------------------------------------------------------
 # translate a motif matrix from MotifList into text suitable for meme and tomtom input.
 # choose the top two from UniPROBE.  they reliably have high counts, and easily distinguished normalized frequencies
5b88116f
 test.matrixToMemeText = function()
b70c1a6e
 {
5b88116f
   print('--- test.matrixToMemeText')
   sox4 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4')
   rtg3 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3')
b70c1a6e
 
5b88116f
   text.sox4 = MotifDb:::matrixToMemeText(sox4)
      # check these with t(sox4 [[1]])
b70c1a6e
   line1.sox4  = " 0.2457457130  0.1950426500  0.2287887620  0.3304228750"
   line14.sox4 = " 0.2821643030  0.2286132160  0.1585395830  0.3306828990"
2def1eec
 
5b88116f
   checkEquals(length(text.sox4), 29)
   checkEquals(text.sox4 [1], "MEME version 4")
   checkEquals(text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401")
   checkEquals(grep(line1.sox4, text.sox4), 12)
   checkEquals(grep(line14.sox4, text.sox4), 25)
b70c1a6e
 
5b88116f
   text.rtg3 = MotifDb:::matrixToMemeText(rtg3)
b70c1a6e
   line1.rtg3  = " 0.3935122858  0.1453016447  0.3308830322  0.1303030373"
   line20.rtg3 = " 0.2490417648  0.3966478493  0.1083586569  0.2459517291"
5b88116f
   checkEquals(length(text.rtg3), 35)         # 4 trailing empty lines
   checkEquals(text.rtg3 [1], "MEME version 4")
   checkEquals(text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356")
   checkEquals(grep(line1.rtg3, text.rtg3), 12)
   checkEquals(grep(line20.rtg3, text.rtg3), 31)
b70c1a6e
 
     # now call with both matrices, and see if the right results are returned
5b88116f
   text.both = MotifDb:::matrixToMemeText(c(sox4, rtg3))
   checkEquals(text.both [1], "MEME version 4")
   checkEquals(grep(line1.sox4, text.both), 12)
   checkEquals(grep(line14.sox4, text.both), 25)
   checkEquals(grep(line1.rtg3, text.both),  29)
   checkEquals(grep(line20.rtg3, text.both), 48)
b70c1a6e
 
 } # test.matrixToMemeText
 #------------------------------------------------------------------------------------------------------------------------
 # translate a motif matrix from MotifList into text suitable for meme and tomtom input.
 # choose the top two from UniPROBE.  they reliably have high counts, and easily distinguished normalized frequencies
5b88116f
 #test.matrixToMemeText_mapVersion = function()
b70c1a6e
 #{
5b88116f
 #  print('--- test.matrixToMemeText_mapVersion')
 #  sox4 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4')
 #  rtg3 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3')
b70c1a6e
 #
5b88116f
 #  text.sox4 = MotifDb:::mapped.broken.matrixToMemeText(sox4)
 #  text.rtg3 = MotifDb:::mapped.broken.matrixToMemeText(rtg3)
 #  text.both = MotifDb:::mapped.broken.matrixToMemeText(c(sox4, rtg3))
b70c1a6e
 #
5b88116f
 #     # check these with t(sox4 [[1]])
b70c1a6e
 #  line1.sox4  = " 0.2457457130  0.1950426500  0.2287887620  0.3304228750"
 #  line14.sox4 = " 0.2821643030  0.2286132160  0.1585395830  0.3306828990"
 #
5b88116f
 #  checkEquals(length(text.sox4), 29)
 #  checkEquals(text.sox4 [1], "MEME version 4")
 #  checkEquals(text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401")
 #  checkEquals(grep(line1.sox4, text.sox4), 12)
 #  checkEquals(grep(line14.sox4, text.sox4), 25)
b70c1a6e
 #
 #  line1.rtg3  = " 0.3935122858  0.1453016447  0.3308830322  0.1303030373"
 #  line20.rtg3 = " 0.2490417648  0.3966478493  0.1083586569  0.2459517291"
5b88116f
 #  checkEquals(length(text.rtg3), 35)         # 4 trailing empty lines
 #  checkEquals(text.rtg3 [1], "MEME version 4")
 #  checkEquals(text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356")
 #  checkEquals(grep(line1.rtg3, text.rtg3), 12)
 #  checkEquals(grep(line20.rtg3, text.rtg3), 31)
b70c1a6e
 #
 #    # now call with both matrices, and see if the right results are returned
5b88116f
 #  checkEquals(text.both [1], "MEME version 4")
 #  checkEquals(grep(line1.sox4, text.both), 12)
 #  checkEquals(grep(line14.sox4, text.both), 25)
 #  checkEquals(grep(line1.rtg3, text.both),  29)
 #  checkEquals(grep(line20.rtg3, text.both), 48)
b70c1a6e
 #
 #} # test.matrixToMemeText_mapVersion
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.export_memeFormatStdOut = function()
b70c1a6e
 {
5b88116f
   print('--- test.export_memeFormatStdOut')
   mdb = MotifDb #()
   mdb.chicken = subset(mdb, organism=='Gallus')
   checkEquals(length(mdb.chicken), 3)
b70c1a6e
     # text is cat-ed to stdout, so not avaialable here to check.
     # but just like print, export also returns the text invisibly.
     # so that CAN be checked.
2def1eec
 
5b88116f
   meme.text = export(mdb.chicken, format='meme')
   checkEquals(length(meme.text), 1)   # just one long string
   checkTrue(is.character(meme.text))
   checkTrue(nchar(meme.text) > 800)   # 1002 as of(10 aug 2012)
   return(TRUE)
b70c1a6e
 
 } # test.exportMemeFormatToStdOut
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.export_memeFormatToFile = function()
b70c1a6e
 {
5b88116f
   print('--- test.export_memeFormatToFile')
   mdb = MotifDb #()
   mdb.chicken = subset(mdb, organism=='Gallus')
   checkEquals(length(mdb.chicken), 3)
   output.file = tempfile()
   meme.text = export(mdb.chicken, output.file, 'meme')
   retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE)
   invisible(retrieved)
b70c1a6e
 
 } # test.exportMemeFormatToFile
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.export_memeFormatToFileDuplication = function()
b70c1a6e
 {
5b88116f
   print('--- test.export_memeFormatToFileDuplication')
   mdb = MotifDb #()
   mdb.mouse = subset(mdb, organism=='Mmusculus')
e2f7996a
   checkTrue(length(mdb.mouse) > 1300)
5b88116f
   output.file = 'mouse.txt' # tempfile()
b70c1a6e
   max = 3
5b88116f
   meme.text = export(mdb.mouse [1:max], output.file, 'meme')
   retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE)
   invisible(retrieved)
b70c1a6e
 
 } # test.exportMemeFormatToFileDuplication
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.export_memeFormatToFile_run_tomtom = function(max=50)
b70c1a6e
 {
5b88116f
   if(interactive()) {
     print('--- test.export_memeFormatToFile_run_tomtom')
b70c1a6e
     mdb = MotifDb
5b88116f
     sox4.mouse = subset(mdb, organism=='Mmusculus' & geneSymbol=='Sox4')
     all.human = subset(mdb, organism=='Hsapiens')
b70c1a6e
 
5b88116f
     tomtom.tmp.dir = tempfile()
     print(tomtom.tmp.dir)
     stopifnot(dir.create(tomtom.tmp.dir))
     sox4.file.path = file.path(tomtom.tmp.dir, 'sox4.mouse.text')
     all.human.file.path = file.path(tomtom.tmp.dir,'all.human.text')
     export(sox4.mouse, sox4.file.path, 'meme')
     export(all.human, all.human.file.path, 'meme')
b70c1a6e
 
        # find similarity of motif #1 to all the motifs in mdbMany
 
a84b875e
        # cannot rely upon tomtom being present
 
5b88116f
     #cmd = sprintf('tomtom -no-ssc -oc %s -verbosity 3 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10 %s %s',
a84b875e
     #                tomtom.tmp.dir, sox4.file.path, all.human.file.path)
5b88116f
     #system(cmd)
     #cmd = sprintf('open %s/tomtom.html', tomtom.tmp.dir)
     #system(cmd)
b70c1a6e
     } # if interactive
 
 } # test.export_memeFormatToFile_run_tomtom
 #------------------------------------------------------------------------------------------------------------------------
000f7c78
 # MotIV::motifMatch fails with MotIV_1.25.0.  will look into this in September, 2015, pshannon
5b88116f
 # MotIV abandoned, all of my own motif/sequence matching is done via an independently installed
 # FIMO from the meme suite.
 # another option is the MOODs motif matching capability provided by the bioc package "motifmatchr"
 # this possibility, and these tests, are deferred for now.
 disabled_test.run_MotIV.motifMatch = function()
b70c1a6e
 {
5b88116f
   require(MotIV)
   print('--- test.run_MotIV.motifMatch')
   mdb <- MotifDb #()
b70c1a6e
 
9ad58ff7
   db.tmp <- mdb@listData
b70c1a6e
 
      # match motif 1 against the entire MotifDb  collection
5b88116f
   motif.hits <- motifMatch(db.tmp [1], database<-db.tmp)
b70c1a6e
      # the long way to extract the matrix name.  see MotIV.toTable below for more convenient way
5b88116f
   checkEquals(motif.hits@bestMatch[[1]]@aligns[[1]]@TF@name, names(db.tmp)[1])
b70c1a6e
 
000f7c78
      # match the last motif against all
   last <- length(db.tmp)
9ad58ff7
      # MotIV:motifMatch works differently on linux and macos.  by asking for 50 matches,
5b88116f
      # the search target(db.tmp[last]) is sure to be in the hit list.
   motif.hits <-  motifMatch(db.tmp [last], database=db.tmp, top=50)
   tbl.hits <- MotIV.toTable(motif.hits)
000f7c78
     # the 5 hits return should include the one we tried to match, but the MotIV search strategy
     # may not place it first
   checkTrue(names(db.tmp[last]) %in% tbl.hits$name)
5b88116f
   invisible(tbl.hits)
2def1eec
 
5b88116f
 } # distable_test.run_MotIV.motifMatch
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 MotIV.toTable = function(match)
b70c1a6e
 {
5b88116f
   stopifnot(length(match@bestMatch) >= 1)
b70c1a6e
   alignments = match@bestMatch[[1]]@aligns
 
5b88116f
   df = data.frame(stringsAsFactors=FALSE)
   for(alignment in alignments) {
b70c1a6e
     x = alignment
     name = x@TF@name
     eVal = x@evalue
     sequence = x@sequence
     match = x@match
     strand = x@strand
5b88116f
     df = rbind(df, data.frame(name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
b70c1a6e
     } # for alignment
 
5b88116f
   return(df)
b70c1a6e
 
2def1eec
 } # MotIV.toTable
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 disabled_test.MotIV.toTable = function()
b70c1a6e
 {
5b88116f
   print('--- test.MotIVtoTable')
   mdb = MotifDb #()
   test.hits = motifMatch(mdb[1]@listData, database=jaspar)
   tbl.hits =  MotIV.toTable(test.hits)
   checkEquals(dim(tbl.hits), c(5, 5))
   checkEquals(sort(colnames(tbl.hits)), sort(c("name", "eVal", "sequence", "match", "strand")))
b70c1a6e
 
2def1eec
 } # test.MotIV.toTable
b70c1a6e
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 pwmMatch.toTable = function(motifMatch) {
    if(length(motifMatch@bestMatch) == 0)
    return(NA)
b70c1a6e
 
    df.list = vector("list", length(motifMatch@bestMatch))
5b88116f
    for(k in seq(length(motifMatch@bestMatch)))
b70c1a6e
    {
        alignments = motifMatch@bestMatch[[k]]@aligns
5b88116f
        df = data.frame(stringsAsFactors=FALSE)
        for(alignment in alignments) {
b70c1a6e
            x = alignment
            name = x@TF@name
            eVal = x@evalue
            sequence = x@sequence
            match = x@match
            strand = x@strand
5b88116f
            df = rbind(df, data.frame(name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
b70c1a6e
        } # for alignment
        df.list[[k]]=df
    }
    names(df.list) <- names(motifMatch)
5b88116f
    return(df.list)
77e1c264
 
 } # pwmMatch.toTable
 #------------------------------------------------------------------------------
 # Robert Stojnic reports incorrect gene symbols for matrices obtained from
 # flyFactorSurvey.
 # the solution was to abandon the original strategy of extracting the
5b88116f
 # symbol from the matrix(and file) name.
 # now, the flybase importer("inst/scripts/import/flyFactorSurvey/import.R")
 # uses FBgn id(which can be reliably extracted) and uses indpendent
77e1c264
 # data sources to learn the gene symbol.
 #
 # robert's email:
 #  I'm working on using MotifDb motifs in my PWMEnrich package and I
 #  have noticed that there is a slight problem with gene symbols for
 #  Drosophila. In particular, the gene symbols do not always correspond
 #  to the gene ID and are frequently mis-capitalized. In Drosophila z
 #  and Z are two different genes and capitalization does matter if
 #  someone is to use the gene symbols. Also, in some cases the symbols
 #  are missing hyphens or parenthesis. I have used the gene IDs and the
 #  Flybase annotation database to set the correct gene symbols for
 #  Drosophila, please find attached the result of my re-annotation.
 #
2def1eec
 #  looking at his correctedMotifDbDmel.csv
77e1c264
 #
 #    head(read.table("correctedMotifDbDmel.csv", sep=",", header=TRUE, stringsAsFactors=FALSE))
 #                  providerName oldGeneSymbol newGeneSymbol
 #    1 ab_SANGER_10_FBgn0259750            Ab            ab
 #    2  ab_SOLEXA_5_FBgn0259750            Ab            ab
 #    3 Abd-A_FlyReg_FBgn0000014         Abd-a         abd-A
 #    4 Abd-B_FlyReg_FBgn0000015         Abd-b         Abd-B
 #    5    AbdA_Cell_FBgn0000014          Abda         abd-A
 #    6  AbdA_SOLEXA_FBgn0000014          Abda         abd-A
 #
 test.flyFactorGeneSymbols <- function()
 {
5b88116f
     print("--- test.flyFactorGeneSymbols")
77e1c264
     mdb = MotifDb
07b8ea8b
     checkEquals(sort(mcols(query(mdb, "FBgn0259750"))$geneSymbol),
                 sort(c("FBgn0259750", "FBgn0259750")))
77e1c264
     checkEquals(mcols(query(mdb, "FBgn0000014"))$geneSymbol, rep("abd-A", 3))
     checkEquals(mcols(query(mdb, "FBgn0000015"))$geneSymbol, rep("Abd-B", 3))
 
 } # test.flyFactorGeneSymbols
 #-------------------------------------------------------------------------------
5b88116f
 test.export_jasparFormatStdOut = function()
8c9d39f9
 {
5b88116f
   print('--- test.export_jasparFormatStdOut')
   mdb = MotifDb #()
   mdb.chicken = subset(mdb, organism=='Gallus')
   checkEquals(length(mdb.chicken), 3)
8c9d39f9
     # text is cat-ed to stdout, so not avaialable here to check.
     # but just like print, export also returns the text invisibly.
     # so that CAN be checked.
2def1eec
 
5b88116f
   jaspar.text = export(mdb.chicken, format='jaspar')
   checkEquals(length(jaspar.text), 1)   # just one long string
   checkTrue(is.character(jaspar.text))
   checkTrue(nchar(jaspar.text) > 800)   # 1002 as of(10 aug 2012)
   return(TRUE)
8c9d39f9
 
 } # test.exportjasparFormatToStdOut
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.export_jasparFormatToFile = function()
8c9d39f9
 {
5b88116f
   print('--- test.export_jasparFormatToFile')
   mdb = MotifDb #()
   mdb.chicken = subset(mdb, organism=='Gallus')
   checkEquals(length(mdb.chicken), 3)
   output.file = tempfile()
   jaspar.text = export(mdb.chicken, output.file, 'jaspar')
   retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE)
   invisible(retrieved)
77e1c264
 
8c9d39f9
 } # test.exportjasparFormatToFile
2def1eec
 #------------------------------------------------------------------------------------------------------------------------
49ebd9dc
 test.geneToMotif <- function()
2def1eec
 {
49ebd9dc
    printf("--- test.geneToMotif")
2def1eec
    mdb <- MotifDb
 
68ba7eb5
    genes <- c("FOS", "ATF5", "bogus", "SATB2")
    good.genes <- genes[-which(genes=="bogus")]
49ebd9dc
       # use  TFClass family classifcation
4acf0d78
    tbl.tfClass <- geneToMotif(mdb, genes, source="TfClaSS")   # intentional mis-capitalization
68ba7eb5
    checkTrue(all(good.genes %in% tbl.tfClass$gene))
 
    expected.motifs <- c("MA0833.1", "MA0099.2", "MA0476.1", "MA0679.1", "MA0754.1", "MA0755.1", "MA0756.1", "MA0757.1")
    checkTrue(all(expected.motifs %in% tbl.tfClass$motif))
    checkEquals(unique(tbl.tfClass$source), "TFClass")
49ebd9dc
 
       # MotifDb mode uses the MotifDb metadata, pulled from many sources
4acf0d78
    tbl.mdb <- geneToMotif(mdb, genes, source="mOtifdb")     # intentional mis-capitalization
5b88116f
    checkEquals(dim(tbl.mdb), c(14, 6))
4acf0d78
    checkEquals(subset(tbl.mdb, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1")
5b88116f
       # no recognizable(i.e., jaspar standard) motif name returned by MotifDb metadata
49ebd9dc
       # MotifDb for ATF5
       # todo: compare the MA0110596_1.02 matrix of cisp_1.02 to japar MA0833.1
 
68ba7eb5
      # check use of ignore.case
    tbl.caseSensitive <-  geneToMotif(MotifDb, "STAT4", source="MotifDb")
    checkEquals(length(grep("jaspar", tbl.caseSensitive$dataSource, ignore.case=TRUE)), 0)
    tbl.caseInsensitive <-  geneToMotif(MotifDb, "STAT4", source="MotifDb", ignore.case=TRUE)
    checkTrue(length(grep("jaspar", tbl.caseInsensitive$dataSource, ignore.case=TRUE)) >= 3)
 
    tbl.caseSensitive <-  geneToMotif(MotifDb, "stat4", source="TFclass")
    checkEquals(nrow(tbl.caseSensitive), 0)
    tbl.caseInsensitive <-  geneToMotif(MotifDb, "stat4", source="TFclass", ignore.case=TRUE)
    checkTrue(nrow(tbl.caseInsensitive) >= 5)
 
5f111ace
 } # test.geneToMotif
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 # this case discovered(31 jan 2018). when called on a gene/source combination for which there are
 # no motifs, i attempted to add the mapping source(either "MotifDb", "TFClass") as a column
68ba7eb5
 # to an empty data.frame.  check for that and its fix here
 test.geneToMotif.oneGene.noMotifs <- function()
 {
    checkEquals(nrow(geneToMotif(MotifDb, "SATB2", "MotifDb")), 0)
    checkEquals(nrow(geneToMotif(MotifDb, "bogus-arandum", "MotifDb")), 0)
    checkEquals(nrow(geneToMotif(MotifDb, "bogus-arandum", "TFclass")), 0)
 
 } # test.geneToMotif.oneGene.noMotifs
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 # sad to say I do not recall what problem/fix is tested here(pshannon, 23 jan 2018).
5f111ace
 # however, it demonstrates the variety of results which can be returned by non-jaspar datasets
 # when using the MotifDb mapping source, and the relative paucity which is sometimes
 # seen with the TFclass mapper
 test.geneToMotif.ignore.jasparSuffixes <- function()
 {
    printf("--- test.geneToMotif.ignore.jasparSuffixes")
    mdb <- MotifDb
 
    genes <- c("FOS", "ATF5", "bogus")
 
       # use  TFClass family classifcation
    tbl.tfClass <- geneToMotif(mdb, genes, source="TfClaSS")   # intentional mis-capitalization
    checkEquals(sort(tbl.tfClass$gene),  sort(c("ATF5", "FOS", "FOS")))
    checkEquals(sort(tbl.tfClass$motif),  sort(c("MA0833.1", "MA0099.2", "MA0476.1")))
    checkEquals(tbl.tfClass$source, rep("TFClass", 3))
 
       # MotifDb mode uses the MotifDb metadata, pulled from many sources
    tbl.mdb <- geneToMotif(mdb, genes, source="mOtifdb")     # intentional mis-capitalization
5b88116f
    checkEquals(dim(tbl.mdb), c(14, 6))
5f111ace
    checkEquals(subset(tbl.mdb, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1")
5b88116f
       # no recognizable(i.e., jaspar standard) motif name returned by MotifDb metadata
5f111ace
       # MotifDb for ATF5
 
       # compare the MA0110599_1.02 matrix of cisp_1.02 to japar MA0476.1: the identical matrix!
       # 1         FOS    MA0110599_1.02   cisbp_1.02  Hsapiens 24194598 MotifDb
       # 10        FOS          MA0476.1   jaspar2018  Hsapiens 17916232 MotifDb
       # this establishes the need for careful scrutiny as one winnows a geneToMotif result into
       # useful non-reduplicative sequence analysis
 
e2f7996a
    pfm.ma0110599 <- as.list(query(mdb, "MA0110599"))[[1]]
    pfm.ma0476.1  <- as.list(query(query(mdb, "MA0476.1"), "jaspar2018"))[[1]]
    checkEquals(pfm.ma0110599, pfm.ma0476.1)
5f111ace
 
e2f7996a
 } # test.geneToMotif.ignore.jasparSuffixes
49ebd9dc
 #------------------------------------------------------------------------------------------------------------------------
 test.motifToGene <- function()
 {
    printf("--- test.motifToGene")
aaf4e1a7
    #printf("Sys.getlocale: %s", Sys.getlocale())
 
      # good test case of querying both sources,
    motif <- "MA0099.2"
    tbl <- motifToGene(MotifDb, motif, c("MotifDb", "TFclass"))
       # [1]   mdb.genes: AP1, JUN::FOS, FOS::JUN, FOS::JUN
       # [1]   tfc.genes: FOS, JUN
    checkEquals(sort(unique(tbl$geneSymbol)), c("AP1", "FOS", "FOS::JUN", "JUN", "JUN::FOS"))
2def1eec
 
49ebd9dc
    motifs <- c("MA0592.2", "UP00022", "ELF1.SwissRegulon")
2def1eec
 
0aa3bd26
       # MotifDb mode uses the MotifDb metadata "providerId",
    tbl.mdb <- motifToGene(MotifDb, motifs, source="MotifDb")
fda2deaa
    checkEquals(dim(tbl.mdb), c(3, 5))
    expected <- sort(c("MA0592.2", "ELF1.SwissRegulon", "UP00022"))
1ae931f9
    actual <- sort(tbl.mdb$motif)
    checkEquals(actual, expected)
fda2deaa
    checkEquals(sort(tbl.mdb$geneSymbol), sort(c("ELF1", "Esrra", "Zfp740")))
    checkEquals(tbl.mdb$source,     rep("MotifDb", 3))
d030e74b
 
       # TFClass mode uses  TF family classifcation
0aa3bd26
    tbl.tfClass <- motifToGene(MotifDb, motifs, source="TFClass")
aaf4e1a7
    checkEquals(dim(tbl.tfClass), c(9,5))
4acf0d78
    checkEquals(tbl.tfClass$motif, rep("MA0592.2", 9))
07b8ea8b
    checkEquals(sort(tbl.tfClass$gene), sort(c("AR", "ESR1", "ESR2", "ESRRA", "ESRRB", "ESRRG", "NR3C1", "NR3C2", "PGR")))
4acf0d78
    checkEquals(tbl.tfClass$source,       rep("TFClass", 9))
49ebd9dc
 
d030e74b
      # test motifs with regex characters in them, or other characters neither letter nor number
    motifs <- sort(c("DMAP1_NCOR{1,2}_SMARC.p2", "ELK1,4_GABP{A,B1}.p3", "SNAI1..3.p2", "EWSR1-FLI1.p2", "ETS1,2.p2"))
    tbl <- motifToGene(MotifDb, motifs, source="MotifDb")
    checkEquals(nrow(tbl), 0)
 
    tbl <- motifToGene(MotifDb, motifs, source="tfclass")
aaf4e1a7
    checkEquals(ncol(tbl), 5)
d030e74b
    checkTrue(nrow(tbl) > 80)
    checkTrue(nrow(tbl) < 100)
    checkTrue(all(motifs %in% tbl$motif))
 
aaf4e1a7
    motifs <- c("Hsapiens-HOCOMOCOv10-IKZF1_HUMAN.H10MO.C",
                "MA0099.2",
                "Hsapiens-SwissRegulon-ARID3A.SwissRegulon",
                "MA0592.2",
                "MA0036913_1.02")
 
    tbl <- motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))
    checkTrue(all(motifs %in% tbl$motif))
 
    motif <- "Mmusculus;Rnorvegicus;Hsapiens-jaspar2018-FOS::JUN-MA0099.2"
    tbl <- motifToGene(MotifDb, motif, source="TFClass")
    checkEquals(tbl$geneSymbol, c("FOS", "JUN"))
 
    motifs <- c("Hsapiens-jaspar2016-RUNX1-MA0002.1",
                "Hsapiens-jaspar2016-TFAP2A-MA0003.1",
                "Hsapiens-jaspar2016-TFAP2A-MA0003.2",
                "Hsapiens-jaspar2016-TFAP2A-MA0003.3",
                "Hsapiens-jaspar2016-AR-MA0007.2",
                "MA0872.1")
    tbl <- motifToGene(MotifDb, motifs, source="Motifdb")
    checkTrue(all(motifs %in% tbl$motif))
    checkEquals(sort(unique(tbl$geneSymbol)), c("AR", "RUNX1", "TFAP2A", "TFAP2A(var.3)"))
 
    tbl <- motifToGene(MotifDb, motifs, source="TFClass")
    checkEquals(sort(unique(tbl$motif)), c("Hsapiens-jaspar2016-TFAP2A-MA0003.3", "MA0872.1"))
    checkEquals(sort(unique(tbl$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))
 
fda2deaa
    tbl <- motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))
    checkTrue(all(motifs %in% tbl$motif))
    checkEquals(sort(unique(tbl$geneSymbol)),
                     c("AR", "RUNX1", "TFAP2A", "TFAP2A(var.3)", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))
aaf4e1a7
 
5b88116f
       #(23 may 2018) found that MotifDb works, but c("MotifDb", "TFClass") does not
9535d30b
       # test the fix here
 
    motifs <- c("Hsapiens-jolma2013-IRF5-2", "Hsapiens-SwissRegulon-IRF5.SwissRegulon")
    checkEquals(motifToGene(MotifDb, motifs, source=c("MotifDb"))$geneSymbol, c("IRF5", "IRF5"))
    checkEquals(nrow(motifToGene(MotifDb, motifs, source=c("TFClass"))), 0)
    checkEquals(motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))$geneSymbol, c("IRF5", "IRF5"))
 
aaf4e1a7
    } # test.motifToGene
49ebd9dc
 #------------------------------------------------------------------------------------------------------------------------
 test.associateTranscriptionFactors <- function()
 {
    printf("--- test.associateTranscriptionFactors")
 
    mdb <- MotifDb
aaf4e1a7
    pfms <- query(mdb, andStrings=c("sapiens", "jaspar2016"))
2def1eec
 
fda2deaa
       # query(mdb, "jaspar2016", "sapiens")
49ebd9dc
       # first check motifs with MotifDb-style long names, using MotifDb lookup, in the
       # metadata of MotifDb:
       #      "Hsapiens-jaspar2016-RUNX1-MA0002.1"  "Hsapiens-jaspar2016-TFAP2A-MA0003.1"
2def1eec
 
fda2deaa
    motif.names <- c(names(pfms[1:5]), "MA0872.1", "hocus.pocus")
    tbl <- data.frame(motifName=motif.names, score=runif(7), stringsAsFactors=FALSE)
 
    tbl.anno.mdb <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=TRUE)
    checkEquals(nrow(tbl), nrow(tbl.anno.mdb))
    checkTrue(is.na(tbl.anno.mdb$geneSymbol[grep("hocus.pocus", tbl.anno.mdb$motifName)]))
    checkTrue(all(c("AR", "RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "TFAP2A(var.3)") %in% tbl.anno.mdb$geneSymbol))
 
    tbl.anno.tfc <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
    checkTrue(nrow(tbl) < nrow(tbl.anno.tfc))
    checkEquals(sort(unique(tbl.anno.tfc$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E"))
 
    tbl.anno.both <- associateTranscriptionFactors(mdb, tbl, source=c("MotifDb", "TFClass"), expand.rows=TRUE)
    checkEquals(length(grep("MotifDb", tbl.anno.both$source)), 6)
    checkEquals(length(grep("TFClass", tbl.anno.both$source)), 10)
 
    #   checkEquals(dim(tbl.anno.mdb), c(nrow(tbl), ncol(tbl) + 4))
    #   checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno.mdb)))
    #   checkEquals(sort(tbl.anno.mdb$geneSymbol),
    #               sort(c("AR", "RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "TFAP2A(var.3)")))
    #
    #      # now add in a bogus motif name, one for which there cannot possibly be a TF
    #
    #   motif.names[3] <- "bogus"
    #   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
    #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
    #   checkTrue(is.na(tbl.anno$geneSymbol[3]))
    #   checkTrue(is.na(tbl.anno$pubmedID[3]))
    #
    #      # now check motifs with short, traditional names, in "TFClass" mode, which uses
    #      # the tfFamily
    #      #      "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2"
    #
    #   motif.names <- names(pfms[1:5])
    #   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
    #   tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
    #
    #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
    #   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
    #
    #      # TFClass only annotates MA0003.3, none of the others
    #   checkTrue(all(is.na(tbl.anno$geneSymbol[-4])))
    #   checkTrue(all(is.na(tbl.anno$pubmedID[-4])))
    #   checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E")
    #   checkEquals(tbl.anno$pubmedID[4],   "23180794")
    #
    #      # now ask for expandsion of the semicolon separated list
    #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
    #   checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2))
    #   checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol))
    #
    #      # now add in a bogus motif name, one for which there cannot possibly be a TF
    #
    #   motif.names <- names(pfms[1:5])
    #   short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
    #   short.motif.names[4] <- "bogus"
    #   tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
    #
    #   tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
    #   checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
    #      # after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA
    #   checkTrue(all(is.na(tbl.anno$geneSymbol)))
    #   checkTrue(all(is.na(tbl.anno$pubmedID)))
    #
    #      # now make sure that the absence of the  TFClass-specific "shortMotif" field is detected
    #   motif.names <- names(pfms[1:5])
    #   tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
    #   checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE)
49ebd9dc
 
aaf4e1a7
       # now some motif names
094b985a
 } # test.associateTranscriptionFactors
2def1eec
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.match <- function()
 {
    printf("--- test.match")
    gr.region <- GRanges(seqnames="chr1", IRanges(start=47229520, end=47229560))
    motifs <- query(MotifDb, c("jaspar2018", "ZNF263"))
    checkEquals(length(motifs), 1)
    gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
    checkEquals(length(gr.match), 1)  # just one motif
    checkEquals(names(gr.match), names(motifs))
    checkEquals(length(gr.match[[1]]), 3)
 
    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
    checkEquals(dim(tbl.match), c(3, 7))
    checkTrue(all(tbl.match$motif == names(motifs)))
    checkEquals(class(tbl.match$chrom), "character")  # not a factor
 
    motifs <- query(MotifDb, "ZNF263", c("jaspar2018", "swissregulon"))
    checkEquals(length(motifs), 2)
    gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5)
    checkEquals(names(gr.match), names(motifs))
    checkEquals(as.numeric(lapply(gr.match, length)), c(3, 1))
 
    tbl.match <-matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
    checkEquals(dim(tbl.match), c(4, 7))
    checkEquals(length(unique(tbl.match$motif)), 2)
    checkEquals(unique(tbl.match$motif), names(motifs))
    checkEquals(colnames(tbl.match), c("chrom", "start", "end", "width", "strand", "mood.score", "motif_id"))
 
 
         #------------------------------------------------
         # now all jaspar2018 human motifs
         #------------------------------------------------
 
    motifs <- query(MotifDb, c("jaspar2018", "hsapiens"))
    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE)
    checkEquals(dim(tbl.match), c(7, 7))
    checkEquals(sort(unique(tbl.match$motif)),
                c("Hsapiens-jaspar2018-EWSR1-FLI1-MA0149.1", "Hsapiens-jaspar2018-ZNF263-MA0528.1"))
 
         #-----------------------------------------------------
         # now all jaspar2018 human motifs, loosen the pValue
         #-----------------------------------------------------
 
    motifs <- query(MotifDb, c("jaspar2018", "hsapiens"))
    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-4, fimoDataFrameStyle=TRUE)
    checkTrue(nrow(tbl.match) > 15)
 
    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-3, fimoDataFrameStyle=TRUE)
    checkTrue(nrow(tbl.match) > 50)
 
        #-------------------------------------------------------------
        # now all jaspar2018 and hocomoco human motifs across 10kb
        #------------------------------------------------------------
 
f56272b0
    motifs <- query(MotifDb, "hsapiens", orStrings=c("jaspar2018", "hocomoco-core"))
    checkTrue(length(motifs) > 500)
5b88116f
    gr.region <- GRanges(seqnames="chr1", IRanges(start=47229000, end=47239000))
 
    tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-7, fimoDataFrameStyle=TRUE)
f56272b0
    checkTrue(nrow(tbl.match) > 90 && nrow(tbl.match) < 110)
5b88116f
    checkEquals(order(tbl.match$start), seq_len(nrow(tbl.match)))
 
 } # test.match
 #------------------------------------------------------------------------------------------------------------------------
aaf4e1a7
 findMotifsWithMutuallyExclusiveMappings <- function()
 {
    xtab <- as.data.frame(table(MotifDb@manuallyCuratedGeneMotifAssociationTable$motif))
    xtab <- xtab[order(xtab$Freq, decreasing=TRUE),]
    for(motif in xtab$Var1){
       mdb.genes <- toupper(motifToGene(MotifDb, motif, "motifdb")$geneSymbol)
       tfc.genes <- toupper(motifToGene(MotifDb, motif, "tfclass")$geneSymbol)
       if(length(mdb.genes) == 0) next
       if(length(tfc.genes) == 0) next
       if(length(intersect(mdb.genes, tfc.genes)) == 0){
          printf("------ %s", motif)
          printf("  mdb.genes: %s", paste(mdb.genes, collapse=", "))
          printf("  tfc.genes: %s", paste(tfc.genes, collapse=", "))
         } # if no intersection
       } # for motif
 
    #  [1] ------ MA0099.2
    #  [1]   mdb.genes: AP1, JUN::FOS, FOS::JUN, FOS::JUN
    #  [1]   tfc.genes: FOS, JUN
 
 } # findMotifsWithMutuallyExclusiveMappings
 #------------------------------------------------------------------------------------------------------------------------
5b88116f
 test.hocomoco11.with.reliabilityScores <- function()
 {
    printf("--- test.hocomoco11.with.reliabilityScores")
 
2d7af589
      #-------------------------------------------------------------------------
      # these queries rely primarily upon the dataSoure column of the metadata
      # subsequent checks below look at metadata rownames and matrix names
      #-------------------------------------------------------------------------
 
f56272b0
    checkEquals(length(query(MotifDb, "hocomoco")), 1834)
5b88116f
    checkEquals(length(query(MotifDb, "hocomocov10")), 1066)
f56272b0
    checkEquals(length(query(MotifDb, "hocomocov11")), 768)
    checkEquals(length(query(MotifDb, "hocomocov11-core")), 400)
fe1bab0f
    checkEquals(length(query(MotifDb, "hocomocov11-secondary")), 368)
f56272b0
 
    checkEquals(length(query(MotifDb, "hocomocov11-core-A")), 181)
fe1bab0f
    checkEquals(length(query(MotifDb, "hocomocov11-secondary-A")), 46)
f56272b0
 
    checkEquals(length(query(MotifDb, "hocomocov11-core-B")), 84)
fe1bab0f
    checkEquals(length(query(MotifDb, "hocomocov11-secondary-B")), 19)
f56272b0
 
    checkEquals(length(query(MotifDb, "hocomocov11-core-C")), 135)
fe1bab0f
    checkEquals(length(query(MotifDb, "hocomocov11-secondary-C")), 13)
5b88116f
 
f56272b0
    checkEquals(length(query(MotifDb, "hocomocov11-core-D")), 0)
fe1bab0f
    checkEquals(length(query(MotifDb, "hocomocov11-secondary-D")), 290)
5b88116f
 
2d7af589
      #-------------------------------------------------------------------------
      # check matrix names
      #-------------------------------------------------------------------------
    checkEquals(length(grep("HOCOMOCOv11-core-A", names(MotifDb))), 181)
    checkEquals(length(grep("HOCOMOCOv11-secondary-A", names(MotifDb))), 46)
 
    checkEquals(length(grep("HOCOMOCOv11-core-A", rownames(mcols(MotifDb)))), 181)
    checkEquals(length(grep("HOCOMOCOv11-secondary-A", rownames(mcols(MotifDb)))), 46)
 
5b88116f
 } # test.hocomoco11.with.reliabilityScores
 #------------------------------------------------------------------------------------------------------------------------
48c3af44
 if(!interactive())
    runTests()