library(MotifDb) library(RUnit) library(MotIV) library(seqLogo) #---------------------------------------------------------------------------------------------------- printf <- function(...) print(noquote(sprintf(...))) #---------------------------------------------------------------------------------------------------- runTests = function() { test.emptyCtor() test.nonEmptyCtor() test.MotifDb.normalMode() test.MotifDb.emptyMode() test.allMatricesAreNormalized() test.providerNames() test.geneSymbols() test.geneIdsAndTypes() test.proteinIds() test.sequenceCount() test.longNames() test.organisms() test.bindingDomains() test.experimentTypes() test.tfFamilies() test.bindingSequences() test.flyBindingDomains() test.pubmedIDs() #test.allFullNames() test.subset() test.subsetWithVariables() test.queryOldStyle() test.query() test.transformMatrixToMemeRepresentation() test.matrixToMemeText() test.export_memeFormatStdOut() test.export_memeFormatToFile() test.export_memeFormatToFileDuplication() test.export_memeFormatToFile_run_tomtom() #test.MotIV.toTable() #test.run_MotIV.motifMatch() test.flyFactorGeneSymbols() test.export_jasparFormatStdOut() test.export_jasparFormatToFile() test.geneToMotif() test.geneToMotif.ignore.jasparSuffixes() test.geneToMotif.oneGene.noMotifs test.motifToGene() test.associateTranscriptionFactors() test.match() test.hocomoco11.with.reliabilityScores() } # runTests #------------------------------------------------------------------------------------------------------------------------ test.emptyCtor = function() { print('--- test.emptyCtor') motif.list = MotifDb:::MotifList() checkEquals(length(motif.list), 0) } # test.emptyCtor #------------------------------------------------------------------------------------------------------------------------ test.nonEmptyCtor = function() { print('--- test.nonEmptyCtor') mtx = matrix(runif(20), nrow=4, ncol=5, byrow=T, dimnames=list(c('A', 'C', 'G', 'T'), as.character(1:5))) mtx.normalized = apply(mtx, 2, function(colvector) colvector / sum(colvector)) matrixList = list(mtx.normalized) tbl.md = data.frame(providerName='', providerId='', dataSource='', geneSymbol='', geneId='', geneIdType='', proteinId='', proteinIdType='', organism='', sequenceCount='', bindingSequence='', bindingDomain='', tfFamily='', experimentType='', pubmedID='', stringsAsFactors=FALSE) names(matrixList) = 'test' rownames(tbl.md) = 'test' motif.list = MotifDb:::MotifList(matrixList, tbl.md) checkEquals(length(motif.list), 1) } # test.nonEmptyCtor #------------------------------------------------------------------------------------------------------------------------ # 'normal' in that all included data sources are already loaded test.MotifDb.normalMode = function() { print('--- test.MotifDb.normalMode') mdb = MotifDb # (quiet=TRUE) #(5 jun 2012) # JASPAR_CORE: 459 # ScerTF: 196 # UniPROBE: 380 # FlyFactorSurvey: 614 # hPDI: 437 checkTrue(length(mdb) > 2080) } # test.MotifDb.normalMode #------------------------------------------------------------------------------------------------------------------------ # this mode is not intended for users, but may see use in the future. test.MotifDb.emptyMode = function() { print('--- test.MotifDb.emptyMode') mdb = MotifDb:::.MotifDb(loadAllSources=FALSE, quiet=TRUE) checkTrue(length(mdb) == 0) } # test.MotifDb.emptyMode #------------------------------------------------------------------------------------------------------------------------ # 3 jaspar matrices arrive with organism = NA. find and fix. make sure here. # NA-JASPAR_CORE-TBP-MA0108.2: JASPAR gives for speciesID # NA-JASPAR_CORE-HNF4A-MA0114.1: JASPAR gives for speciesID # NA-JASPAR_CORE-CEBPA-MA0102.2: JASPAR gives '-' for speciesID, website says 'vertebrates' # Many more NA's exist...need to fix these; here's a quick fix for now test.noNAorganisms = function() { print('--- test.noNAorganisms') #checkEquals(which(is.na(mcols(MotifDb)$organism)), integer(0)) # There's a fair number of NA organisms, mostly due to including the homer DB checkEquals(sum(is.na(mcols(MotifDb)$organism)), 366) } # test.noNAorganisms #------------------------------------------------------------------------------------------------------------------------ test.allMatricesAreNormalized = function() { print('--- test.allMatricesAreNormalized') mdb = MotifDb#(quiet=TRUE) matrices = mdb@listData # a lenient test required by "Cparvum-UniPROBE-Cgd2_3490.UP00395" and "Hsapiens-UniPROBE-Sox4.UP00401" # for reasons not yet explored. 10e-8 should be be possible checkTrue(all(sapply(matrices, function(m) all(abs(colSums(m) - 1.0) < 0.02)))) } # test.allMatricesAreNormalized #------------------------------------------------------------------------------------------------------------------------ test.providerNames = function() { print('--- test.getProviderNames') mdb = MotifDb #() pn = mcols(mdb)$providerName checkEquals(length(which(is.na(pn))), 0) checkEquals(length(which(pn == '')), 0) } # test.providerNames #------------------------------------------------------------------------------------------------------------------------ test.geneSymbols = function() { print('--- test.getGeneSymbols') mdb = MotifDb #() syms = mcols(mdb)$geneSymbol checkEquals(length(which(is.na(syms))), 683) # no symols yet for the dgf stamlab motifs checkEquals(length(which(syms == '')), 0) } # test.geneSymbols #------------------------------------------------------------------------------------------------------------------------ test.geneIdsAndTypes = function() { print('--- test.getGeneIdsAndTypes') mdb = MotifDb tbl <- mcols(mdb) geneIds = tbl$geneId geneIdTypes = tbl$geneIdType typeCounts = as.list(table(geneIdTypes)) checkTrue(typeCounts$ENTREZ > 2300) checkTrue(typeCounts$FLYBASE >= 45) checkTrue(typeCounts$SGD >= 600) checkTrue(nrow(subset(tbl, is.na(geneIdType))) > 2000) empty.count = length(which(geneIds == '')) checkEquals(empty.count, 0) } # test.geneIdsAndTypes #------------------------------------------------------------------------------------------------------------------------ # make sure that all proteinIds have explicit values, either proper identifiers or NA # currently tested by looking for empty string assignments test.proteinIds = function() { print('--- test.proteinIds') mdb = MotifDb #(quiet=TRUE) NA.string.count <- sum(is.na(mcols(mdb)$proteinId)) # NA.string.count = length(grep('NA', mcols(mdb)$proteinId)) checkEquals(NA.string.count, 2514) # FIX THIS; Currently 2514 don't have protein IDs #checkEquals(NA.string.count, 0) empty.count = length(which(mcols(mdb)$proteinId=="")) if(empty.count > 0) browser('test.proteinIds') checkEquals(empty.count, 0) # FlyFactorSurvey, as digested by me, had a blanket assigment of UNIPROT to all proteinIds # Herve' pointed out that this applied also to entries with no proteinId. # make sure this is fixed ### FIX THIS TOO! Currently have 913 entries with a proteinIdType and no proteinId x = mcols(mdb) # checkEquals(nrow(subset(x, !is.na(proteinIdType) & is.na(proteinId))), 0) } # test.proteinIds #------------------------------------------------------------------------------------------------------------------------ # only for UniPROBE do we not have sequence count. might be possible to get them along with 'insertion sequences' test.sequenceCount = function() { print('--- test.sequenceCount') mdb = MotifDb #() x = mcols(mdb) if(interactive()) { x.up = subset(x, dataSource == 'UniPROBE') checkTrue(all(is.na(x.up$sequenceCount))) } else { uniprobe.indices = which(x$dataSource == 'UniPROBE') checkTrue(all(is.na(x$sequenceCount [uniprobe.indices]))) } } # test.sequenceCount #------------------------------------------------------------------------------------------------------------------------ # make sure that a legitimate organism-dataSource-identifier is supplied for each matrix and as a rowname # of the corresponding DataFrame test.longNames = function() { print('--- test.longNames') mdb <- MotifDb longNames <- strsplit(names(mdb), '-') organisms <- unique(sapply(longNames, '[', 1)) dataSources <- unique(lapply(longNames, '[', 2)) recognized.dataSources <- c("cisbp_1.02", "FlyFactorSurvey", "HOCOMOCOv10", "HOCOMOCOv11B-core", "HOCOMOCOv11C-core", "HOCOMOCOv11B-full", "HOCOMOCOv11C-full", "HOCOMOCOv11A-core", "HOCOMOCOv11D-full", "HOCOMOCOv11A-full", "HOMER", "hPDI", "JASPAR_CORE", "JASPAR_2014", "jaspar2016", "jaspar2018", "jolma2013", "ScerTF", "stamlab", "SwissRegulon", "UniPROBE") recognized.dataSources <- c("cisbp_1.02", "FlyFactorSurvey", "HOCOMOCOv10", "HOCOMOCOv11-core-A", "HOCOMOCOv11-core-B", "HOCOMOCOv11-core-C", "HOCOMOCOv11-full-A", "HOCOMOCOv11-full-B", "HOCOMOCOv11-full-C", "HOCOMOCOv11-full-D", "HOMER", "hPDI", "JASPAR_2014", "JASPAR_CORE", "jaspar2016", "jaspar2018", "jolma2013", "ScerTF", "stamlab", "SwissRegulon", "UniPROBE") recognized.organisms <- unique(mcols(mdb)$organism) # a few(3) matrices from JASPAR core have NA organism. make this into a character # so that it can be matched up against the 'NA' extracted from longNames just above na.indices <- which(is.na(recognized.organisms)) if(length(na.indices) > 0) recognized.organisms [na.indices] <- 'NA' checkTrue(all(organisms %in% recognized.organisms)) # new hocomoco-[core|full]-[ABCD] dataSource names are not incorporated into rownames yet # checkTrue(all(dataSources %in% recognized.dataSources)) } # test.longNames #------------------------------------------------------------------------------------------------------------------------ # make sure that a legitimate organism is specified for each matrix test.organisms <- function() { print('--- test.organisms') mdb <- MotifDb #(quiet=TRUE) organisms <- mcols(mdb)$organism # jaspar_core has 3 NA speciesId: TBP, HNF4A and CEBPA(MA0108.2, MA0114.1, MA0102.2) # their website shows these as vertebrates, which I map to 'Vertebrata'. An organismID of '-' # gets the same treatment, matching website also. ### Note: this failing test is the same as the test.noNAorganisms test! # As in case of noNA, need to add organisms for these #checkEquals(which(is.na(mcols(MotifDb)$organism)), integer(0)) empty.count <- length(which(mcols(mdb)$organism=="")) checkEquals(empty.count, 0) } # test.organisms #------------------------------------------------------------------------------------------------------------------------ test.bindingDomains <- function() { print('--- test.bindingDomains') mdb <- MotifDb #(quiet=TRUE) checkTrue(length(unique(mcols(mdb)$bindingDomain)) > 1) } # test.bindingDomains #------------------------------------------------------------------------------------------------------------------------ test.flyBindingDomains <- function() { print('--- test.flyBindingDomains') x <- mcols(MotifDb) tmp <- as.list(head(sort(table(subset(x, organism=='Dmelanogaster')$bindingDomain), decreasing=TRUE), n=3)) # these counts will likely change with a fresh load of data from FlyFactorSurvey. checkEquals(tmp$Homeobox, 212) checkEquals(tmp[['zf-C2H2']], 160) checkEquals(tmp[["Helix-Turn-Helix"]], 182) checkTrue(length(which(is.na(subset(x, organism=='Dmelanogaster')$bindingDomain))) > 300) # lots of cisbp } # test.flyBindingDomains #------------------------------------------------------------------------------------------------------------------------ test.experimentTypes <- function() { print('--- test.experimentTypes') mdb <- MotifDb #(quiet=TRUE) x <- mcols(mdb) checkTrue(length(unique(x$experimentType)) >= 18) checkEquals(length(which(x$experimentType=='')), 0) } # test.experimentTypes #------------------------------------------------------------------------------------------------------------------------ test.tfFamilies = function() { print('--- test.tfFamilies') mdb = MotifDb #(quiet=TRUE) checkTrue(length(unique(mcols(mdb)$tfFamily)) > 1) } # test.tfFamilies #------------------------------------------------------------------------------------------------------------------------ test.bindingSequences = function() { print('--- test.bindingSequences') mdb = MotifDb #(quiet=TRUE) checkTrue(length(unique(mcols(mdb)$bindingSequence)) > 1) } # test.tfFamilies #------------------------------------------------------------------------------------------------------------------------ test.pubmedIDs = function() { print('--- test.pubmedIDs') x = mcols(MotifDb) #(quiet=TRUE)) checkTrue(length(unique(x$pubmedID)) >= 139) checkEquals(length(which(x$pubmedID == '')), 0) } # test.pubmedIDs #------------------------------------------------------------------------------------------------------------------------ # every matrix, and every corresponding metadata table row, has a name formed thus: # # dataSource-organism-nativeGeneName # # where nativeGeneName is how the matrix is named in the dataSource from which it comes. # thus: # FlyFactorSurvey-Dmelanogaster-Abd.A_FlyReg_FBgn0000014 # UniPROBE-Scerevisiae-Asg1-UP00350 # ScerTF-Scerevisiae-ABF2-badis # JASPAR_CORE-Rrattus-Ar-MA0007.1 # skip.test.allFullNames = function() { print('--- test.allFullNames') mdb = MotifDb #(quiet=TRUE) matrices = mdb@listData fullNames = names(matrices) all.dataSources = unique(mcols(mdb)$dataSource) checkTrue(length(all.dataSources) >= 4) for(source in all.dataSources) { this.dataSource <- source matrices.by.source = subset(mdb, dataSource==this.dataSource) matrix.name = names(matrices.by.source)[1] # FlyFactorSurvey: Dmelanogaster-FlyFactorSurvey-ab_SANGER_10_FBgn0259750 # hPDI: Hsapiens-hPDI-ABCF2 # JASPAR_CORE: JASPAR_CORE-Athaliana-AGL3-MA0001.1 # UniPROBE: Hsapiens-UniPROBE-Sox4.UP00401 checkTrue(grep(this.dataSource, matrix.name) == 1) #printf('%20s: %s', source, matrix.name) } return(TRUE) } # test.allFullNames #------------------------------------------------------------------------------------------------------------------------ test.subset = function() { if(interactive()) { print('--- test.subset') mdb = MotifDb checkTrue('geneSymbol' %in% colnames(elementMetadata(mdb))) mdb.sub = subset(mdb, geneSymbol=='ABCF2') checkEquals(length(mdb.sub), 1) } # if interactive } # test.subset #------------------------------------------------------------------------------------------------------------------------ test.subsetWithVariables = function() { if(interactive()) { print('--- test.subsetWithVariables') mdb = MotifDb #() checkTrue('geneSymbol' %in% colnames(elementMetadata(mdb))) target.gene <<- 'ABCF2' mdb.sub = subset(mdb, geneSymbol==target.gene) checkEquals(length(mdb.sub), 1) } # if interactive } # test.subsetWithVariables #------------------------------------------------------------------------------------------------------------------------ # "old style": just one query term allowed test.queryOldStyle = function() { print('--- test.queryOldStyle') mdb = MotifDb # do queries on dataSource counts match those from a contingency table? sources.list = as.list(table(mcols(mdb)$dataSource)) checkEquals(length(query(mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey) checkEquals(length(query(mdb, 'uniprobe')), sources.list$UniPROBE) checkEquals(length(query(mdb, 'UniPROBE')), sources.list$UniPROBE) # gene symbols which begin with 'sox' are quite common. can we them? # there are currently(19 jul 2012) 18, but since this may change, our test is approximate # Change on 8/1/2017: increase top limit of sox entries as they've expanded sox.entries = query(mdb, '^sox') checkTrue(length(sox.entries) > 10) checkTrue(length(sox.entries) < 200) # manual inspection reveals that some of these genes have names which are all capitalized. test that. checkTrue(length(query(mdb, '^sox', ignore.case=TRUE)) > length(query(mdb, '^SOX', ignore.case=FALSE))) # make sure that queries can be stacked up, and that order of call does not affect the outcome uniprobe.sox.matrices = query(query(mdb, 'uniprobe'), '^sox') sox.uniprobe.matrices = query(query(mdb, '^sox'), 'uniprobe') checkEquals(length(uniprobe.sox.matrices), length(sox.uniprobe.matrices)) # an approximate count check checkTrue(length(uniprobe.sox.matrices) > 10) checkTrue(length(uniprobe.sox.matrices) < 30) checkEquals(unique(mcols(uniprobe.sox.matrices)$dataSource), 'UniPROBE') checkEquals(unique(mcols(sox.uniprobe.matrices)$dataSource), 'UniPROBE') gene.symbols = sort(unique(mcols(uniprobe.sox.matrices)$geneSymbol)) # query on a string(in this case, an oddly named motif) which contains # regular expression characters. no solution to this yet. # query uses base R's grep, in which # x <- query(mdb, "ELK1,4_GABP{A,B1}.p3") } # test.queryOldStyle #------------------------------------------------------------------------------------------------------------------------ test.query <- function() { print('--- test.query') mdb = MotifDb ors <- c("MA0511.1", "MA0057.1") ands <- c("jaspar2018", "sapiens") nots <- "cisbp" x <- query(mdb, andStrings=ands, orStrings=ors) checkEquals(length(x), 2) checkEquals(sort(names(x)), c("Hsapiens-jaspar2018-MZF1(var.2)-MA0057.1", "Hsapiens-jaspar2018-RUNX2-MA0511.1")) x <- query(mdb, andStrings="MA0057.1") checkEquals(length(x), 15) x <- query(mdb, andStrings=c("MA0057.1", "cisbp")) checkEquals(length(x), 11) x <- query(mdb, andStrings=c("MA0057.1"), notStrings="cisbp") checkEquals(length(x), 4) x <- query(mdb, andStrings=c("MA0057.1"), notStrings=c("cisbp", "JASPAR_2014")) checkEquals(length(x), 3) x <- query(mdb, orStrings=c("mus", "sapiens"), andStrings="MA0057.1") #checkEquals(sort(names(x)), # do queries on dataSource counts match those from a contingency table? sources.list = as.list(table(mcols(mdb)$dataSource)) checkEquals(length(query(mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey) checkEquals(length(query(mdb, 'uniprobe')), sources.list$UniPROBE) checkEquals(length(query(mdb, 'UniPROBE')), sources.list$UniPROBE) } # test.query #------------------------------------------------------------------------------------------------------------------------ test.query2 <- function() { mdb <- MotifDb matrices.human <- query(mdb, 'hsapiens') matrices.sox4 <- query(mdb, 'sox4') matrices.human.sox4 <- query(mdb, c("hsapiens", "sox4")) matrices.human.sox4.oldStyle <- query(matrices.human, "sox4") checkTrue(length(matrices.human.sox4) > 0) # 6 found on(24 oct 2018) checkEquals(length(matrices.human.sox4), length(matrices.human.sox4.oldStyle)) checkTrue(length(matrices.human.sox4) < length(matrices.human)) checkTrue(length(matrices.human.sox4) < length(matrices.sox4)) uniprobe.sox.matrices <- query(mdb, c('uniprobe', '^sox')) } # test.query2 #------------------------------------------------------------------------------------------------------------------------ test.transformMatrixToMemeRepresentation = function() { print('--- test.transformMatrixToMemeRepresentation') mdb = MotifDb #() matrix.agl3 = subset(mdb, dataSource=='JASPAR_CORE' & organism=='Athaliana' & geneSymbol=='AGL3')[[1]] checkEquals(dim(matrix.agl3), c(4, 10)) # a transposed frequency is needed for meme. we store normalized frequency matrices, to just transposition is needed tfm = MotifDb:::transformMatrixToMemeRepresentation(matrix.agl3) checkEquals(dim(tfm), c(10, 4)) checkTrue(all(round(rowSums(tfm)) == 1.0)) } # test.transformMatrixToMemeRepresentation #------------------------------------------------------------------------------------------------------------------------ # translate a motif matrix from MotifList into text suitable for meme and tomtom input. # choose the top two from UniPROBE. they reliably have high counts, and easily distinguished normalized frequencies test.matrixToMemeText = function() { print('--- test.matrixToMemeText') sox4 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4') rtg3 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3') text.sox4 = MotifDb:::matrixToMemeText(sox4) # check these with t(sox4 [[1]]) line1.sox4 = " 0.2457457130 0.1950426500 0.2287887620 0.3304228750" line14.sox4 = " 0.2821643030 0.2286132160 0.1585395830 0.3306828990" checkEquals(length(text.sox4), 29) checkEquals(text.sox4 [1], "MEME version 4") checkEquals(text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401") checkEquals(grep(line1.sox4, text.sox4), 12) checkEquals(grep(line14.sox4, text.sox4), 25) text.rtg3 = MotifDb:::matrixToMemeText(rtg3) line1.rtg3 = " 0.3935122858 0.1453016447 0.3308830322 0.1303030373" line20.rtg3 = " 0.2490417648 0.3966478493 0.1083586569 0.2459517291" checkEquals(length(text.rtg3), 35) # 4 trailing empty lines checkEquals(text.rtg3 [1], "MEME version 4") checkEquals(text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356") checkEquals(grep(line1.rtg3, text.rtg3), 12) checkEquals(grep(line20.rtg3, text.rtg3), 31) # now call with both matrices, and see if the right results are returned text.both = MotifDb:::matrixToMemeText(c(sox4, rtg3)) checkEquals(text.both [1], "MEME version 4") checkEquals(grep(line1.sox4, text.both), 12) checkEquals(grep(line14.sox4, text.both), 25) checkEquals(grep(line1.rtg3, text.both), 29) checkEquals(grep(line20.rtg3, text.both), 48) } # test.matrixToMemeText #------------------------------------------------------------------------------------------------------------------------ # translate a motif matrix from MotifList into text suitable for meme and tomtom input. # choose the top two from UniPROBE. they reliably have high counts, and easily distinguished normalized frequencies #test.matrixToMemeText_mapVersion = function() #{ # print('--- test.matrixToMemeText_mapVersion') # sox4 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4') # rtg3 = subset(MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3') # # text.sox4 = MotifDb:::mapped.broken.matrixToMemeText(sox4) # text.rtg3 = MotifDb:::mapped.broken.matrixToMemeText(rtg3) # text.both = MotifDb:::mapped.broken.matrixToMemeText(c(sox4, rtg3)) # # # check these with t(sox4 [[1]]) # line1.sox4 = " 0.2457457130 0.1950426500 0.2287887620 0.3304228750" # line14.sox4 = " 0.2821643030 0.2286132160 0.1585395830 0.3306828990" # # checkEquals(length(text.sox4), 29) # checkEquals(text.sox4 [1], "MEME version 4") # checkEquals(text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401") # checkEquals(grep(line1.sox4, text.sox4), 12) # checkEquals(grep(line14.sox4, text.sox4), 25) # # line1.rtg3 = " 0.3935122858 0.1453016447 0.3308830322 0.1303030373" # line20.rtg3 = " 0.2490417648 0.3966478493 0.1083586569 0.2459517291" # checkEquals(length(text.rtg3), 35) # 4 trailing empty lines # checkEquals(text.rtg3 [1], "MEME version 4") # checkEquals(text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356") # checkEquals(grep(line1.rtg3, text.rtg3), 12) # checkEquals(grep(line20.rtg3, text.rtg3), 31) # # # now call with both matrices, and see if the right results are returned # checkEquals(text.both [1], "MEME version 4") # checkEquals(grep(line1.sox4, text.both), 12) # checkEquals(grep(line14.sox4, text.both), 25) # checkEquals(grep(line1.rtg3, text.both), 29) # checkEquals(grep(line20.rtg3, text.both), 48) # #} # test.matrixToMemeText_mapVersion #------------------------------------------------------------------------------------------------------------------------ test.export_memeFormatStdOut = function() { print('--- test.export_memeFormatStdOut') mdb = MotifDb #() mdb.chicken = subset(mdb, organism=='Gallus') checkEquals(length(mdb.chicken), 3) # text is cat-ed to stdout, so not avaialable here to check. # but just like print, export also returns the text invisibly. # so that CAN be checked. meme.text = export(mdb.chicken, format='meme') checkEquals(length(meme.text), 1) # just one long string checkTrue(is.character(meme.text)) checkTrue(nchar(meme.text) > 800) # 1002 as of(10 aug 2012) return(TRUE) } # test.exportMemeFormatToStdOut #------------------------------------------------------------------------------------------------------------------------ test.export_memeFormatToFile = function() { print('--- test.export_memeFormatToFile') mdb = MotifDb #() mdb.chicken = subset(mdb, organism=='Gallus') checkEquals(length(mdb.chicken), 3) output.file = tempfile() meme.text = export(mdb.chicken, output.file, 'meme') retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE) invisible(retrieved) } # test.exportMemeFormatToFile #------------------------------------------------------------------------------------------------------------------------ test.export_memeFormatToFileDuplication = function() { print('--- test.export_memeFormatToFileDuplication') mdb = MotifDb #() mdb.mouse = subset(mdb, organism=='Mmusculus') checkTrue(length(mdb.mouse) > 1300) output.file = 'mouse.txt' # tempfile() max = 3 meme.text = export(mdb.mouse [1:max], output.file, 'meme') retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE) invisible(retrieved) } # test.exportMemeFormatToFileDuplication #------------------------------------------------------------------------------------------------------------------------ test.export_memeFormatToFile_run_tomtom = function(max=50) { if(interactive()) { print('--- test.export_memeFormatToFile_run_tomtom') mdb = MotifDb sox4.mouse = subset(mdb, organism=='Mmusculus' & geneSymbol=='Sox4') all.human = subset(mdb, organism=='Hsapiens') tomtom.tmp.dir = tempfile() print(tomtom.tmp.dir) stopifnot(dir.create(tomtom.tmp.dir)) sox4.file.path = file.path(tomtom.tmp.dir, 'sox4.mouse.text') all.human.file.path = file.path(tomtom.tmp.dir,'all.human.text') export(sox4.mouse, sox4.file.path, 'meme') export(all.human, all.human.file.path, 'meme') # find similarity of motif #1 to all the motifs in mdbMany # cannot rely upon tomtom being present #cmd = sprintf('tomtom -no-ssc -oc %s -verbosity 3 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10 %s %s', # tomtom.tmp.dir, sox4.file.path, all.human.file.path) #system(cmd) #cmd = sprintf('open %s/tomtom.html', tomtom.tmp.dir) #system(cmd) } # if interactive } # test.export_memeFormatToFile_run_tomtom #------------------------------------------------------------------------------------------------------------------------ # MotIV::motifMatch fails with MotIV_1.25.0. will look into this in September, 2015, pshannon # MotIV abandoned, all of my own motif/sequence matching is done via an independently installed # FIMO from the meme suite. # another option is the MOODs motif matching capability provided by the bioc package "motifmatchr" # this possibility, and these tests, are deferred for now. disabled_test.run_MotIV.motifMatch = function() { require(MotIV) print('--- test.run_MotIV.motifMatch') mdb <- MotifDb #() db.tmp <- mdb@listData # match motif 1 against the entire MotifDb collection motif.hits <- motifMatch(db.tmp [1], database<-db.tmp) # the long way to extract the matrix name. see MotIV.toTable below for more convenient way checkEquals(motif.hits@bestMatch[[1]]@aligns[[1]]@TF@name, names(db.tmp)[1]) # match the last motif against all last <- length(db.tmp) # MotIV:motifMatch works differently on linux and macos. by asking for 50 matches, # the search target(db.tmp[last]) is sure to be in the hit list. motif.hits <- motifMatch(db.tmp [last], database=db.tmp, top=50) tbl.hits <- MotIV.toTable(motif.hits) # the 5 hits return should include the one we tried to match, but the MotIV search strategy # may not place it first checkTrue(names(db.tmp[last]) %in% tbl.hits$name) invisible(tbl.hits) } # distable_test.run_MotIV.motifMatch #------------------------------------------------------------------------------------------------------------------------ MotIV.toTable = function(match) { stopifnot(length(match@bestMatch) >= 1) alignments = match@bestMatch[[1]]@aligns df = data.frame(stringsAsFactors=FALSE) for(alignment in alignments) { x = alignment name = x@TF@name eVal = x@evalue sequence = x@sequence match = x@match strand = x@strand df = rbind(df, data.frame(name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE)) } # for alignment return(df) } # MotIV.toTable #------------------------------------------------------------------------------------------------------------------------ disabled_test.MotIV.toTable = function() { print('--- test.MotIVtoTable') mdb = MotifDb #() test.hits = motifMatch(mdb[1]@listData, database=jaspar) tbl.hits = MotIV.toTable(test.hits) checkEquals(dim(tbl.hits), c(5, 5)) checkEquals(sort(colnames(tbl.hits)), sort(c("name", "eVal", "sequence", "match", "strand"))) } # test.MotIV.toTable #------------------------------------------------------------------------------------------------------------------------ pwmMatch.toTable = function(motifMatch) { if(length(motifMatch@bestMatch) == 0) return(NA) df.list = vector("list", length(motifMatch@bestMatch)) for(k in seq(length(motifMatch@bestMatch))) { alignments = motifMatch@bestMatch[[k]]@aligns df = data.frame(stringsAsFactors=FALSE) for(alignment in alignments) { x = alignment name = x@TF@name eVal = x@evalue sequence = x@sequence match = x@match strand = x@strand df = rbind(df, data.frame(name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE)) } # for alignment df.list[[k]]=df } names(df.list) <- names(motifMatch) return(df.list) } # pwmMatch.toTable #------------------------------------------------------------------------------ # Robert Stojnic reports incorrect gene symbols for matrices obtained from # flyFactorSurvey. # the solution was to abandon the original strategy of extracting the # symbol from the matrix(and file) name. # now, the flybase importer("inst/scripts/import/flyFactorSurvey/import.R") # uses FBgn id(which can be reliably extracted) and uses indpendent # data sources to learn the gene symbol. # # robert's email: # I'm working on using MotifDb motifs in my PWMEnrich package and I # have noticed that there is a slight problem with gene symbols for # Drosophila. In particular, the gene symbols do not always correspond # to the gene ID and are frequently mis-capitalized. In Drosophila z # and Z are two different genes and capitalization does matter if # someone is to use the gene symbols. Also, in some cases the symbols # are missing hyphens or parenthesis. I have used the gene IDs and the # Flybase annotation database to set the correct gene symbols for # Drosophila, please find attached the result of my re-annotation. # # looking at his correctedMotifDbDmel.csv # # head(read.table("correctedMotifDbDmel.csv", sep=",", header=TRUE, stringsAsFactors=FALSE)) # providerName oldGeneSymbol newGeneSymbol # 1 ab_SANGER_10_FBgn0259750 Ab ab # 2 ab_SOLEXA_5_FBgn0259750 Ab ab # 3 Abd-A_FlyReg_FBgn0000014 Abd-a abd-A # 4 Abd-B_FlyReg_FBgn0000015 Abd-b Abd-B # 5 AbdA_Cell_FBgn0000014 Abda abd-A # 6 AbdA_SOLEXA_FBgn0000014 Abda abd-A # test.flyFactorGeneSymbols <- function() { print("--- test.flyFactorGeneSymbols") mdb = MotifDb checkEquals(sort(mcols(query(mdb, "FBgn0259750"))$geneSymbol), sort(c("FBgn0259750", "FBgn0259750"))) checkEquals(mcols(query(mdb, "FBgn0000014"))$geneSymbol, rep("abd-A", 3)) checkEquals(mcols(query(mdb, "FBgn0000015"))$geneSymbol, rep("Abd-B", 3)) } # test.flyFactorGeneSymbols #------------------------------------------------------------------------------- test.export_jasparFormatStdOut = function() { print('--- test.export_jasparFormatStdOut') mdb = MotifDb #() mdb.chicken = subset(mdb, organism=='Gallus') checkEquals(length(mdb.chicken), 3) # text is cat-ed to stdout, so not avaialable here to check. # but just like print, export also returns the text invisibly. # so that CAN be checked. jaspar.text = export(mdb.chicken, format='jaspar') checkEquals(length(jaspar.text), 1) # just one long string checkTrue(is.character(jaspar.text)) checkTrue(nchar(jaspar.text) > 800) # 1002 as of(10 aug 2012) return(TRUE) } # test.exportjasparFormatToStdOut #------------------------------------------------------------------------------------------------------------------------ test.export_jasparFormatToFile = function() { print('--- test.export_jasparFormatToFile') mdb = MotifDb #() mdb.chicken = subset(mdb, organism=='Gallus') checkEquals(length(mdb.chicken), 3) output.file = tempfile() jaspar.text = export(mdb.chicken, output.file, 'jaspar') retrieved = scan(output.file, what=character(0), sep='\n', quiet=TRUE) invisible(retrieved) } # test.exportjasparFormatToFile #------------------------------------------------------------------------------------------------------------------------ test.geneToMotif <- function() { printf("--- test.geneToMotif") mdb <- MotifDb genes <- c("FOS", "ATF5", "bogus", "SATB2") good.genes <- genes[-which(genes=="bogus")] # use TFClass family classifcation tbl.tfClass <- geneToMotif(mdb, genes, source="TfClaSS") # intentional mis-capitalization checkTrue(all(good.genes %in% tbl.tfClass$gene)) expected.motifs <- c("MA0833.1", "MA0099.2", "MA0476.1", "MA0679.1", "MA0754.1", "MA0755.1", "MA0756.1", "MA0757.1") checkTrue(all(expected.motifs %in% tbl.tfClass$motif)) checkEquals(unique(tbl.tfClass$source), "TFClass") # MotifDb mode uses the MotifDb metadata, pulled from many sources tbl.mdb <- geneToMotif(mdb, genes, source="mOtifdb") # intentional mis-capitalization checkEquals(dim(tbl.mdb), c(14, 6)) checkEquals(subset(tbl.mdb, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1") # no recognizable(i.e., jaspar standard) motif name returned by MotifDb metadata # MotifDb for ATF5 # todo: compare the MA0110596_1.02 matrix of cisp_1.02 to japar MA0833.1 # check use of ignore.case tbl.caseSensitive <- geneToMotif(MotifDb, "STAT4", source="MotifDb") checkEquals(length(grep("jaspar", tbl.caseSensitive$dataSource, ignore.case=TRUE)), 0) tbl.caseInsensitive <- geneToMotif(MotifDb, "STAT4", source="MotifDb", ignore.case=TRUE) checkTrue(length(grep("jaspar", tbl.caseInsensitive$dataSource, ignore.case=TRUE)) >= 3) tbl.caseSensitive <- geneToMotif(MotifDb, "stat4", source="TFclass") checkEquals(nrow(tbl.caseSensitive), 0) tbl.caseInsensitive <- geneToMotif(MotifDb, "stat4", source="TFclass", ignore.case=TRUE) checkTrue(nrow(tbl.caseInsensitive) >= 5) } # test.geneToMotif #------------------------------------------------------------------------------------------------------------------------ # this case discovered(31 jan 2018). when called on a gene/source combination for which there are # no motifs, i attempted to add the mapping source(either "MotifDb", "TFClass") as a column # to an empty data.frame. check for that and its fix here test.geneToMotif.oneGene.noMotifs <- function() { checkEquals(nrow(geneToMotif(MotifDb, "SATB2", "MotifDb")), 0) checkEquals(nrow(geneToMotif(MotifDb, "bogus-arandum", "MotifDb")), 0) checkEquals(nrow(geneToMotif(MotifDb, "bogus-arandum", "TFclass")), 0) } # test.geneToMotif.oneGene.noMotifs #------------------------------------------------------------------------------------------------------------------------ # sad to say I do not recall what problem/fix is tested here(pshannon, 23 jan 2018). # however, it demonstrates the variety of results which can be returned by non-jaspar datasets # when using the MotifDb mapping source, and the relative paucity which is sometimes # seen with the TFclass mapper test.geneToMotif.ignore.jasparSuffixes <- function() { printf("--- test.geneToMotif.ignore.jasparSuffixes") mdb <- MotifDb genes <- c("FOS", "ATF5", "bogus") # use TFClass family classifcation tbl.tfClass <- geneToMotif(mdb, genes, source="TfClaSS") # intentional mis-capitalization checkEquals(sort(tbl.tfClass$gene), sort(c("ATF5", "FOS", "FOS"))) checkEquals(sort(tbl.tfClass$motif), sort(c("MA0833.1", "MA0099.2", "MA0476.1"))) checkEquals(tbl.tfClass$source, rep("TFClass", 3)) # MotifDb mode uses the MotifDb metadata, pulled from many sources tbl.mdb <- geneToMotif(mdb, genes, source="mOtifdb") # intentional mis-capitalization checkEquals(dim(tbl.mdb), c(14, 6)) checkEquals(subset(tbl.mdb, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1") # no recognizable(i.e., jaspar standard) motif name returned by MotifDb metadata # MotifDb for ATF5 # compare the MA0110599_1.02 matrix of cisp_1.02 to japar MA0476.1: the identical matrix! # 1 FOS MA0110599_1.02 cisbp_1.02 Hsapiens 24194598 MotifDb # 10 FOS MA0476.1 jaspar2018 Hsapiens 17916232 MotifDb # this establishes the need for careful scrutiny as one winnows a geneToMotif result into # useful non-reduplicative sequence analysis pfm.ma0110599 <- as.list(query(mdb, "MA0110599"))[[1]] pfm.ma0476.1 <- as.list(query(query(mdb, "MA0476.1"), "jaspar2018"))[[1]] checkEquals(pfm.ma0110599, pfm.ma0476.1) } # test.geneToMotif.ignore.jasparSuffixes #------------------------------------------------------------------------------------------------------------------------ test.motifToGene <- function() { printf("--- test.motifToGene") #printf("Sys.getlocale: %s", Sys.getlocale()) # good test case of querying both sources, motif <- "MA0099.2" tbl <- motifToGene(MotifDb, motif, c("MotifDb", "TFclass")) # [1] mdb.genes: AP1, JUN::FOS, FOS::JUN, FOS::JUN # [1] tfc.genes: FOS, JUN checkEquals(sort(unique(tbl$geneSymbol)), c("AP1", "FOS", "FOS::JUN", "JUN", "JUN::FOS")) motifs <- c("MA0592.2", "UP00022", "ELF1.SwissRegulon") # MotifDb mode uses the MotifDb metadata "providerId", tbl.mdb <- motifToGene(MotifDb, motifs, source="MotifDb") checkEquals(dim(tbl.mdb), c(3, 5)) expected <- sort(c("MA0592.2", "ELF1.SwissRegulon", "UP00022")) actual <- sort(tbl.mdb$motif) checkEquals(actual, expected) checkEquals(sort(tbl.mdb$geneSymbol), sort(c("ELF1", "Esrra", "Zfp740"))) checkEquals(tbl.mdb$source, rep("MotifDb", 3)) # TFClass mode uses TF family classifcation tbl.tfClass <- motifToGene(MotifDb, motifs, source="TFClass") checkEquals(dim(tbl.tfClass), c(9,5)) checkEquals(tbl.tfClass$motif, rep("MA0592.2", 9)) checkEquals(sort(tbl.tfClass$gene), sort(c("AR", "ESR1", "ESR2", "ESRRA", "ESRRB", "ESRRG", "NR3C1", "NR3C2", "PGR"))) checkEquals(tbl.tfClass$source, rep("TFClass", 9)) # test motifs with regex characters in them, or other characters neither letter nor number motifs <- sort(c("DMAP1_NCOR{1,2}_SMARC.p2", "ELK1,4_GABP{A,B1}.p3", "SNAI1..3.p2", "EWSR1-FLI1.p2", "ETS1,2.p2")) tbl <- motifToGene(MotifDb, motifs, source="MotifDb") checkEquals(nrow(tbl), 0) tbl <- motifToGene(MotifDb, motifs, source="tfclass") checkEquals(ncol(tbl), 5) checkTrue(nrow(tbl) > 80) checkTrue(nrow(tbl) < 100) checkTrue(all(motifs %in% tbl$motif)) motifs <- c("Hsapiens-HOCOMOCOv10-IKZF1_HUMAN.H10MO.C", "MA0099.2", "Hsapiens-SwissRegulon-ARID3A.SwissRegulon", "MA0592.2", "MA0036913_1.02") tbl <- motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass")) checkTrue(all(motifs %in% tbl$motif)) motif <- "Mmusculus;Rnorvegicus;Hsapiens-jaspar2018-FOS::JUN-MA0099.2" tbl <- motifToGene(MotifDb, motif, source="TFClass") checkEquals(tbl$geneSymbol, c("FOS", "JUN")) motifs <- c("Hsapiens-jaspar2016-RUNX1-MA0002.1", "Hsapiens-jaspar2016-TFAP2A-MA0003.1", "Hsapiens-jaspar2016-TFAP2A-MA0003.2", "Hsapiens-jaspar2016-TFAP2A-MA0003.3", "Hsapiens-jaspar2016-AR-MA0007.2", "MA0872.1") tbl <- motifToGene(MotifDb, motifs, source="Motifdb") checkTrue(all(motifs %in% tbl$motif)) checkEquals(sort(unique(tbl$geneSymbol)), c("AR", "RUNX1", "TFAP2A", "TFAP2A(var.3)")) tbl <- motifToGene(MotifDb, motifs, source="TFClass") checkEquals(sort(unique(tbl$motif)), c("Hsapiens-jaspar2016-TFAP2A-MA0003.3", "MA0872.1")) checkEquals(sort(unique(tbl$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E")) tbl <- motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass")) checkTrue(all(motifs %in% tbl$motif)) checkEquals(sort(unique(tbl$geneSymbol)), c("AR", "RUNX1", "TFAP2A", "TFAP2A(var.3)", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E")) #(23 may 2018) found that MotifDb works, but c("MotifDb", "TFClass") does not # test the fix here motifs <- c("Hsapiens-jolma2013-IRF5-2", "Hsapiens-SwissRegulon-IRF5.SwissRegulon") checkEquals(motifToGene(MotifDb, motifs, source=c("MotifDb"))$geneSymbol, c("IRF5", "IRF5")) checkEquals(nrow(motifToGene(MotifDb, motifs, source=c("TFClass"))), 0) checkEquals(motifToGene(MotifDb, motifs, source=c("MotifDb", "TFClass"))$geneSymbol, c("IRF5", "IRF5")) } # test.motifToGene #------------------------------------------------------------------------------------------------------------------------ test.associateTranscriptionFactors <- function() { printf("--- test.associateTranscriptionFactors") mdb <- MotifDb pfms <- query(mdb, andStrings=c("sapiens", "jaspar2016")) # query(mdb, "jaspar2016", "sapiens") # first check motifs with MotifDb-style long names, using MotifDb lookup, in the # metadata of MotifDb: # "Hsapiens-jaspar2016-RUNX1-MA0002.1" "Hsapiens-jaspar2016-TFAP2A-MA0003.1" motif.names <- c(names(pfms[1:5]), "MA0872.1", "hocus.pocus") tbl <- data.frame(motifName=motif.names, score=runif(7), stringsAsFactors=FALSE) tbl.anno.mdb <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=TRUE) checkEquals(nrow(tbl), nrow(tbl.anno.mdb)) checkTrue(is.na(tbl.anno.mdb$geneSymbol[grep("hocus.pocus", tbl.anno.mdb$motifName)])) checkTrue(all(c("AR", "RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "TFAP2A(var.3)") %in% tbl.anno.mdb$geneSymbol)) tbl.anno.tfc <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE) checkTrue(nrow(tbl) < nrow(tbl.anno.tfc)) checkEquals(sort(unique(tbl.anno.tfc$geneSymbol)), c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E")) tbl.anno.both <- associateTranscriptionFactors(mdb, tbl, source=c("MotifDb", "TFClass"), expand.rows=TRUE) checkEquals(length(grep("MotifDb", tbl.anno.both$source)), 6) checkEquals(length(grep("TFClass", tbl.anno.both$source)), 10) # checkEquals(dim(tbl.anno.mdb), c(nrow(tbl), ncol(tbl) + 4)) # checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno.mdb))) # checkEquals(sort(tbl.anno.mdb$geneSymbol), # sort(c("AR", "RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "TFAP2A(var.3)"))) # # # now add in a bogus motif name, one for which there cannot possibly be a TF # # motif.names[3] <- "bogus" # tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE) # tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE) # checkTrue(is.na(tbl.anno$geneSymbol[3])) # checkTrue(is.na(tbl.anno$pubmedID[3])) # # # now check motifs with short, traditional names, in "TFClass" mode, which uses # # the tfFamily # # "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2" # # motif.names <- names(pfms[1:5]) # short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)]))) # tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE) # # tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE) # checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2)) # # # TFClass only annotates MA0003.3, none of the others # checkTrue(all(is.na(tbl.anno$geneSymbol[-4]))) # checkTrue(all(is.na(tbl.anno$pubmedID[-4]))) # checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E") # checkEquals(tbl.anno$pubmedID[4], "23180794") # # # now ask for expandsion of the semicolon separated list # tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE) # checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2)) # checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol)) # # # now add in a bogus motif name, one for which there cannot possibly be a TF # # motif.names <- names(pfms[1:5]) # short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)]))) # short.motif.names[4] <- "bogus" # tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE) # # tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE) # checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2)) # # after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA # checkTrue(all(is.na(tbl.anno$geneSymbol))) # checkTrue(all(is.na(tbl.anno$pubmedID))) # # # now make sure that the absence of the TFClass-specific "shortMotif" field is detected # motif.names <- names(pfms[1:5]) # tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE) # checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE) # now some motif names } # test.associateTranscriptionFactors #------------------------------------------------------------------------------------------------------------------------ test.match <- function() { printf("--- test.match") gr.region <- GRanges(seqnames="chr1", IRanges(start=47229520, end=47229560)) motifs <- query(MotifDb, c("jaspar2018", "ZNF263")) checkEquals(length(motifs), 1) gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5) checkEquals(length(gr.match), 1) # just one motif checkEquals(names(gr.match), names(motifs)) checkEquals(length(gr.match[[1]]), 3) tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE) checkEquals(dim(tbl.match), c(3, 7)) checkTrue(all(tbl.match$motif == names(motifs))) checkEquals(class(tbl.match$chrom), "character") # not a factor motifs <- query(MotifDb, "ZNF263", c("jaspar2018", "swissregulon")) checkEquals(length(motifs), 2) gr.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5) checkEquals(names(gr.match), names(motifs)) checkEquals(as.numeric(lapply(gr.match, length)), c(3, 1)) tbl.match <-matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE) checkEquals(dim(tbl.match), c(4, 7)) checkEquals(length(unique(tbl.match$motif)), 2) checkEquals(unique(tbl.match$motif), names(motifs)) checkEquals(colnames(tbl.match), c("chrom", "start", "end", "width", "strand", "mood.score", "motif_id")) #------------------------------------------------ # now all jaspar2018 human motifs #------------------------------------------------ motifs <- query(MotifDb, c("jaspar2018", "hsapiens")) tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-5, fimoDataFrameStyle=TRUE) checkEquals(dim(tbl.match), c(7, 7)) checkEquals(sort(unique(tbl.match$motif)), c("Hsapiens-jaspar2018-EWSR1-FLI1-MA0149.1", "Hsapiens-jaspar2018-ZNF263-MA0528.1")) #----------------------------------------------------- # now all jaspar2018 human motifs, loosen the pValue #----------------------------------------------------- motifs <- query(MotifDb, c("jaspar2018", "hsapiens")) tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-4, fimoDataFrameStyle=TRUE) checkTrue(nrow(tbl.match) > 15) tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-3, fimoDataFrameStyle=TRUE) checkTrue(nrow(tbl.match) > 50) #------------------------------------------------------------- # now all jaspar2018 and hocomoco human motifs across 10kb #------------------------------------------------------------ motifs <- query(MotifDb, "hsapiens", orStrings=c("jaspar2018", "hocomoco-core")) checkTrue(length(motifs) > 500) gr.region <- GRanges(seqnames="chr1", IRanges(start=47229000, end=47239000)) tbl.match <- matchMotif(MotifDb, motifs, "hg38", gr.region, 1e-7, fimoDataFrameStyle=TRUE) checkTrue(nrow(tbl.match) > 90 && nrow(tbl.match) < 110) checkEquals(order(tbl.match$start), seq_len(nrow(tbl.match))) } # test.match #------------------------------------------------------------------------------------------------------------------------ findMotifsWithMutuallyExclusiveMappings <- function() { xtab <- as.data.frame(table(MotifDb@manuallyCuratedGeneMotifAssociationTable$motif)) xtab <- xtab[order(xtab$Freq, decreasing=TRUE),] for(motif in xtab$Var1){ mdb.genes <- toupper(motifToGene(MotifDb, motif, "motifdb")$geneSymbol) tfc.genes <- toupper(motifToGene(MotifDb, motif, "tfclass")$geneSymbol) if(length(mdb.genes) == 0) next if(length(tfc.genes) == 0) next if(length(intersect(mdb.genes, tfc.genes)) == 0){ printf("------ %s", motif) printf(" mdb.genes: %s", paste(mdb.genes, collapse=", ")) printf(" tfc.genes: %s", paste(tfc.genes, collapse=", ")) } # if no intersection } # for motif # [1] ------ MA0099.2 # [1] mdb.genes: AP1, JUN::FOS, FOS::JUN, FOS::JUN # [1] tfc.genes: FOS, JUN } # findMotifsWithMutuallyExclusiveMappings #------------------------------------------------------------------------------------------------------------------------ test.hocomoco11.with.reliabilityScores <- function() { printf("--- test.hocomoco11.with.reliabilityScores") #------------------------------------------------------------------------- # these queries rely primarily upon the dataSoure column of the metadata # subsequent checks below look at metadata rownames and matrix names #------------------------------------------------------------------------- checkEquals(length(query(MotifDb, "hocomoco")), 1834) checkEquals(length(query(MotifDb, "hocomocov10")), 1066) checkEquals(length(query(MotifDb, "hocomocov11")), 768) checkEquals(length(query(MotifDb, "hocomocov11-core")), 400) checkEquals(length(query(MotifDb, "hocomocov11-secondary")), 368) checkEquals(length(query(MotifDb, "hocomocov11-core-A")), 181) checkEquals(length(query(MotifDb, "hocomocov11-secondary-A")), 46) checkEquals(length(query(MotifDb, "hocomocov11-core-B")), 84) checkEquals(length(query(MotifDb, "hocomocov11-secondary-B")), 19) checkEquals(length(query(MotifDb, "hocomocov11-core-C")), 135) checkEquals(length(query(MotifDb, "hocomocov11-secondary-C")), 13) checkEquals(length(query(MotifDb, "hocomocov11-core-D")), 0) checkEquals(length(query(MotifDb, "hocomocov11-secondary-D")), 290) #------------------------------------------------------------------------- # check matrix names #------------------------------------------------------------------------- checkEquals(length(grep("HOCOMOCOv11-core-A", names(MotifDb))), 181) checkEquals(length(grep("HOCOMOCOv11-secondary-A", names(MotifDb))), 46) checkEquals(length(grep("HOCOMOCOv11-core-A", rownames(mcols(MotifDb)))), 181) checkEquals(length(grep("HOCOMOCOv11-secondary-A", rownames(mcols(MotifDb)))), 46) } # test.hocomoco11.with.reliabilityScores #------------------------------------------------------------------------------------------------------------------------ if(!interactive()) runTests()