b70c1a6e |
library (MotifDb)
library (RUnit)
|
000f7c78 |
library (MotIV)
|
b70c1a6e |
library (seqLogo)
|
1f060cb8 |
#----------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#----------------------------------------------------------------------------------------------------
|
49ebd9dc |
runTests = function ()
|
b70c1a6e |
{
test.emptyCtor ()
test.nonEmptyCtor ()
test.MotifDb.normalMode ()
test.MotifDb.emptyMode ()
test.allMatricesAreNormalized ()
test.providerNames ()
test.geneSymbols ()
|
2f30a7ba |
test.geneIdsAndTypes ()
|
b70c1a6e |
test.proteinIds ()
test.sequenceCount ()
test.longNames ()
test.organisms ()
test.bindingDomains ()
test.experimentTypes ()
test.tfFamilies ()
test.bindingSequences ()
test.flyBindingDomains ()
test.pubmedIDs ()
test.allFullNames ()
test.subset ()
test.subsetWithVariables ()
test.query ()
test.transformMatrixToMemeRepresentation ()
test.matrixToMemeText ()
test.export_memeFormatStdOut ()
test.export_memeFormatToFile ()
test.export_memeFormatToFileDuplication ()
test.export_memeFormatToFile_run_tomtom ()
|
000f7c78 |
test.MotIV.toTable ()
test.run_MotIV.motifMatch()
|
77e1c264 |
test.flyFactorGeneSymbols()
|
49ebd9dc |
test.export_jasparFormatStdOut()
test.export_jasparFormatToFile()
|
b70c1a6e |
|
49ebd9dc |
test.geneToMotif()
test.motifToGene()
test.associateTranscriptionFactors()
} # runTests
|
b70c1a6e |
#------------------------------------------------------------------------------------------------------------------------
test.emptyCtor = function ()
{
print ('--- test.emptyCtor')
motif.list = MotifDb:::MotifList ()
|
2def1eec |
checkEquals (length (motif.list), 0)
|
b70c1a6e |
} # test.emptyCtor
#------------------------------------------------------------------------------------------------------------------------
test.nonEmptyCtor = function ()
{
print ('--- test.nonEmptyCtor')
mtx = matrix (runif (20), nrow=4, ncol=5, byrow=T, dimnames=list(c ('A', 'C', 'G', 'T'), as.character (1:5)))
|
2def1eec |
mtx.normalized = apply (mtx, 2, function (colvector) colvector / sum (colvector))
|
b70c1a6e |
matrixList = list (mtx.normalized)
tbl.md = data.frame (providerName='',
providerId='',
dataSource='',
geneSymbol='',
geneId='',
geneIdType='',
proteinId='',
proteinIdType='',
organism='',
sequenceCount='',
bindingSequence='',
bindingDomain='',
tfFamily='',
experimentType='',
|
2def1eec |
pubmedID='',
|
b70c1a6e |
stringsAsFactors=FALSE)
names (matrixList) = 'test'
rownames (tbl.md) = 'test'
motif.list = MotifDb:::MotifList (matrixList, tbl.md)
|
2def1eec |
checkEquals (length (motif.list), 1)
|
b70c1a6e |
} # test.nonEmptyCtor
#------------------------------------------------------------------------------------------------------------------------
# 'normal' in that all included data sources are already loaded
test.MotifDb.normalMode = function ()
{
print ('--- test.MotifDb.normalMode')
mdb = MotifDb # (quiet=TRUE)
# (5 jun 2012)
# JASPAR_CORE: 459
# ScerTF: 196
# UniPROBE: 380
# FlyFactorSurvey: 614
# hPDI: 437
checkTrue (length (mdb) > 2080)
} # test.MotifDb.normalMode
#------------------------------------------------------------------------------------------------------------------------
|
2def1eec |
# this mode is not intended for users, but may see use in the future.
|
b70c1a6e |
test.MotifDb.emptyMode = function ()
{
print ('--- test.MotifDb.emptyMode')
mdb = MotifDb:::.MotifDb (loadAllSources=FALSE, quiet=TRUE)
|
2def1eec |
checkTrue (length (mdb) == 0)
|
b70c1a6e |
} # test.MotifDb.emptyMode
#------------------------------------------------------------------------------------------------------------------------
# 3 jaspar matrices arrive with organism = NA. find and fix. make sure here.
# NA-JASPAR_CORE-TBP-MA0108.2: JASPAR gives <NA> for speciesID
# NA-JASPAR_CORE-HNF4A-MA0114.1: JASPAR gives <NA> for speciesID
# NA-JASPAR_CORE-CEBPA-MA0102.2: JASPAR gives '-' for speciesID, website says 'vertebrates'
|
8ff73793 |
# Many more NA's exist...need to fix these; here's a quick fix for now
|
fa5fbc94 |
|
b70c1a6e |
test.noNAorganisms = function ()
{
print ('--- test.noNAorganisms')
|
8ff73793 |
#checkEquals (which (is.na (mcols(MotifDb)$organism)), integer (0))
|
1f060cb8 |
# There's a fair number of NA organisms, mostly due to including the homer DB
checkEquals(sum(is.na (mcols(MotifDb)$organism)), 366)
|
b70c1a6e |
} # test.noNAorganisms
#------------------------------------------------------------------------------------------------------------------------
test.allMatricesAreNormalized = function ()
{
print ('--- test.allMatricesAreNormalized')
mdb = MotifDb# (quiet=TRUE)
matrices = mdb@listData
|
6666b3f4 |
# a lenient test required by "Cparvum-UniPROBE-Cgd2_3490.UP00395" and "Hsapiens-UniPROBE-Sox4.UP00401"
# for reasons not yet explored. 10e-8 should be be possible
|
a84b875e |
checkTrue(all(sapply(matrices, function (m) all (abs (colSums (m) - 1.0) < 0.02))))
|
2def1eec |
|
b70c1a6e |
} # test.allMatricesAreNormalized
#------------------------------------------------------------------------------------------------------------------------
test.providerNames = function ()
{
print ('--- test.getProviderNames')
mdb = MotifDb # ()
|
fe92bd53 |
pn = mcols(mdb)$providerName
|
b70c1a6e |
checkEquals (length (which (is.na (pn))), 0)
checkEquals (length (which (pn == '')), 0)
} # test.providerNames
#------------------------------------------------------------------------------------------------------------------------
test.geneSymbols = function ()
{
print ('--- test.getGeneSymbols')
mdb = MotifDb # ()
|
fe92bd53 |
syms = mcols(mdb)$geneSymbol
|
6666b3f4 |
checkEquals (length (which (is.na (syms))), 683) # no symols yet for the dgf stamlab motifs
|
b70c1a6e |
checkEquals (length (which (syms == '')), 0)
} # test.geneSymbols
#------------------------------------------------------------------------------------------------------------------------
test.geneIdsAndTypes = function ()
{
print ('--- test.getGeneIdsAndTypes')
|
17abb8b8 |
mdb = MotifDb
|
dc17ee90 |
tbl <- mcols(mdb)
geneIds = tbl$geneId
geneIdTypes = tbl$geneIdType
|
17abb8b8 |
typeCounts = as.list (table (geneIdTypes))
|
dc17ee90 |
|
2f30a7ba |
checkTrue(typeCounts$ENTREZ > 2300)
checkTrue(typeCounts$FLYBASE >= 45)
checkTrue(typeCounts$SGD >= 600)
checkTrue(nrow(subset(tbl, is.na(geneIdType))) > 2000)
|
2def1eec |
|
b70c1a6e |
empty.count = length (which (geneIds == ''))
checkEquals (empty.count, 0)
} # test.geneIdsAndTypes
#------------------------------------------------------------------------------------------------------------------------
# make sure that all proteinIds have explicit values, either proper identifiers or NA
# currently tested by looking for empty string assignments
test.proteinIds = function ()
{
print ('--- test.proteinIds')
mdb = MotifDb # (quiet=TRUE)
|
8ff73793 |
NA.string.count <- sum(is.na(mcols(mdb)$proteinId))
# NA.string.count = length (grep ('NA', mcols(mdb)$proteinId))
checkEquals(NA.string.count, 2514)
# FIX THIS; Currently 2514 don't have protein IDs
#checkEquals (NA.string.count, 0)
|
2def1eec |
|
fe92bd53 |
empty.count = length (which (mcols(mdb)$proteinId==""))
|
b70c1a6e |
if (empty.count > 0)
|
2def1eec |
browser ('test.proteinIds')
|
b70c1a6e |
checkEquals (empty.count, 0)
# FlyFactorSurvey, as digested by me, had a blanket assigment of UNIPROT to all proteinIds
# Herve' pointed out that this applied also to entries with no proteinId.
# make sure this is fixed
|
8ff73793 |
### FIX THIS TOO! Currently have 913 entries with a proteinIdType and no proteinId
|
fe92bd53 |
x = mcols(mdb)
|
8ff73793 |
# checkEquals (nrow (subset (x, !is.na (proteinIdType) & is.na (proteinId))), 0)
|
2def1eec |
|
b70c1a6e |
} # test.proteinIds
#------------------------------------------------------------------------------------------------------------------------
# only for UniPROBE do we not have sequence count. might be possible to get them along with 'insertion sequences'
test.sequenceCount = function ()
{
print ('--- test.sequenceCount')
mdb = MotifDb # ()
|
fe92bd53 |
x = mcols(mdb)
|
b70c1a6e |
if (interactive ()) {
x.up = subset (x, dataSource == 'UniPROBE')
checkTrue (all (is.na (x.up$sequenceCount)))
}
else {
uniprobe.indices = which (x$dataSource == 'UniPROBE')
checkTrue (all (is.na (x$sequenceCount [uniprobe.indices])))
}
} # test.sequenceCount
#------------------------------------------------------------------------------------------------------------------------
# make sure that a legitimate organism-dataSource-identifier is supplied for each matrix and as a rowname
# of the corresponding DataFrame
test.longNames = function ()
{
print ('--- test.longNames')
|
2def1eec |
mdb = MotifDb
|
b70c1a6e |
longNames = strsplit (names (mdb), '-')
organisms = unique (sapply (longNames, '[', 1))
|
2def1eec |
|
a84b875e |
dataSources = unique (lapply (longNames, '[', 2))
|
b70c1a6e |
|
fe92bd53 |
recognized.dataSources = unique (mcols(mdb)$dataSource)
recognized.organisms = unique (mcols(mdb)$organism)
|
b70c1a6e |
# a few (3) matrices from JASPAR core have NA organism. make this into a character
# so that it can be matched up against the 'NA' extracted from longNames just above
na.indices = which (is.na (recognized.organisms))
if (length (na.indices) > 0)
|
a84b875e |
recognized.organisms [na.indices] = 'NA'
|
b70c1a6e |
checkTrue (all (organisms %in% recognized.organisms))
checkTrue (all (dataSources %in% recognized.dataSources))
} # test.longNames
#------------------------------------------------------------------------------------------------------------------------
# make sure that a legitimate organism is specified for each matrix
test.organisms = function ()
{
print ('--- test.organisms')
mdb = MotifDb # (quiet=TRUE)
|
fe92bd53 |
organisms = mcols(mdb)$organism
|
b70c1a6e |
# jaspar_core has 3 NA speciesId: TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2)
# their website shows these as vertebrates, which I map to 'Vertebrata'. An organismID of '-'
|
fa5fbc94 |
# gets the same treatment, matching website also.
|
8ff73793 |
### Note: this failing test is the same as the test.noNAorganisms test!
|
fa5fbc94 |
# As in case of noNA, need to add organisms for these
|
8ff73793 |
#checkEquals (which (is.na (mcols(MotifDb)$organism)), integer (0))
|
b70c1a6e |
|
fe92bd53 |
empty.count = length (which (mcols(mdb)$organism==""))
|
b70c1a6e |
checkEquals (empty.count, 0)
} # test.organisms
#------------------------------------------------------------------------------------------------------------------------
test.bindingDomains = function ()
{
print ('--- test.bindingDomains')
mdb = MotifDb # (quiet=TRUE)
|
fe92bd53 |
checkTrue (length (unique (mcols(mdb)$bindingDomain)) > 1)
|
b70c1a6e |
} # test.bindingDomains
#------------------------------------------------------------------------------------------------------------------------
test.flyBindingDomains = function ()
{
print ('--- test.flyBindingDomains')
|
fe92bd53 |
x = mcols(MotifDb)
|
b70c1a6e |
tmp = as.list (head (sort (table (subset (x, organism=='Dmelanogaster')$bindingDomain), decreasing=TRUE), n=3))
# these counts will likely change with a fresh load of data from FlyFactorSurvey.
checkEquals (tmp$Homeobox, 212)
checkEquals (tmp[['zf-C2H2']], 160)
|
5a4345d9 |
checkEquals (tmp[["Helix-Turn-Helix"]], 182)
|
fa5fbc94 |
checkEquals (length (which (is.na (subset (x, organism=='Dmelanogaster')$bindingDomain))), 301) # lots of cisbp
|
b70c1a6e |
} # test.flyBindingDomains
#------------------------------------------------------------------------------------------------------------------------
test.experimentTypes = function ()
{
print ('--- test.experimentTypes')
mdb = MotifDb # (quiet=TRUE)
|
fe92bd53 |
x = mcols(mdb)
|
b70c1a6e |
checkTrue (length (unique (x$experimentType)) >= 18)
checkEquals (length (which (x$experimentType=='')), 0)
} # test.experimentTypes
#------------------------------------------------------------------------------------------------------------------------
test.tfFamilies = function ()
{
print ('--- test.tfFamilies')
mdb = MotifDb # (quiet=TRUE)
|
fe92bd53 |
checkTrue (length (unique (mcols(mdb)$tfFamily)) > 1)
|
b70c1a6e |
} # test.tfFamilies
#------------------------------------------------------------------------------------------------------------------------
test.bindingSequences = function ()
{
print ('--- test.bindingSequences')
mdb = MotifDb # (quiet=TRUE)
|
fe92bd53 |
checkTrue (length (unique (mcols(mdb)$bindingSequence)) > 1)
|
b70c1a6e |
} # test.tfFamilies
#------------------------------------------------------------------------------------------------------------------------
test.pubmedIDs = function ()
{
print ('--- test.pubmedIDs')
|
fe92bd53 |
x = mcols(MotifDb) # (quiet=TRUE))
|
b70c1a6e |
checkTrue (length (unique (x$pubmedID)) >= 139)
checkEquals (length (which (x$pubmedID == '')), 0)
} # test.pubmedIDs
#------------------------------------------------------------------------------------------------------------------------
# every matrix, and every corresponding metadata table row, has a name formed thus:
#
# dataSource-organism-nativeGeneName
#
# where nativeGeneName is how the matrix is named in the dataSource from which it comes.
# thus:
# FlyFactorSurvey-Dmelanogaster-Abd.A_FlyReg_FBgn0000014
# UniPROBE-Scerevisiae-Asg1-UP00350
# ScerTF-Scerevisiae-ABF2-badis
# JASPAR_CORE-Rrattus-Ar-MA0007.1
|
2def1eec |
#
|
b70c1a6e |
test.allFullNames = function ()
{
print ('--- test.allFullNames')
mdb = MotifDb # (quiet=TRUE)
matrices = mdb@listData
fullNames = names (matrices)
|
fe92bd53 |
all.dataSources = unique (mcols(mdb)$dataSource)
|
b70c1a6e |
checkTrue (length (all.dataSources) >= 4)
|
2def1eec |
|
b70c1a6e |
for (source in all.dataSources) {
this.dataSource <<- source
matrices.by.source = subset (mdb, dataSource==this.dataSource)
matrix.name = names (matrices.by.source)[1]
# FlyFactorSurvey: Dmelanogaster-FlyFactorSurvey-ab_SANGER_10_FBgn0259750
# hPDI: Hsapiens-hPDI-ABCF2
# JASPAR_CORE: JASPAR_CORE-Athaliana-AGL3-MA0001.1
# UniPROBE: Hsapiens-UniPROBE-Sox4.UP00401
checkTrue (grep (this.dataSource, matrix.name) == 1)
#printf ('%20s: %s', source, matrix.name)
}
return (TRUE)
} # test.allFullNames
#------------------------------------------------------------------------------------------------------------------------
test.subset = function ()
{
if (interactive ()) {
print ('--- test.subset')
mdb = MotifDb
checkTrue ('geneSymbol' %in% colnames (elementMetadata (mdb)))
mdb.sub = subset (mdb, geneSymbol=='ABCF2')
checkEquals (length (mdb.sub), 1)
|
2def1eec |
} # if interactive
|
b70c1a6e |
} # test.subset
#------------------------------------------------------------------------------------------------------------------------
test.subsetWithVariables = function ()
{
if (interactive ()) {
print ('--- test.subsetWithVariables')
|
2def1eec |
|
b70c1a6e |
mdb = MotifDb # ()
checkTrue ('geneSymbol' %in% colnames (elementMetadata (mdb)))
target.gene <<- 'ABCF2'
mdb.sub = subset (mdb, geneSymbol==target.gene)
checkEquals (length (mdb.sub), 1)
|
2def1eec |
} # if interactive
|
b70c1a6e |
} # test.subsetWithVariables
#------------------------------------------------------------------------------------------------------------------------
test.query = function ()
{
print ('--- test.query')
mdb = MotifDb
# do queries on dataSource counts match those from a contingency table?
|
fe92bd53 |
sources.list = as.list (table (mcols(mdb)$dataSource))
|
b70c1a6e |
checkEquals (length (query (mdb, 'flyfactorsurvey')), sources.list$FlyFactorSurvey)
checkEquals (length (query (mdb, 'uniprobe')), sources.list$UniPROBE)
checkEquals (length (query (mdb, 'UniPROBE')), sources.list$UniPROBE)
# gene symbols which begin with 'sox' are quite common. can we them?
# there are currently (19 jul 2012) 18, but since this may change, our test is approximate
|
8ff73793 |
# Change on 8/1/2017: increase top limit of sox entries as they've expanded
|
b70c1a6e |
sox.entries = query (mdb, '^sox')
checkTrue (length (sox.entries) > 10)
|
8ff73793 |
checkTrue (length (sox.entries) < 200)
|
b70c1a6e |
# manual inspection reveals that some of these genes have names which are all capitalized. test that.
checkTrue (length (query (mdb, '^sox', ignore.case=TRUE)) > length (query (mdb, '^SOX', ignore.case=FALSE)))
# make sure that queries can be stacked up, and that order of call does not affect the outcome
uniprobe.sox.matrices = query (query (mdb, 'uniprobe'), '^sox')
sox.uniprobe.matrices = query (query (mdb, '^sox'), 'uniprobe')
checkEquals (length (uniprobe.sox.matrices), length (sox.uniprobe.matrices))
# an approximate count check
checkTrue (length (uniprobe.sox.matrices) > 10)
checkTrue (length (uniprobe.sox.matrices) < 30)
|
fe92bd53 |
checkEquals (unique (mcols(uniprobe.sox.matrices)$dataSource), 'UniPROBE')
checkEquals (unique (mcols(sox.uniprobe.matrices)$dataSource), 'UniPROBE')
gene.symbols = sort (unique (mcols(uniprobe.sox.matrices)$geneSymbol))
|
2def1eec |
|
dae43ecd |
# query on a string (in this case, an oddly named motif) which contains
# regular expression characters. no solution to this yet.
# query uses base R's grep, in which
# x <- query(mdb, "ELK1,4_GABP{A,B1}.p3")
|
b70c1a6e |
} # test.query
#------------------------------------------------------------------------------------------------------------------------
test.transformMatrixToMemeRepresentation = function ()
{
print ('--- test.transformMatrixToMemeRepresentation')
mdb = MotifDb # ()
matrix.agl3 = subset (mdb, dataSource=='JASPAR_CORE' & organism=='Athaliana' & geneSymbol=='AGL3')[[1]]
checkEquals (dim (matrix.agl3), c (4, 10))
# a transposed frequency is needed for meme. we store normalized frequency matrices, to just transposition is needed
tfm = MotifDb:::transformMatrixToMemeRepresentation (matrix.agl3)
checkEquals (dim (tfm), c (10, 4))
checkTrue (all (round (rowSums (tfm)) == 1.0))
} # test.transformMatrixToMemeRepresentation
#------------------------------------------------------------------------------------------------------------------------
# translate a motif matrix from MotifList into text suitable for meme and tomtom input.
# choose the top two from UniPROBE. they reliably have high counts, and easily distinguished normalized frequencies
test.matrixToMemeText = function ()
{
print ('--- test.matrixToMemeText')
sox4 = subset (MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4')
rtg3 = subset (MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3')
text.sox4 = MotifDb:::matrixToMemeText (sox4)
# check these with t (sox4 [[1]])
line1.sox4 = " 0.2457457130 0.1950426500 0.2287887620 0.3304228750"
line14.sox4 = " 0.2821643030 0.2286132160 0.1585395830 0.3306828990"
|
2def1eec |
|
b70c1a6e |
checkEquals (length (text.sox4), 29)
checkEquals (text.sox4 [1], "MEME version 4")
checkEquals (text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401")
checkEquals (grep (line1.sox4, text.sox4), 12)
checkEquals (grep (line14.sox4, text.sox4), 25)
text.rtg3 = MotifDb:::matrixToMemeText (rtg3)
line1.rtg3 = " 0.3935122858 0.1453016447 0.3308830322 0.1303030373"
line20.rtg3 = " 0.2490417648 0.3966478493 0.1083586569 0.2459517291"
checkEquals (length (text.rtg3), 35) # 4 trailing empty lines
checkEquals (text.rtg3 [1], "MEME version 4")
checkEquals (text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356")
checkEquals (grep (line1.rtg3, text.rtg3), 12)
checkEquals (grep (line20.rtg3, text.rtg3), 31)
# now call with both matrices, and see if the right results are returned
text.both = MotifDb:::matrixToMemeText (c (sox4, rtg3))
checkEquals (text.both [1], "MEME version 4")
checkEquals (grep (line1.sox4, text.both), 12)
checkEquals (grep (line14.sox4, text.both), 25)
checkEquals (grep (line1.rtg3, text.both), 29)
checkEquals (grep (line20.rtg3, text.both), 48)
} # test.matrixToMemeText
#------------------------------------------------------------------------------------------------------------------------
# translate a motif matrix from MotifList into text suitable for meme and tomtom input.
# choose the top two from UniPROBE. they reliably have high counts, and easily distinguished normalized frequencies
#test.matrixToMemeText_mapVersion = function ()
#{
# print ('--- test.matrixToMemeText_mapVersion')
# sox4 = subset (MotifDb, dataSource=='UniPROBE' & organism=='Hsapiens' & geneSymbol=='Sox4')
# rtg3 = subset (MotifDb, dataSource=='UniPROBE' & organism=='Scerevisiae' & geneSymbol=='Rtg3')
#
# text.sox4 = MotifDb:::mapped.broken.matrixToMemeText (sox4)
# text.rtg3 = MotifDb:::mapped.broken.matrixToMemeText (rtg3)
# text.both = MotifDb:::mapped.broken.matrixToMemeText (c (sox4, rtg3))
#
# # check these with t (sox4 [[1]])
# line1.sox4 = " 0.2457457130 0.1950426500 0.2287887620 0.3304228750"
# line14.sox4 = " 0.2821643030 0.2286132160 0.1585395830 0.3306828990"
#
# checkEquals (length (text.sox4), 29)
# checkEquals (text.sox4 [1], "MEME version 4")
# checkEquals (text.sox4 [10], "MOTIF Hsapiens-UniPROBE-Sox4.UP00401")
# checkEquals (grep (line1.sox4, text.sox4), 12)
# checkEquals (grep (line14.sox4, text.sox4), 25)
#
# line1.rtg3 = " 0.3935122858 0.1453016447 0.3308830322 0.1303030373"
# line20.rtg3 = " 0.2490417648 0.3966478493 0.1083586569 0.2459517291"
# checkEquals (length (text.rtg3), 35) # 4 trailing empty lines
# checkEquals (text.rtg3 [1], "MEME version 4")
# checkEquals (text.rtg3 [10], "MOTIF Scerevisiae-UniPROBE-Rtg3.UP00356")
# checkEquals (grep (line1.rtg3, text.rtg3), 12)
# checkEquals (grep (line20.rtg3, text.rtg3), 31)
#
# # now call with both matrices, and see if the right results are returned
# checkEquals (text.both [1], "MEME version 4")
# checkEquals (grep (line1.sox4, text.both), 12)
# checkEquals (grep (line14.sox4, text.both), 25)
# checkEquals (grep (line1.rtg3, text.both), 29)
# checkEquals (grep (line20.rtg3, text.both), 48)
#
#} # test.matrixToMemeText_mapVersion
#------------------------------------------------------------------------------------------------------------------------
test.export_memeFormatStdOut = function ()
{
print ('--- test.export_memeFormatStdOut')
mdb = MotifDb # ()
mdb.chicken = subset (mdb, organism=='Gallus')
|
5a4345d9 |
checkEquals (length (mdb.chicken), 3)
|
b70c1a6e |
# text is cat-ed to stdout, so not avaialable here to check.
# but just like print, export also returns the text invisibly.
# so that CAN be checked.
|
2def1eec |
|
b70c1a6e |
meme.text = export (mdb.chicken, format='meme')
checkEquals (length (meme.text), 1) # just one long string
checkTrue (is.character (meme.text))
checkTrue (nchar (meme.text) > 800) # 1002 as of (10 aug 2012)
return (TRUE)
} # test.exportMemeFormatToStdOut
#------------------------------------------------------------------------------------------------------------------------
test.export_memeFormatToFile = function ()
{
print ('--- test.export_memeFormatToFile')
mdb = MotifDb # ()
mdb.chicken = subset (mdb, organism=='Gallus')
|
5a4345d9 |
checkEquals (length (mdb.chicken), 3)
|
b70c1a6e |
output.file = tempfile ()
meme.text = export (mdb.chicken, output.file, 'meme')
retrieved = scan (output.file, what=character (0), sep='\n', quiet=TRUE)
invisible (retrieved)
} # test.exportMemeFormatToFile
#------------------------------------------------------------------------------------------------------------------------
test.export_memeFormatToFileDuplication = function ()
{
print ('--- test.export_memeFormatToFileDuplication')
mdb = MotifDb # ()
mdb.mouse = subset (mdb, organism=='Mmusculus')
|
fa5fbc94 |
checkEquals (length (mdb.mouse), 1251)
|
b70c1a6e |
output.file = 'mouse.txt' # tempfile ()
max = 3
meme.text = export (mdb.mouse [1:max], output.file, 'meme')
|
4ff3698f |
retrieved = scan (output.file, what=character (0), sep='\n', quiet=TRUE)
|
b70c1a6e |
invisible (retrieved)
} # test.exportMemeFormatToFileDuplication
#------------------------------------------------------------------------------------------------------------------------
test.export_memeFormatToFile_run_tomtom = function (max=50)
{
if (interactive ()) {
print ('--- test.export_memeFormatToFile_run_tomtom')
mdb = MotifDb
sox4.mouse = subset (mdb, organism=='Mmusculus' & geneSymbol=='Sox4')
all.human = subset (mdb, organism=='Hsapiens')
tomtom.tmp.dir = tempfile ()
print (tomtom.tmp.dir)
stopifnot (dir.create (tomtom.tmp.dir))
sox4.file.path = file.path (tomtom.tmp.dir, 'sox4.mouse.text')
all.human.file.path = file.path (tomtom.tmp.dir,'all.human.text')
export (sox4.mouse, sox4.file.path, 'meme')
export (all.human, all.human.file.path, 'meme')
# find similarity of motif #1 to all the motifs in mdbMany
|
a84b875e |
# cannot rely upon tomtom being present
#cmd = sprintf ('tomtom -no-ssc -oc %s -verbosity 3 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10 %s %s',
# tomtom.tmp.dir, sox4.file.path, all.human.file.path)
#system (cmd)
#cmd = sprintf ('open %s/tomtom.html', tomtom.tmp.dir)
#system (cmd)
|
b70c1a6e |
} # if interactive
} # test.export_memeFormatToFile_run_tomtom
#------------------------------------------------------------------------------------------------------------------------
|
000f7c78 |
# MotIV::motifMatch fails with MotIV_1.25.0. will look into this in September, 2015, pshannon
test.run_MotIV.motifMatch = function ()
|
b70c1a6e |
{
library (MotIV)
|
000f7c78 |
print ('--- test.run_MotIV.motifMatch')
|
9ad58ff7 |
mdb <- MotifDb # ()
|
b70c1a6e |
|
9ad58ff7 |
db.tmp <- mdb@listData
|
b70c1a6e |
# match motif 1 against the entire MotifDb collection
|
9ad58ff7 |
motif.hits <- motifMatch (db.tmp [1], database<-db.tmp)
|
b70c1a6e |
# the long way to extract the matrix name. see MotIV.toTable below for more convenient way
checkEquals (motif.hits@bestMatch[[1]]@aligns[[1]]@TF@name, names (db.tmp)[1])
|
000f7c78 |
# match the last motif against all
last <- length(db.tmp)
|
9ad58ff7 |
# MotIV:motifMatch works differently on linux and macos. by asking for 50 matches,
# the search target (db.tmp[last]) is sure to be in the hit list.
motif.hits <- motifMatch (db.tmp [last], database=db.tmp, top=50)
tbl.hits <- MotIV.toTable (motif.hits)
|
000f7c78 |
# the 5 hits return should include the one we tried to match, but the MotIV search strategy
# may not place it first
checkTrue(names(db.tmp[last]) %in% tbl.hits$name)
|
b70c1a6e |
invisible (tbl.hits)
|
2def1eec |
|
000f7c78 |
} # test.run_MotIV.motifMatch
|
b70c1a6e |
#------------------------------------------------------------------------------------------------------------------------
MotIV.toTable = function (match)
{
stopifnot (length (match@bestMatch) >= 1)
alignments = match@bestMatch[[1]]@aligns
df = data.frame (stringsAsFactors=FALSE)
for (alignment in alignments) {
x = alignment
name = x@TF@name
eVal = x@evalue
sequence = x@sequence
match = x@match
strand = x@strand
df = rbind (df, data.frame (name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
} # for alignment
return (df)
|
2def1eec |
} # MotIV.toTable
|
b70c1a6e |
#------------------------------------------------------------------------------------------------------------------------
test.MotIV.toTable = function ()
{
print ('--- test.MotIVtoTable')
mdb = MotifDb # ()
test.hits = motifMatch (mdb[1]@listData, database=jaspar)
tbl.hits = MotIV.toTable (test.hits)
checkEquals (dim (tbl.hits), c (5, 5))
|
07b8ea8b |
checkEquals (sort(colnames (tbl.hits)), sort(c("name", "eVal", "sequence", "match", "strand")))
|
b70c1a6e |
|
2def1eec |
} # test.MotIV.toTable
|
b70c1a6e |
#------------------------------------------------------------------------------------------------------------------------
pwmMatch.toTable = function (motifMatch) {
if (length (motifMatch@bestMatch) == 0)
return (NA)
df.list = vector("list", length(motifMatch@bestMatch))
for (k in seq(length(motifMatch@bestMatch)))
{
alignments = motifMatch@bestMatch[[k]]@aligns
df = data.frame (stringsAsFactors=FALSE)
for (alignment in alignments) {
x = alignment
name = x@TF@name
eVal = x@evalue
sequence = x@sequence
match = x@match
strand = x@strand
df = rbind (df, data.frame (name=name, eVal=eVal, sequence=sequence, match=match, strand=strand, stringsAsFactors=FALSE))
} # for alignment
df.list[[k]]=df
}
names(df.list) <- names(motifMatch)
return (df.list)
|
77e1c264 |
} # pwmMatch.toTable
#------------------------------------------------------------------------------
# Robert Stojnic reports incorrect gene symbols for matrices obtained from
# flyFactorSurvey.
# the solution was to abandon the original strategy of extracting the
# symbol from the matrix (and file) name.
# now, the flybase importer ("inst/scripts/import/flyFactorSurvey/import.R")
# uses FBgn id (which can be reliably extracted) and uses indpendent
# data sources to learn the gene symbol.
#
# robert's email:
# I'm working on using MotifDb motifs in my PWMEnrich package and I
# have noticed that there is a slight problem with gene symbols for
# Drosophila. In particular, the gene symbols do not always correspond
# to the gene ID and are frequently mis-capitalized. In Drosophila z
# and Z are two different genes and capitalization does matter if
# someone is to use the gene symbols. Also, in some cases the symbols
# are missing hyphens or parenthesis. I have used the gene IDs and the
# Flybase annotation database to set the correct gene symbols for
# Drosophila, please find attached the result of my re-annotation.
#
|
2def1eec |
# looking at his correctedMotifDbDmel.csv
|
77e1c264 |
#
# head(read.table("correctedMotifDbDmel.csv", sep=",", header=TRUE, stringsAsFactors=FALSE))
# providerName oldGeneSymbol newGeneSymbol
# 1 ab_SANGER_10_FBgn0259750 Ab ab
# 2 ab_SOLEXA_5_FBgn0259750 Ab ab
# 3 Abd-A_FlyReg_FBgn0000014 Abd-a abd-A
# 4 Abd-B_FlyReg_FBgn0000015 Abd-b Abd-B
# 5 AbdA_Cell_FBgn0000014 Abda abd-A
# 6 AbdA_SOLEXA_FBgn0000014 Abda abd-A
#
test.flyFactorGeneSymbols <- function()
{
print ("--- test.flyFactorGeneSymbols")
mdb = MotifDb
|
07b8ea8b |
checkEquals(sort(mcols(query(mdb, "FBgn0259750"))$geneSymbol),
sort(c("FBgn0259750", "FBgn0259750")))
|
77e1c264 |
checkEquals(mcols(query(mdb, "FBgn0000014"))$geneSymbol, rep("abd-A", 3))
checkEquals(mcols(query(mdb, "FBgn0000015"))$geneSymbol, rep("Abd-B", 3))
} # test.flyFactorGeneSymbols
#-------------------------------------------------------------------------------
|
8c9d39f9 |
test.export_jasparFormatStdOut = function ()
{
print ('--- test.export_jasparFormatStdOut')
mdb = MotifDb # ()
mdb.chicken = subset (mdb, organism=='Gallus')
checkEquals (length (mdb.chicken), 3)
# text is cat-ed to stdout, so not avaialable here to check.
# but just like print, export also returns the text invisibly.
# so that CAN be checked.
|
2def1eec |
|
8c9d39f9 |
jaspar.text = export (mdb.chicken, format='jaspar')
checkEquals (length (jaspar.text), 1) # just one long string
checkTrue (is.character (jaspar.text))
checkTrue (nchar (jaspar.text) > 800) # 1002 as of (10 aug 2012)
return (TRUE)
} # test.exportjasparFormatToStdOut
#------------------------------------------------------------------------------------------------------------------------
test.export_jasparFormatToFile = function ()
{
print ('--- test.export_jasparFormatToFile')
mdb = MotifDb # ()
mdb.chicken = subset (mdb, organism=='Gallus')
checkEquals (length (mdb.chicken), 3)
output.file = tempfile ()
jaspar.text = export (mdb.chicken, output.file, 'jaspar')
retrieved = scan (output.file, what=character (0), sep='\n', quiet=TRUE)
invisible (retrieved)
|
77e1c264 |
|
8c9d39f9 |
} # test.exportjasparFormatToFile
|
2def1eec |
#------------------------------------------------------------------------------------------------------------------------
|
49ebd9dc |
test.geneToMotif <- function()
|
2def1eec |
{
|
49ebd9dc |
printf("--- test.geneToMotif")
|
2def1eec |
mdb <- MotifDb
|
49ebd9dc |
genes <- c("FOS", "ATF5", "bogus")
# use TFClass family classifcation
|
4acf0d78 |
tbl.tfClass <- geneToMotif(mdb, genes, source="TfClaSS") # intentional mis-capitalization
|
07b8ea8b |
checkEquals(sort(tbl.tfClass$gene), sort(c("ATF5", "FOS", "FOS")))
checkEquals(sort(tbl.tfClass$motif), sort(c("MA0833.1", "MA0099.2", "MA0476.1")))
|
4acf0d78 |
checkEquals(tbl.tfClass$source, rep("TFClass", 3))
|
49ebd9dc |
# MotifDb mode uses the MotifDb metadata, pulled from many sources
|
4acf0d78 |
tbl.mdb <- geneToMotif(mdb, genes, source="mOtifdb") # intentional mis-capitalization
checkEquals(dim(tbl.mdb), c(12, 6))
checkEquals(subset(tbl.mdb, dataSource=="jaspar2016" & geneSymbol== "FOS")$motif, "MA0476.1")
|
49ebd9dc |
# no recognizable (i.e., jaspar standard) motif name returned by MotifDb metadata
# MotifDb for ATF5
# todo: compare the MA0110596_1.02 matrix of cisp_1.02 to japar MA0833.1
} # test.geneToMotif
#------------------------------------------------------------------------------------------------------------------------
test.motifToGene <- function()
{
printf("--- test.motifToGene")
|
e90f5267 |
printf("Sys.getlocale: %s", Sys.getlocale())
|
2def1eec |
|
49ebd9dc |
motifs <- c("MA0592.2", "UP00022", "ELF1.SwissRegulon")
|
2def1eec |
|
0aa3bd26 |
# MotifDb mode uses the MotifDb metadata "providerId",
tbl.mdb <- motifToGene(MotifDb, motifs, source="MotifDb")
|
4acf0d78 |
checkEquals(dim(tbl.mdb), c(3, 6))
|
1ae931f9 |
expected <- sort(c("MA0592.2", "ELF1.SwissRegulon", "UP00022"))
actual <- sort(tbl.mdb$motif)
checkEquals(actual, expected)
|
07b8ea8b |
checkEquals(sort(tbl.mdb$geneSymbol), sort(c("Esrra", "ELF1", "Zfp740")))
checkEquals(sort(tbl.mdb$dataSource), sort(c("jaspar2016", "SwissRegulon", "UniPROBE")))
checkEquals(sort(tbl.mdb$organism), sort(c("Mmusculus", "Hsapiens", "Mmusculus")))
checkEquals(sort(tbl.mdb$source), rep("MotifDb", 3))
|
d030e74b |
# TFClass mode uses TF family classifcation
|
0aa3bd26 |
tbl.tfClass <- motifToGene(MotifDb, motifs, source="TFClass")
|
4acf0d78 |
checkEquals(dim(tbl.tfClass), c(9,4))
checkEquals(tbl.tfClass$motif, rep("MA0592.2", 9))
|
07b8ea8b |
checkEquals(sort(tbl.tfClass$gene), sort(c("AR", "ESR1", "ESR2", "ESRRA", "ESRRB", "ESRRG", "NR3C1", "NR3C2", "PGR")))
|
4acf0d78 |
checkEquals(tbl.tfClass$source, rep("TFClass", 9))
|
49ebd9dc |
|
d030e74b |
# test motifs with regex characters in them, or other characters neither letter nor number
motifs <- sort(c("DMAP1_NCOR{1,2}_SMARC.p2", "ELK1,4_GABP{A,B1}.p3", "SNAI1..3.p2", "EWSR1-FLI1.p2", "ETS1,2.p2"))
tbl <- motifToGene(MotifDb, motifs, source="MotifDb")
checkEquals(nrow(tbl), 0)
tbl <- motifToGene(MotifDb, motifs, source="tfclass")
checkEquals(ncol(tbl), 4)
checkTrue(nrow(tbl) > 80)
checkTrue(nrow(tbl) < 100)
checkTrue(all(motifs %in% tbl$motif))
} # test.motifToGene
|
49ebd9dc |
#------------------------------------------------------------------------------------------------------------------------
test.associateTranscriptionFactors <- function()
{
printf("--- test.associateTranscriptionFactors")
mdb <- MotifDb
pfms <- query(query(mdb, "jaspar2016"), "sapiens")
|
2def1eec |
|
49ebd9dc |
# first check motifs with MotifDb-style long names, using MotifDb lookup, in the
# metadata of MotifDb:
# "Hsapiens-jaspar2016-RUNX1-MA0002.1" "Hsapiens-jaspar2016-TFAP2A-MA0003.1"
|
2def1eec |
|
49ebd9dc |
motif.names <- names(pfms[1:5])
|
094b985a |
tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
|
49ebd9dc |
checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
checkTrue(all(c("geneSymbol", "pubmedID") %in% colnames(tbl.anno)))
|
07b8ea8b |
checkEquals(sort(tbl.anno$geneSymbol), sort(c("RUNX1", "TFAP2A", "TFAP2A", "TFAP2A", "AR")))
|
2def1eec |
|
49ebd9dc |
# now add in a bogus motif name, one for which there cannot possibly be a TF
motif.names[3] <- "bogus"
|
094b985a |
tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="MotifDb", expand.rows=FALSE)
|
49ebd9dc |
checkTrue(is.na(tbl.anno$geneSymbol[3]))
checkTrue(is.na(tbl.anno$pubmedID[3]))
# now check motifs with short, traditional names, in "TFClass" mode, which uses
# the tfFamily
# "MA0002.1" "MA0003.1" "MA0003.2" "MA0003.3" "MA0007.2"
|
094b985a |
motif.names <- names(pfms[1:5])
|
49ebd9dc |
short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
|
094b985a |
tbl <- data.frame(motifName=motif.names, shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
|
49ebd9dc |
|
094b985a |
tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
|
49ebd9dc |
checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
|
094b985a |
# TFClass only annotates MA0003.3, none of the others
checkTrue(all(is.na(tbl.anno$geneSymbol[-4])))
checkTrue(all(is.na(tbl.anno$pubmedID[-4])))
checkEquals(tbl.anno$geneSymbol[4], "TFAP2A;TFAP2B;TFAP2C;TFAP2D;TFAP2E")
checkEquals(tbl.anno$pubmedID[4], "23180794")
# now ask for expandsion of the semicolon separated list
tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=TRUE)
checkEquals(dim(tbl.anno), c(nrow(tbl) + 4, ncol(tbl) + 2))
checkTrue(all(c("TFAP2A", "TFAP2B", "TFAP2C", "TFAP2D", "TFAP2E") %in% tbl.anno$geneSymbol))
|
49ebd9dc |
# now add in a bogus motif name, one for which there cannot possibly be a TF
motif.names <- names(pfms[1:5])
short.motif.names <- unlist(lapply(strsplit(motif.names, "-"), function(tokens) return(tokens[length(tokens)])))
|
094b985a |
short.motif.names[4] <- "bogus"
tbl <- data.frame(shortMotif=short.motif.names, score=runif(5), stringsAsFactors=FALSE)
|
49ebd9dc |
|
094b985a |
tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE)
|
49ebd9dc |
checkEquals(dim(tbl.anno), c(nrow(tbl), ncol(tbl) + 2))
|
094b985a |
# after adding bogus to the only mapped motif name, all geneSymbol and pubmedID values should be NA
checkTrue(all(is.na(tbl.anno$geneSymbol)))
checkTrue(all(is.na(tbl.anno$pubmedID)))
|
49ebd9dc |
|
094b985a |
# now make sure that the absence of the TFClass-specific "shortMotif" field is detected
motif.names <- names(pfms[1:5])
tbl <- data.frame(motifName=motif.names, score=runif(5), stringsAsFactors=FALSE)
checkException(tbl.anno <- associateTranscriptionFactors(mdb, tbl, source="TFClass", expand.rows=FALSE), silent=TRUE)
|
49ebd9dc |
|
094b985a |
} # test.associateTranscriptionFactors
|
2def1eec |
#------------------------------------------------------------------------------------------------------------------------
|