Browse code

moved tristan's new work to a more appropriate directory

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@86588 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 19/02/2014 18:03:49
Showing4 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,342 @@
1
+# jaspar/import.R
2
+# 
3
+#------------------------------------------------------------------------------------------------------------------------
4
+library (org.Hs.eg.db)
5
+library (org.Mm.eg.db)
6
+#------------------------------------------------------------------------------------------------------------------------
7
+printf <- function(...) print(noquote(sprintf(...)))
8
+#------------------------------------------------------------------------------------------------------------------------
9
+run = function (dataDir)
10
+{
11
+  dataDir <- file.path(dataDir, "jaspar")
12
+  tbl.rmat = readRawMatrices (dataDir)
13
+  matrices = convertRawMatricesToStandard (tbl.rmat)
14
+  tbl.anno = createAnnotationTable (dataDir)
15
+  tbl.md = createMetadataTable (tbl.anno, matrices)
16
+  matrices = renameMatrices (matrices, tbl.md)
17
+  matrices = normalizeMatrices (matrices)
18
+  serializedFile <- "jaspar.RData"
19
+  save (matrices, tbl.md, file=serializedFile)
20
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
21
+  printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
22
+
23
+} # run
24
+#------------------------------------------------------------------------------------------------------------------------
25
+
26
+readRawMatrices = function (dataDir)
27
+{
28
+  file <- file.path(dataDir, 'MATRIX_DATA.txt')
29
+  #tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric'))
30
+  #colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count')
31
+  tbl.matrices = read.table (file, sep='\n', header=FALSE )
32
+  
33
+  invisible (tbl.matrices)
34
+
35
+} # readRawMatrices
36
+#------------------------------------------------------------------------------------------------------------------------
37
+convertRawMatricesToStandard = function (tbl.rmat)
38
+{
39
+  #matrix.ids = unique (tbl.rmat$id)
40
+  #result =  vector ('list', length (matrix.ids))
41
+
42
+  #i = 1
43
+  #for (matrix.id in matrix.ids) {
44
+  #  tbl.sub = subset (tbl.rmat, id == matrix.id)
45
+  #    # sanity check this rather unusual representation of a position count matrix
46
+  #  base.count = as.data.frame (table (tbl.sub$base))
47
+  #  stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
48
+      # conservative length check.  actually expect sequence lengths of 6 - 20 bases
49
+  #  if  (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
50
+  #    printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
51
+  #    }
52
+  #  stopifnot (all (base.count$Freq == base.count$Freq [1]))
53
+  # nucleotide.counts = tbl.sub$count
54
+  #  row.names = c('A', 'C', 'G', 'T'); 
55
+  #  col.names = 1:(nrow (tbl.sub) / 4)
56
+  #  m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
57
+  #  result [[i]] = m
58
+  #  i = i + 1
59
+  #  } # for matrix.id
60
+
61
+  #names (result) = matrix.ids
62
+
63
+  result<- vector("list",length=593)
64
+  matrices.id <-grep('>',tbl.matrices[[1]],value=TRUE)
65
+  names (result) = matrices.id
66
+  #Get a list of the ID's and then assign them as names 
67
+  m.d <-grep('>',tbl.matrices[[1]],value=TRUE,invert=TRUE)
68
+  data.list=list()
69
+  x = 1
70
+  y = 1
71
+  while (x < length(m.d))
72
+    {   
73
+      matrices.position <- rbind(m.d[x],m.d[x+1],m.d[x+2],m.d[x+3])
74
+      matrices.formatted <- gsub("\t"," ",matrices.position)
75
+      #matrices.formatted2 <- as.numeric(strsplit(matrices.position, "\t"))
76
+      row.names = c('A', 'C', 'G', 'T')
77
+      col.names = 1:(ncol (matrices.formatted))
78
+      #matrices.matrix <- matrix (matrices.formatted, byrow=TRUE, nrow=4,dimnames=list(row.names,))
79
+      result[[y]] <- matrices.formatted
80
+      y=y+1
81
+      x=x+4
82
+    }
83
+  browser()
84
+  return (result)
85
+
86
+} # convertRawMatricesToStandard 
87
+#------------------------------------------------------------------------------------------------------------------------
88
+# read 'mysql' tables provide by jaspar: 
89
+#          MATRIX.txt:  9229	CORE	MA0001	1	AGL3
90
+#  MATRIX_PROTEIN.txt: 9229	P29383
91
+#  MATRIX_SPECIES.txt: 9229	3702
92
+#  MATRIX_ANNOTATION.txt: 
93
+#     9229	class	Other Alpha-Helix
94
+#     9229	comment	-
95
+#     9229	family	MADS
96
+#     9229	medline	7632923
97
+#     9229	tax_group	plants
98
+#     9229	type	SELEX
99
+createAnnotationTable = function (dataDir)
100
+{
101
+  file <- file.path(dataDir, "MATRIX.txt")
102
+  tbl.matrix =  read.table (file, sep='\t', header=F, as.is=TRUE)
103
+  colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
104
+
105
+  file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
106
+  tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE)
107
+  colnames (tbl.protein) =  c ('id', 'proteinID')
108
+
109
+  file <- file.path(dataDir, "MATRIX_SPECIES.txt")
110
+  tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE)
111
+  colnames (tbl.species) = c ('id', 'speciesID')
112
+
113
+  file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
114
+  tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="")
115
+  colnames (tbl.anno) = c ('id', 'attribute', 'value')
116
+
117
+  tbl.family  = subset (tbl.anno, attribute=='family') [, -2];   
118
+  colnames (tbl.family) = c ('id', 'family')
119
+
120
+  tbl.tax     = subset (tbl.anno, attribute=='tax_group') [,-2]; 
121
+  colnames (tbl.tax) = c ('id', 'tax')
122
+
123
+  tbl.class   = subset (tbl.anno, attribute=='class') [,-2];     
124
+  colnames (tbl.class) = c ('id', 'class')
125
+
126
+  tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];    
127
+  colnames (tbl.comment) = c ('id', 'comment')
128
+
129
+  tbl.pubmed  = subset (tbl.anno, attribute=='medline')[,-2];    
130
+  colnames (tbl.pubmed) = c ('id', 'pubmed')
131
+
132
+  tbl.type    = subset (tbl.anno, attribute=='type')[,-2];       
133
+  colnames (tbl.type) = c ('id', 'type')
134
+
135
+
136
+  tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
137
+  tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
138
+  tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
139
+  tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
140
+  tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
141
+  tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
142
+  tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
143
+
144
+  fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
145
+  tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
146
+
147
+  invisible (tbl.md)
148
+
149
+} # createAnnotationTable
150
+#------------------------------------------------------------------------------------------------------------------------
151
+# assemble these columns:
152
+#                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
153
+#                      nativeNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
154
+#                      geneSymbols=character(),              # ABF2, Rhox11
155
+#                      sequenceCounts=integer(),             # often NA
156
+#                      organisms=character(),                # Scerevisiae, Mmusculus
157
+#                      bindingMolecules=character(),         # YMR072W, 194738
158
+#                      bindingMoleculeIdTypes=character(),   # SGD, entrez gene
159
+#                      bindingDomainTypes=character(),       # NA, Homeo
160
+#                      dataSources=character(),              # ScerTF, UniPROBE
161
+#                      experimentTypes=character(),          # NA, protein-binding microarray
162
+#                      pubmedIDs=character(),                # 19111667, 1858359
163
+#                      tfFamilies=character())               # NA, NA
164
+#
165
+# from these
166
+#
167
+createMetadataTable = function (tbl.anno, matrices)
168
+{
169
+  options (stringsAsFactors=FALSE)
170
+  tbl.md = data.frame ()
171
+  matrix.ids = names (matrices) 
172
+  dataSource = 'JASPAR_CORE'
173
+  
174
+  for (matrix.id in matrix.ids) {
175
+    matrix = matrices [[matrix.id]]
176
+    stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1)
177
+    tbl.sub = subset (tbl.anno, id==matrix.id)
178
+    if (nrow (tbl.sub) > 1) {  
179
+        # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
180
+        # some others, a sampling:  P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
181
+      dimer = paste (unique (tbl.sub$proteinID), collapse=';')
182
+      tbl.sub [1, 'proteinID'] = dimer
183
+      }
184
+    anno = as.list (tbl.sub [1,])
185
+    taxon.code = anno$speciesID
186
+    geneId.info = assignGeneId (anno$proteinID)
187
+    new.row = list (providerName=anno$binder,
188
+                    providerId=anno$fullID,
189
+                    dataSource=dataSource,
190
+                    geneSymbol=anno$binder,
191
+                    geneId=geneId.info$geneId,
192
+                    geneIdType=geneId.info$type,
193
+                    proteinId=anno$proteinID,
194
+                    proteinIdType=guessProteinIdentifierType (anno$proteinID),
195
+                    organism=convertTaxonCode(anno$speciesID),
196
+                    sequenceCount=as.integer (mean (colSums (matrix))),
197
+                    bindingSequence=NA_character_,
198
+                    bindingDomain=anno$class,
199
+                    tfFamily=anno$family,
200
+                    experimentType=anno$type,
201
+                    pubmedID=anno$pubmed)
202
+    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
203
+    full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID)
204
+    rownames (tbl.md) [nrow (tbl.md)] = full.name
205
+    } # for i
206
+
207
+      # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA>
208
+    NA.string.indices = grep ('NA', tbl.md$proteinId)
209
+    if (length (NA.string.indices) > 0)
210
+      tbl.md$proteinId [NA.string.indices] = NA
211
+     
212
+   invisible (tbl.md)
213
+
214
+} # createMetadataTable
215
+#------------------------------------------------------------------------------------------------------------------------
216
+renameMatrices = function (matrices, tbl.md)
217
+{
218
+  stopifnot (length (matrices) == nrow (tbl.md))
219
+  names (matrices) = rownames (tbl.md)
220
+  invisible (matrices)
221
+
222
+} # renameMatrices
223
+#------------------------------------------------------------------------------------------------------------------------
224
+# full names:   ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', 
225
+#                'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
226
+#                'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
227
+#                'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis', 
228
+#                'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'),
229
+convertTaxonCode = function (ncbi.code)
230
+{
231
+  #if (is.na (ncbi.code))  
232
+  #  return (NA_character_)
233
+  ncbi.code = as.character (ncbi.code)
234
+  if (ncbi.code %in% c ('-', NA_character_, 'NA'))
235
+    return ('Vertebrata')
236
+
237
+  tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
238
+                            '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
239
+                            '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
240
+                    name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', 
241
+                            'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
242
+                            'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
243
+                            'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', 
244
+                            'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
245
+                    stringsAsFactors=FALSE)
246
+
247
+  ncbi.code = as.character (ncbi.code)
248
+  index = which (tbl$code == ncbi.code)
249
+  if (length (index) == 1)
250
+    return (tbl$name [index])
251
+  else {
252
+    write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
253
+    return (NA_character_)
254
+    }
255
+
256
+} # convertTaxonCode
257
+#------------------------------------------------------------------------------------------------------------------------
258
+# an empirical and not altogether trustworthy solution to identifying identifier types.
259
+guessProteinIdentifierType = function (moleculeName)
260
+{
261
+  if (nchar (moleculeName) == 0)
262
+    return (NA_character_)
263
+  if (is.na (moleculeName))
264
+    return (NA_character_) 
265
+
266
+  first.char = substr (moleculeName, 1, 1)
267
+
268
+  if (first.char == 'Y')
269
+    return ('SGD')
270
+
271
+  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
272
+    return ('UNIPROT')
273
+
274
+  if (length (grep ('^EAW', moleculeName)) == 1)
275
+    return ('NCBI')
276
+
277
+  if (length (grep ('^EAX', moleculeName)) == 1)
278
+    return ('NCBI')
279
+
280
+  if (length (grep ('^NP_', moleculeName)) == 1)
281
+    return ('REFSEQ')
282
+
283
+  if (length (grep ('^BAD', moleculeName)) == 1)
284
+    return ('EMBL')
285
+
286
+   return (NA_character_)
287
+
288
+} # guessProteinIdentifierType
289
+#------------------------------------------------------------------------------------------------------------------------
290
+normalizeMatrices = function (matrices)
291
+{
292
+  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
293
+  invisible (mtx.normalized)
294
+
295
+} # normalizeMatrices
296
+#------------------------------------------------------------------------------------------------------------------------
297
+assignGeneId = function (proteinId)
298
+{
299
+  if (!exists ('id.uniprot.human')) {
300
+
301
+    tbl = toTable (org.Hs.egUNIPROT)
302
+    id.uniprot.human <<- as.character (tbl$gene_id)
303
+    names (id.uniprot.human) <<- tbl$uniprot_id
304
+
305
+    tbl = toTable (org.Hs.egREFSEQ)
306
+    tbl = tbl [grep ('^NP_', tbl$accession),]
307
+    id.refseq.human <<- as.character (tbl$gene_id)
308
+    names (id.refseq.human) <<- tbl$accession
309
+
310
+    tbl = toTable (org.Mm.egUNIPROT)
311
+    id.uniprot.mouse <<- as.character (tbl$gene_id)
312
+    names (id.uniprot.mouse) <<- tbl$uniprot_id
313
+
314
+    tbl = toTable (org.Mm.egREFSEQ)
315
+    tbl = tbl [grep ('^NP_', tbl$accession),]
316
+    id.refseq.mouse <<- as.character (tbl$gene_id)
317
+    names (id.refseq.mouse) <<- tbl$accession
318
+    }
319
+
320
+  proteinId = strsplit (proteinId, '\\.')[[1]][1]   # remove any trailing '.*'
321
+
322
+  if (proteinId %in% names (id.uniprot.human))
323
+    return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
324
+
325
+  if (proteinId %in% names (id.uniprot.mouse))
326
+    return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
327
+
328
+  if (proteinId %in% names (id.refseq.human))
329
+    return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
330
+
331
+  if (proteinId %in% names (id.refseq.mouse))
332
+    return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
333
+
334
+  found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
335
+
336
+  if (found.leading.Y == 1)
337
+    return (list (geneId=proteinId, type='SGD'))
338
+
339
+  return (list (geneId=NA_character_, type=NA_character_))
340
+
341
+} # assignGeneId
342
+#------------------------------------------------------------------------------------------------------------------------
0 343
new file mode 100644
... ...
@@ -0,0 +1,5 @@
1
+id	jaspar.class	ma.name	unknown	gene.symbol	uniprot	ncbi.tax.code	class	comment	family	medline	tax_group	type	pazar_tf_id	description	tfbs_shape_id	consensus	jaspar	mcs	transfac	end_relative_to_tss	start_relative_to_tss	included_models	source	centrality_logp	tfe_id	symbol	alias
2
+9232	CORE	MA0004	1	Arnt	P53762	10090	Zipper-Type	-	Helix-Loop-Helix	7592839	vertebrates	SELEX	TF0000003	aryl hydrocarbon receptor nuclear translocator	11	NA	NA	NA	NA	NA	NA	NA	NA	NA	580	ARNT	HIF-1beta,bHLHe2
3
+9234	CORE	MA0006	1	Arnt::Ahr	P30561, P53762	10090	Zipper-Type	dimer	Helix-Loop-Helix	7592839	vertebrates	SELEX	TF0000004	-	13	NA	NA	NA	NA	NA	NA	NA	NA	NA	580, 577	-	-
4
+9236	CORE	MA0008	1	HAT5	Q02283	3702	Helix-Turn-Helix	homodimer	Homeo	8253077	plants	SELEX	NA	NA	15	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
5
+9237	CORE	MA0009	1	T	P20293	10090	Beta-Hairpin-Ribbon	-	T	8344258	vertebrates	SELEX	TF0000006	T,brachyury homolog (mouse)	16	NA	NA	NA	NA	NA	NA	NA	NA	NA	855	T	-
0 6
new file mode 100644
... ...
@@ -0,0 +1,25 @@
1
+>MA0004.1 Arnt
2
+4	19	0	0	0	0
3
+16	0	20	0	0	0
4
+0	1	0	20	0	20
5
+0	0	0	0	20	0
6
+>MA0006.1 Arnt::Ahr
7
+3	0	0	0	0	0
8
+8	0	23	0	0	0
9
+2	23	0	23	0	24
10
+11	1	1	1	24	0
11
+>MA0008.1 HAT5
12
+3	21	25	0	0	24	1	0
13
+13	1	0	0	5	0	0	0
14
+4	0	0	0	0	1	0	2
15
+5	3	0	25	20	0	24	23
16
+>MA0009.1 T
17
+2	1	40	0	0	0	0	0	1	40	31
18
+28	1	0	0	0	0	0	2	7	0	5
19
+8	0	0	40	40	0	40	0	28	0	0
20
+2	38	0	0	0	40	0	38	4	0	4
21
+>MA0010.1 br_Z1
22
+3	1	5	7	3	6	4	7	1	9	8	5	4	2
23
+1	1	2	0	0	0	1	0	8	0	0	0	0	3
24
+4	1	1	1	0	0	4	1	0	0	1	3	0	2
25
+1	6	1	1	6	3	0	1	0	0	0	1	5	2
0 26
new file mode 100644
... ...
@@ -0,0 +1,194 @@
1
+# jaspar/test.R
2
+# 
3
+#------------------------------------------------------------------------------------------------------------------------
4
+library (RUnit)
5
+source("import.R")
6
+#------------------------------------------------------------------------------------------------------------------------
7
+run.tests = function (dataDir)
8
+{
9
+  dataDir <- file.path(dataDir, "jaspar")
10
+  x.tbl.rmat <<- test.readRawMatrices (dataDir)
11
+  x.matrices <<- test.convertRawMatricesToStandard (x.tbl.rmat)
12
+  x.tbl.anno <<- test.createAnnotationTable (dataDir)
13
+  test.assignGeneId (dataDir)
14
+  x.tbl.md <<- test.createMetadataTable (x.tbl.anno, x.matrices)
15
+  x.matrices.renamed <<- test.renameMatrices (x.matrices, x.tbl.md, x.tbl.anno)
16
+  x.matrices.normalized <<- test.normalizeMatrices (x.matrices.renamed)
17
+
18
+} # run.tests
19
+#------------------------------------------------------------------------------------------------------------------------
20
+test.readRawMatrices = function (dataDir)
21
+{
22
+  print ('--- test.readMatrices')
23
+  tbl.rmat = readRawMatrices (dataDir)
24
+  checkEquals (ncol (tbl.rmat), 4)
25
+  checkEquals (colnames (tbl.rmat), c ('id', 'base', 'pos', 'count'))
26
+  checkEquals (class (tbl.rmat$id),    'character')
27
+  checkEquals (class (tbl.rmat$base),  'character')
28
+  checkEquals (class (tbl.rmat$pos),   'numeric')
29
+  checkEquals (class (tbl.rmat$count), 'numeric')
30
+
31
+  checkTrue (nrow (tbl.rmat) > 18000)   # about 450 motifs, each represeted by 4 rows (ACGT) and about 10 positions
32
+  checkTrue (length (unique (tbl.rmat$id)) > 450)
33
+  invisible (tbl.rmat)
34
+
35
+} # test.readRawMatrices
36
+#------------------------------------------------------------------------------------------------------------------------
37
+test.convertRawMatricesToStandard = function (tbl.rmat)
38
+{
39
+  print ('--- test.convertRawMatricesToStandard')
40
+   # get just the first two raw matrices
41
+
42
+  first.two.ids = head (unique (tbl.rmat$id), n=2)
43
+  rows = nrow (subset (tbl.rmat, id %in% first.two.ids))
44
+  matrices = convertRawMatricesToStandard (tbl.rmat [1:rows,])
45
+  checkEquals (length (matrices), 2)
46
+  checkEquals (names (matrices), first.two.ids)
47
+
48
+    # it will not always be true, but IS true for the first two matrices, currently "9229" and  "9231", that there
49
+    # are an equal number of nucleotides at each position.
50
+  checkTrue (all (colSums (matrices [[1]]) == 97))
51
+  checkTrue (all (colSums (matrices [[2]]) == 185))
52
+
53
+    # now run all the matrices through
54
+  matrices = convertRawMatricesToStandard (tbl.rmat)
55
+  checkEquals (length (matrices), 459)
56
+  checkEquals (names (matrices)[1:2], first.two.ids)
57
+  
58
+  invisible (matrices)
59
+  
60
+} # test.convertRawMatricesToStandard 
61
+#------------------------------------------------------------------------------------------------------------------------
62
+test.createAnnotationTable = function (dataDir)
63
+{
64
+  print ('--- test.createAnnotationTable')
65
+  tbl.anno = createAnnotationTable (dataDir)
66
+  checkEquals (dim (tbl.anno), c (513, 13))
67
+  expected = c ("fullID", "id", "category", "mID", "version", "binder", "speciesID", "proteinID", "family", "tax", "class", "pubmed", "type")
68
+  checkEquals (colnames (tbl.anno), expected)
69
+
70
+  checkEquals (head (tbl.anno$fullID),  c ("MA0001.1", "MA0003.1", "MA0004.1", "MA0005.1", "MA0006.1", "MA0006.1"))
71
+  invisible (tbl.anno)
72
+
73
+} # test.createAnnotationTable
74
+#------------------------------------------------------------------------------------------------------------------------
75
+test.createMetadataTable = function (tbl.anno, matrices)
76
+{
77
+  print ('--- test.createMetadataTable')
78
+   # try it first with just two matrices
79
+  tbl.md = createMetadataTable (tbl.anno, matrices [1:2])
80
+  checkEquals (dim (tbl.md), c (2, 15))
81
+  checkEquals (colnames (tbl.md), c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", 
82
+                                     "proteinId", "proteinIdType", "organism", "sequenceCount", "bindingSequence",
83
+                                     "bindingDomain", "tfFamily", "experimentType", "pubmedID"))
84
+  checkEquals (tbl.md$proteinId, c ('P29383', 'P05549'))
85
+  checkEquals (tbl.md$proteinIdType, c ('UNIPROT', 'UNIPROT'))
86
+
87
+    # now use the whole table
88
+  tbl.md = createMetadataTable (tbl.anno, matrices)
89
+  checkEquals (dim (tbl.md), c (length (matrices), 15))
90
+    # test for proper conversion of speciesID = NA or '-' to Vertebrata
91
+  checkEquals (which (is.na (tbl.md$organism)), integer (0))
92
+  checkEquals (grep ('-', tbl.md$organism), integer (0))
93
+
94
+    # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 had 'NA' for proteinID, not <NA>. fixed?
95
+  checkEquals (grep ('NA', tbl.md$proteinId), integer (0))
96
+  invisible (tbl.md)
97
+
98
+} # test.createMetadataTable
99
+#------------------------------------------------------------------------------------------------------------------------
100
+test.renameMatrices = function (matrices, tbl.md, tbl.anno)
101
+{
102
+    # try it with just the first two matrices
103
+  matrix.pair = matrices [1:2]
104
+  tbl.md = createMetadataTable (tbl.anno, matrix.pair)
105
+  checkEquals (dim (tbl.md), c (2, 15))
106
+  old.matrix.names = names (matrix.pair)
107
+  matrices.renamed = renameMatrices (matrix.pair, tbl.md)
108
+
109
+    # test:  the old name is an id, '9229'.  find, in tbl.anno, the fullID, 'MA0001.1'.  then make sure 'MA000.1' is
110
+    # in the new name of that same matrix
111
+
112
+  for (i in 1:length (matrix.pair)) {
113
+    fullID = subset (x.tbl.anno, id==old.matrix.names [i])$fullID
114
+    checkTrue (length (grep (fullID, names (matrices.renamed) [i])) == 1)
115
+    } # for i
116
+
117
+    # now try it for the whole set, with selective focused tests
118
+
119
+  tbl.md = createMetadataTable (tbl.anno, matrices)
120
+  checkEquals (nrow (tbl.md), length (matrices))
121
+  old.matrix.names = names (matrices)
122
+  matrices.renamed = renameMatrices (matrices, tbl.md)
123
+ 
124
+  checkEquals (nrow (tbl.md), length (matrices.renamed))
125
+  checkEquals (length (grep ('-MA0', names (matrices.renamed))), length (matrices.renamed))
126
+
127
+  invisible (matrices.renamed)
128
+
129
+} # test.renameMatrices
130
+#------------------------------------------------------------------------------------------------------------------------
131
+test.convertTaxonCode = function ()
132
+{
133
+  print ('--- test.convertTaxonCode')
134
+
135
+  checkEquals (convertTaxonCode ('9606'), 'Hsapiens')
136
+  checkEquals (convertTaxonCode (9606), 'Hsapiens')
137
+     # anomalous codes, which an examination of the jaspar website reveals as 'vertebrates'
138
+  checkEquals (convertTaxonCode (NA), 'Vertebrata')
139
+  checkEquals (convertTaxonCode ('NA'), 'Vertebrata')
140
+  checkEquals (convertTaxonCode (NA_character_), 'Vertebrata')
141
+  checkEquals (convertTaxonCode ('-'), 'Vertebrata')
142
+
143
+} # test.convertTaxonCode
144
+#------------------------------------------------------------------------------------------------------------------------
145
+test.guessProteinIdentifierType = function (moleculeName)
146
+{
147
+  print ('--- test.guessProteinIdentifierType')
148
+  checkEquals (guessProteinIdentifierType ('P29383'), 'UNIPROT')
149
+
150
+  all.types = sapply (x.tbl.anno$proteinID, guessProteinIdentifierType)
151
+  checkTrue (length (which (is.na (all.types))) < 12)   # got most of them.
152
+
153
+} # test.guessProteinIdentifierType
154
+#------------------------------------------------------------------------------------------------------------------------
155
+test.normalizeMatrices = function (matrices)
156
+{
157
+  print ('--- test.normalizeMatrices')
158
+
159
+  colsums = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
160
+  #checkTrue (all (colsums > 1))
161
+
162
+  matrices.norm = normalizeMatrices (matrices)
163
+
164
+  colsums = as.integer (sapply (matrices.norm, function (mtx) as.integer (mean (round (colSums (mtx))))))
165
+  checkTrue (all (colsums == 1))
166
+
167
+  invisible (matrices.norm)
168
+
169
+} # test.normalizeMatrices
170
+#------------------------------------------------------------------------------------------------------------------------
171
+test.assignGeneId = function (dataDir, proteinId)
172
+{
173
+  print ('--- test.assignGeneId')
174
+  uniprot.ids = c ('Q9GRA5', 'P31314', 'AAC18941', 'O49397')
175
+  refseq.ids  = c ('NP_995315.1', 'NP_032840', 'NP_599022')
176
+  yeast.ids   = c ('YKL112W', 'YMR072W', 'YLR131C')
177
+
178
+  checkEquals (assignGeneId ('NP_995315.1'), list (geneId='4782', type='ENTREZ'))
179
+  checkEquals (assignGeneId ('NP_599022'),   list (geneId='6095', type='ENTREZ'))
180
+
181
+  checkEquals (assignGeneId ('P31314'),      list (geneId='3195', type='ENTREZ'))
182
+
183
+  checkEquals (assignGeneId ('YKL112W'),     list (geneId='YKL112W', type='SGD'))
184
+
185
+    # see how successful this is over all 513 proteinIds
186
+
187
+  tbl.anno = createAnnotationTable (dataDir)
188
+  mtx.geneId = as.data.frame (t (sapply (tbl.anno$proteinID, assignGeneId)))
189
+  tbl.types = as.data.frame (table (as.character (mtx.geneId$type), useNA='always'), stringsAsFactors=FALSE)
190
+  checkEquals (tbl.types$Var1, c ("ENTREZ", "SGD", NA))
191
+  checkEquals (tbl.types$Freq, c (142, 177, 194))
192
+
193
+} # test.assignGeneId
194
+#------------------------------------------------------------------------------------------------------------------------