Browse code

first version within this package

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75325 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 00:46:19
Showing 2 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,340 @@
1
+# jaspar/import.R
2
+# notes:  3 matrices come w/o speciesID, tax = 'vertebrates'.  not our problem to fix, at least not yet.
3
+#         TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2)
4
+#------------------------------------------------------------------------------------------------------------------------
5
+library (org.Hs.eg.db)
6
+library (org.Mm.eg.db)
7
+#------------------------------------------------------------------------------------------------------------------------
8
+printf <- function(...) print(noquote(sprintf(...)))
9
+#------------------------------------------------------------------------------------------------------------------------
10
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/jaspar"
11
+kDataDir <- "~/s/data/public/TFBS/jaspar/sql_tables"
12
+#------------------------------------------------------------------------------------------------------------------------
13
+run = function (dataDir)
14
+{
15
+  tbl.rmat = readRawMatrices (dataDir)
16
+  matrices = convertRawMatricesToStandard (tbl.rmat)
17
+  tbl.anno = createAnnotationTable (dataDir)
18
+  tbl.md = createMetadataTable (tbl.anno, matrices)
19
+  matrices = renameMatrices (matrices, tbl.md)
20
+  matrices = normalizeMatrices (matrices)
21
+  serializedFile <- "jaspar.RData"
22
+  save (matrices, tbl.md, file=serializedFile)
23
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
24
+  printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
25
+
26
+} # run
27
+#------------------------------------------------------------------------------------------------------------------------
28
+# jaspar's sql rendering of their matrices is unusual.  see below.  translate them to a 4-row, n-column form in 
29
+# convertRawMatricesToStandard, below.
30
+# the id can be mapped to other names and information via simple subset lookup in tbl.anno
31
+#      id base pos count
32
+# 1  9229    A   1     0
33
+# 2  9229    A   2     3
34
+# 3  9229    A   3    79
35
+# ...
36
+# 11 9229    C   1    94
37
+# 12 9229    C   2    75
38
+# 13 9229    C   3     4
39
+# ...
40
+# 20 9229    C  10     3
41
+# 21 9229    G   1     1
42
+# 22 9229    G   2     0
43
+# ...
44
+# 38 9229    T   8    81
45
+# 39 9229    T   9     1
46
+# 40 9229    T  10     6
47
+readRawMatrices = function (dataDir)
48
+{
49
+  file <- file.path(dataDir, 'MATRIX_DATA.txt')
50
+  tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric'))
51
+  colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count')
52
+
53
+  invisible (tbl.matrices)
54
+
55
+} # readRawMatrices
56
+#------------------------------------------------------------------------------------------------------------------------
57
+convertRawMatricesToStandard = function (tbl.rmat)
58
+{
59
+  matrix.ids = unique (tbl.rmat$id)
60
+  result =  vector ('list', length (matrix.ids))
61
+
62
+  i = 1
63
+  for (matrix.id in matrix.ids) {
64
+    tbl.sub = subset (tbl.rmat, id == matrix.id)
65
+      # sanity check this rather unusual representation of a position count matrix
66
+    base.count = as.data.frame (table (tbl.sub$base))
67
+    stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
68
+      # conservative length check.  actually expect sequence lengths of 6 - 20 bases
69
+    if  (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
70
+      printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
71
+      }
72
+    stopifnot (all (base.count$Freq == base.count$Freq [1]))
73
+    nucleotide.counts = tbl.sub$count
74
+    row.names = c('A', 'C', 'G', 'T'); 
75
+    col.names = 1:(nrow (tbl.sub) / 4)
76
+    m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
77
+    result [[i]] = m
78
+    i = i + 1
79
+    } # for matrix.id
80
+
81
+  names (result) = matrix.ids
82
+  return (result)
83
+
84
+} # convertRawMatricesToStandard 
85
+#------------------------------------------------------------------------------------------------------------------------
86
+# read 'mysql' tables provide by jaspar: 
87
+#          MATRIX.txt:  9229	CORE	MA0001	1	AGL3
88
+#  MATRIX_PROTEIN.txt: 9229	P29383
89
+#  MATRIX_SPECIES.txt: 9229	3702
90
+#  MATRIX_ANNOTATION.txt: 
91
+#     9229	class	Other Alpha-Helix
92
+#     9229	comment	-
93
+#     9229	family	MADS
94
+#     9229	medline	7632923
95
+#     9229	tax_group	plants
96
+#     9229	type	SELEX
97
+createAnnotationTable = function (dataDir)
98
+{
99
+  file <- file.path(dataDir, "MATRIX.txt")
100
+  tbl.matrix =  read.table (file, sep='\t', header=F, as.is=TRUE)
101
+  colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
102
+
103
+  file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
104
+  tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE)
105
+  colnames (tbl.protein) =  c ('id', 'proteinID')
106
+
107
+  file <- file.path(dataDir, "MATRIX_SPECIES.txt")
108
+  tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE)
109
+  colnames (tbl.species) = c ('id', 'speciesID')
110
+
111
+  file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
112
+  tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="")
113
+  colnames (tbl.anno) = c ('id', 'attribute', 'value')
114
+
115
+  tbl.family  = subset (tbl.anno, attribute=='family') [, -2];   
116
+  colnames (tbl.family) = c ('id', 'family')
117
+
118
+  tbl.tax     = subset (tbl.anno, attribute=='tax_group') [,-2]; 
119
+  colnames (tbl.tax) = c ('id', 'tax')
120
+
121
+  tbl.class   = subset (tbl.anno, attribute=='class') [,-2];     
122
+  colnames (tbl.class) = c ('id', 'class')
123
+
124
+  tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];    
125
+  colnames (tbl.comment) = c ('id', 'comment')
126
+
127
+  tbl.pubmed  = subset (tbl.anno, attribute=='medline')[,-2];    
128
+  colnames (tbl.pubmed) = c ('id', 'pubmed')
129
+
130
+  tbl.type    = subset (tbl.anno, attribute=='type')[,-2];       
131
+  colnames (tbl.type) = c ('id', 'type')
132
+
133
+
134
+  tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
135
+  tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
136
+  tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
137
+  tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
138
+  tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
139
+  tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
140
+  tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
141
+
142
+  fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
143
+  tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
144
+
145
+  invisible (tbl.md)
146
+
147
+} # createAnnotationTable
148
+#------------------------------------------------------------------------------------------------------------------------
149
+# assemble these columns:
150
+#                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
151
+#                      nativeNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
152
+#                      geneSymbols=character(),              # ABF2, Rhox11
153
+#                      sequenceCounts=integer(),             # often NA
154
+#                      organisms=character(),                # Scerevisiae, Mmusculus
155
+#                      bindingMolecules=character(),         # YMR072W, 194738
156
+#                      bindingMoleculeIdTypes=character(),   # SGD, entrez gene
157
+#                      bindingDomainTypes=character(),       # NA, Homeo
158
+#                      dataSources=character(),              # ScerTF, UniPROBE
159
+#                      experimentTypes=character(),          # NA, protein-binding microarray
160
+#                      pubmedIDs=character(),                # 19111667, 1858359
161
+#                      tfFamilies=character())               # NA, NA
162
+#
163
+# from these
164
+#
165
+createMetadataTable = function (tbl.anno, matrices)
166
+{
167
+  options (stringsAsFactors=FALSE)
168
+  tbl.md = data.frame ()
169
+  matrix.ids = names (matrices) 
170
+  dataSource = 'JASPAR_CORE'
171
+  
172
+  for (matrix.id in matrix.ids) {
173
+    matrix = matrices [[matrix.id]]
174
+    stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1)
175
+    tbl.sub = subset (tbl.anno, id==matrix.id)
176
+    if (nrow (tbl.sub) > 1) {  
177
+        # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
178
+        # some others, a sampling:  P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
179
+      dimer = paste (unique (tbl.sub$proteinID), collapse=';')
180
+      tbl.sub [1, 'proteinID'] = dimer
181
+      }
182
+    anno = as.list (tbl.sub [1,])
183
+    taxon.code = anno$speciesID
184
+    geneId.info = assignGeneId (anno$proteinID)
185
+    new.row = list (providerName=anno$binder,
186
+                    providerId=anno$fullID,
187
+                    dataSource=dataSource,
188
+                    geneSymbol=anno$binder,
189
+                    geneId=geneId.info$geneId,
190
+                    geneIdType=geneId.info$type,
191
+                    proteinId=anno$proteinID,
192
+                    proteinIdType=guessProteinIdentifierType (anno$proteinID),
193
+                    organism=convertTaxonCode(anno$speciesID),
194
+                    sequenceCount=as.integer (mean (colSums (matrix))),
195
+                    bindingSequence=NA_character_,
196
+                    bindingDomain=anno$class,
197
+                    tfFamily=anno$family,
198
+                    experimentType=anno$type,
199
+                    pubmedID=anno$pubmed)
200
+    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
201
+    full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID)
202
+    rownames (tbl.md) [nrow (tbl.md)] = full.name
203
+    } # for i
204
+
205
+      # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA>
206
+    NA.string.indices = grep ('NA', tbl.md$proteinId)
207
+    if (length (NA.string.indices) > 0)
208
+      tbl.md$proteinId [NA.string.indices] = NA
209
+     
210
+   invisible (tbl.md)
211
+
212
+} # createMetadataTable
213
+#------------------------------------------------------------------------------------------------------------------------
214
+renameMatrices = function (matrices, tbl.md)
215
+{
216
+  stopifnot (length (matrices) == nrow (tbl.md))
217
+  names (matrices) = rownames (tbl.md)
218
+  invisible (matrices)
219
+
220
+} # renameMatrices
221
+#------------------------------------------------------------------------------------------------------------------------
222
+# full names:   ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', 
223
+#                'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
224
+#                'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
225
+#                'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis', 
226
+#                'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'),
227
+convertTaxonCode = function (ncbi.code)
228
+{
229
+  #if (is.na (ncbi.code))  
230
+  #  return (NA_character_)
231
+  ncbi.code = as.character (ncbi.code)
232
+  if (ncbi.code %in% c ('-', NA_character_, 'NA'))
233
+    return ('Vertebrata')
234
+
235
+  tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
236
+                            '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
237
+                            '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
238
+                    name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', 
239
+                            'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
240
+                            'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
241
+                            'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', 
242
+                            'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
243
+                    stringsAsFactors=FALSE)
244
+
245
+  ncbi.code = as.character (ncbi.code)
246
+  index = which (tbl$code == ncbi.code)
247
+  if (length (index) == 1)
248
+    return (tbl$name [index])
249
+  else {
250
+    write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
251
+    return (NA_character_)
252
+    }
253
+
254
+} # convertTaxonCode
255
+#------------------------------------------------------------------------------------------------------------------------
256
+# an empirical and not altogether trustworthy solution to identifying identifier types.
257
+guessProteinIdentifierType = function (moleculeName)
258
+{
259
+  if (nchar (moleculeName) == 0)
260
+    return (NA_character_)
261
+  if (is.na (moleculeName))
262
+    return (NA_character_) 
263
+
264
+  first.char = substr (moleculeName, 1, 1)
265
+
266
+  if (first.char == 'Y')
267
+    return ('SGD')
268
+
269
+  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
270
+    return ('UNIPROT')
271
+
272
+  if (length (grep ('^EAW', moleculeName)) == 1)
273
+    return ('NCBI')
274
+
275
+  if (length (grep ('^EAX', moleculeName)) == 1)
276
+    return ('NCBI')
277
+
278
+  if (length (grep ('^NP_', moleculeName)) == 1)
279
+    return ('REFSEQ')
280
+
281
+  if (length (grep ('^BAD', moleculeName)) == 1)
282
+    return ('EMBL')
283
+
284
+   return (NA_character_)
285
+
286
+} # guessProteinIdentifierType
287
+#------------------------------------------------------------------------------------------------------------------------
288
+normalizeMatrices = function (matrices)
289
+{
290
+  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
291
+  invisible (mtx.normalized)
292
+
293
+} # normalizeMatrices
294
+#------------------------------------------------------------------------------------------------------------------------
295
+assignGeneId = function (proteinId)
296
+{
297
+  if (!exists ('id.uniprot.human')) {
298
+
299
+    tbl = toTable (org.Hs.egUNIPROT)
300
+    id.uniprot.human <<- as.character (tbl$gene_id)
301
+    names (id.uniprot.human) <<- tbl$uniprot_id
302
+
303
+    tbl = toTable (org.Hs.egREFSEQ)
304
+    tbl = tbl [grep ('^NP_', tbl$accession),]
305
+    id.refseq.human <<- as.character (tbl$gene_id)
306
+    names (id.refseq.human) <<- tbl$accession
307
+
308
+    tbl = toTable (org.Mm.egUNIPROT)
309
+    id.uniprot.mouse <<- as.character (tbl$gene_id)
310
+    names (id.uniprot.mouse) <<- tbl$uniprot_id
311
+
312
+    tbl = toTable (org.Mm.egREFSEQ)
313
+    tbl = tbl [grep ('^NP_', tbl$accession),]
314
+    id.refseq.mouse <<- as.character (tbl$gene_id)
315
+    names (id.refseq.mouse) <<- tbl$accession
316
+    }
317
+
318
+  proteinId = strsplit (proteinId, '\\.')[[1]][1]   # remove any trailing '.*'
319
+
320
+  if (proteinId %in% names (id.uniprot.human))
321
+    return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
322
+
323
+  if (proteinId %in% names (id.uniprot.mouse))
324
+    return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
325
+
326
+  if (proteinId %in% names (id.refseq.human))
327
+    return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
328
+
329
+  if (proteinId %in% names (id.refseq.mouse))
330
+    return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
331
+
332
+  found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
333
+
334
+  if (found.leading.Y == 1)
335
+    return (list (geneId=proteinId, type='SGD'))
336
+
337
+  return (list (geneId=NA_character_, type=NA_character_))
338
+
339
+} # assignGeneId
340
+#------------------------------------------------------------------------------------------------------------------------
0 341
new file mode 100644
... ...
@@ -0,0 +1,194 @@
1
+# jaspar/test.R
2
+# notes:  3 matrices come w/o speciesID, tax = 'vertebrates'.  not our problem to fix, at least not yet.
3
+#         TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2)
4
+#------------------------------------------------------------------------------------------------------------------------
5
+library (RUnit)
6
+source("import.R")
7
+#------------------------------------------------------------------------------------------------------------------------
8
+run.tests = function (dataDir=kDataDir)
9
+{
10
+  x.tbl.rmat <<- test.readRawMatrices (dataDir)
11
+  x.matrices <<- test.convertRawMatricesToStandard (x.tbl.rmat)
12
+  x.tbl.anno <<- test.createAnnotationTable (dataDir)
13
+  test.assignGeneId (dataDir)
14
+  x.tbl.md <<- test.createMetadataTable (x.tbl.anno, x.matrices)
15
+  x.matrices.renamed <<- test.renameMatrices (x.matrices, x.tbl.md, x.tbl.anno)
16
+  x.matrices.normalized <<- test.normalizeMatrices (x.matrices.renamed)
17
+
18
+} # run.tests
19
+#------------------------------------------------------------------------------------------------------------------------
20
+test.readRawMatrices = function (dataDir)
21
+{
22
+  print ('--- test.readMatrices')
23
+  tbl.rmat = readRawMatrices (dataDir)
24
+  checkEquals (ncol (tbl.rmat), 4)
25
+  checkEquals (colnames (tbl.rmat), c ('id', 'base', 'pos', 'count'))
26
+  checkEquals (class (tbl.rmat$id),    'character')
27
+  checkEquals (class (tbl.rmat$base),  'character')
28
+  checkEquals (class (tbl.rmat$pos),   'numeric')
29
+  checkEquals (class (tbl.rmat$count), 'numeric')
30
+
31
+  checkTrue (nrow (tbl.rmat) > 18000)   # about 450 motifs, each represeted by 4 rows (ACGT) and about 10 positions
32
+  checkTrue (length (unique (tbl.rmat$id)) > 450)
33
+  invisible (tbl.rmat)
34
+
35
+} # test.readRawMatrices
36
+#------------------------------------------------------------------------------------------------------------------------
37
+test.convertRawMatricesToStandard = function (tbl.rmat)
38
+{
39
+  print ('--- test.convertRawMatricesToStandard')
40
+   # get just the first two raw matrices
41
+
42
+  first.two.ids = head (unique (tbl.rmat$id), n=2)
43
+  rows = nrow (subset (tbl.rmat, id %in% first.two.ids))
44
+  matrices = convertRawMatricesToStandard (tbl.rmat [1:rows,])
45
+  checkEquals (length (matrices), 2)
46
+  checkEquals (names (matrices), first.two.ids)
47
+
48
+    # it will not always be true, but IS true for the first two matrices, currently "9229" and  "9231", that there
49
+    # are an equal number of nucleotides at each position.
50
+  checkTrue (all (colSums (matrices [[1]]) == 97))
51
+  checkTrue (all (colSums (matrices [[2]]) == 185))
52
+
53
+    # now run all the matrices through
54
+  matrices = convertRawMatricesToStandard (tbl.rmat)
55
+  checkEquals (length (matrices), 459)
56
+  checkEquals (names (matrices)[1:2], first.two.ids)
57
+  
58
+  invisible (matrices)
59
+  
60
+} # test.convertRawMatricesToStandard 
61
+#------------------------------------------------------------------------------------------------------------------------
62
+test.createAnnotationTable = function (dataDir)
63
+{
64
+  print ('--- test.createAnnotationTable')
65
+  tbl.anno = createAnnotationTable (dataDir)
66
+  checkEquals (dim (tbl.anno), c (513, 13))
67
+  expected = c ("fullID", "id", "category", "mID", "version", "binder", "speciesID", "proteinID", "family", "tax", "class", "pubmed", "type")
68
+  checkEquals (colnames (tbl.anno), expected)
69
+
70
+  checkEquals (head (tbl.anno$fullID),  c ("MA0001.1", "MA0003.1", "MA0004.1", "MA0005.1", "MA0006.1", "MA0006.1"))
71
+  invisible (tbl.anno)
72
+
73
+} # test.createAnnotationTable
74
+#------------------------------------------------------------------------------------------------------------------------
75
+test.createMetadataTable = function (tbl.anno, matrices)
76
+{
77
+  print ('--- test.createMetadataTable')
78
+   # try it first with just two matrices
79
+  tbl.md = createMetadataTable (tbl.anno, matrices [1:2])
80
+  checkEquals (dim (tbl.md), c (2, 15))
81
+  checkEquals (colnames (tbl.md), c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", 
82
+                                     "proteinId", "proteinIdType", "organism", "sequenceCount", "bindingSequence",
83
+                                     "bindingDomain", "tfFamily", "experimentType", "pubmedID"))
84
+  checkEquals (tbl.md$proteinId, c ('P29383', 'P05549'))
85
+  checkEquals (tbl.md$proteinIdType, c ('UNIPROT', 'UNIPROT'))
86
+
87
+    # now use the whole table
88
+  tbl.md = createMetadataTable (tbl.anno, matrices)
89
+  checkEquals (dim (tbl.md), c (length (matrices), 15))
90
+    # test for proper conversion of speciesID = NA or '-' to Vertebrata
91
+  checkEquals (which (is.na (tbl.md$organism)), integer (0))
92
+  checkEquals (grep ('-', tbl.md$organism), integer (0))
93
+
94
+    # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 had 'NA' for proteinID, not <NA>. fixed?
95
+  checkEquals (grep ('NA', tbl.md$proteinId), integer (0))
96
+  invisible (tbl.md)
97
+
98
+} # test.createMetadataTable
99
+#------------------------------------------------------------------------------------------------------------------------
100
+test.renameMatrices = function (matrices, tbl.md, tbl.anno)
101
+{
102
+    # try it with just the first two matrices
103
+  matrix.pair = matrices [1:2]
104
+  tbl.md = createMetadataTable (tbl.anno, matrix.pair)
105
+  checkEquals (dim (tbl.md), c (2, 15))
106
+  old.matrix.names = names (matrix.pair)
107
+  matrices.renamed = renameMatrices (matrix.pair, tbl.md)
108
+
109
+    # test:  the old name is an id, '9229'.  find, in tbl.anno, the fullID, 'MA0001.1'.  then make sure 'MA000.1' is
110
+    # in the new name of that same matrix
111
+
112
+  for (i in 1:length (matrix.pair)) {
113
+    fullID = subset (x.tbl.anno, id==old.matrix.names [i])$fullID
114
+    checkTrue (length (grep (fullID, names (matrices.renamed) [i])) == 1)
115
+    } # for i
116
+
117
+    # now try it for the whole set, with selective focused tests
118
+
119
+  tbl.md = createMetadataTable (tbl.anno, matrices)
120
+  checkEquals (nrow (tbl.md), length (matrices))
121
+  old.matrix.names = names (matrices)
122
+  matrices.renamed = renameMatrices (matrices, tbl.md)
123
+ 
124
+  checkEquals (nrow (tbl.md), length (matrices.renamed))
125
+  checkEquals (length (grep ('-MA0', names (matrices.renamed))), length (matrices.renamed))
126
+
127
+  invisible (matrices.renamed)
128
+
129
+} # test.renameMatrices
130
+#------------------------------------------------------------------------------------------------------------------------
131
+test.convertTaxonCode = function ()
132
+{
133
+  print ('--- test.convertTaxonCode')
134
+
135
+  checkEquals (convertTaxonCode ('9606'), 'Hsapiens')
136
+  checkEquals (convertTaxonCode (9606), 'Hsapiens')
137
+     # anomalous codes, which an examination of the jaspar website reveals as 'vertebrates'
138
+  checkEquals (convertTaxonCode (NA), 'Vertebrata')
139
+  checkEquals (convertTaxonCode ('NA'), 'Vertebrata')
140
+  checkEquals (convertTaxonCode (NA_character_), 'Vertebrata')
141
+  checkEquals (convertTaxonCode ('-'), 'Vertebrata')
142
+
143
+} # test.convertTaxonCode
144
+#------------------------------------------------------------------------------------------------------------------------
145
+test.guessProteinIdentifierType = function (moleculeName)
146
+{
147
+  print ('--- test.guessProteinIdentifierType')
148
+  checkEquals (guessProteinIdentifierType ('P29383'), 'UNIPROT')
149
+
150
+  all.types = sapply (x.tbl.anno$proteinID, guessProteinIdentifierType)
151
+  checkTrue (length (which (is.na (all.types))) < 12)   # got most of them.
152
+
153
+} # test.guessProteinIdentifierType
154
+#------------------------------------------------------------------------------------------------------------------------
155
+test.normalizeMatrices = function (matrices)
156
+{
157
+  print ('--- test.normalizeMatrices')
158
+
159
+  colsums = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
160
+  #checkTrue (all (colsums > 1))
161
+
162
+  matrices.norm = normalizeMatrices (matrices)
163
+
164
+  colsums = as.integer (sapply (matrices.norm, function (mtx) as.integer (mean (round (colSums (mtx))))))
165
+  checkTrue (all (colsums == 1))
166
+
167
+  invisible (matrices.norm)
168
+
169
+} # test.normalizeMatrices
170
+#------------------------------------------------------------------------------------------------------------------------
171
+test.assignGeneId = function (dataDir, proteinId)
172
+{
173
+  print ('--- test.assignGeneId')
174
+  uniprot.ids = c ('Q9GRA5', 'P31314', 'AAC18941', 'O49397')
175
+  refseq.ids  = c ('NP_995315.1', 'NP_032840', 'NP_599022')
176
+  yeast.ids   = c ('YKL112W', 'YMR072W', 'YLR131C')
177
+
178
+  checkEquals (assignGeneId ('NP_995315.1'), list (geneId='4782', type='ENTREZ'))
179
+  checkEquals (assignGeneId ('NP_599022'),   list (geneId='6095', type='ENTREZ'))
180
+
181
+  checkEquals (assignGeneId ('P31314'),      list (geneId='3195', type='ENTREZ'))
182
+
183
+  checkEquals (assignGeneId ('YKL112W'),     list (geneId='YKL112W', type='SGD'))
184
+
185
+    # see how successful this is over all 513 proteinIds
186
+
187
+  tbl.anno = createAnnotationTable (dataDir)
188
+  mtx.geneId = as.data.frame (t (sapply (tbl.anno$proteinID, assignGeneId)))
189
+  tbl.types = as.data.frame (table (as.character (mtx.geneId$type), useNA='always'), stringsAsFactors=FALSE)
190
+  checkEquals (tbl.types$Var1, c ("ENTREZ", "SGD", NA))
191
+  checkEquals (tbl.types$Freq, c (142, 177, 194))
192
+
193
+} # test.assignGeneId
194
+#------------------------------------------------------------------------------------------------------------------------