Browse code

first version

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75333 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 04:53:56
Showing 2 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,519 @@
1
+# uniprobe/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library (RMySQL)
4
+library (org.Hs.eg.db)
5
+library (org.Mm.eg.db)
6
+library (org.Sc.sgd.db)
7
+library (org.Ce.eg.db)
8
+library(tools)   # for md5sum
9
+#------------------------------------------------------------------------------------------------------------------------
10
+printf <- function(...) print(noquote(sprintf(...)))
11
+#------------------------------------------------------------------------------------------------------------------------
12
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/uniprobe"
13
+kDataDir <- "~/s/data/public/TFBS/uniprobe"
14
+#------------------------------------------------------------------------------------------------------------------------
15
+run = function (dataDir)
16
+{
17
+  all.files = identifyFiles (file.path(dataDir,'All_PWMs'))
18
+  matrices = readAndParse (all.files)
19
+  tbl.pubRef = createPublicationRefTable ()
20
+  tbl.geneRef = createGeneRefTable ()
21
+  tbl.md = createMetadata (matrices, tbl.pubRef, tbl.geneRef)
22
+  stopifnot (length (matrices) == nrow (tbl.md))
23
+  matrices = renameMatrices (matrices, tbl.md)
24
+
25
+  serializedFile <- "uniprobe.RData"
26
+  save (matrices, tbl.md, file=serializedFile)
27
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
28
+  printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
29
+
30
+} # run
31
+#------------------------------------------------------------------------------------------------------------------------
32
+createMatrixNameUniqifier = function (matrix)
33
+{
34
+  temporary.file <<- tempfile ()
35
+  write (as.character (matrix), file=temporary.file)
36
+  md5sum.string <<- as.character (md5sum (temporary.file))
37
+  stopifnot (nchar (md5sum.string) == 32)
38
+  md5.6chars = substr (md5sum.string, 29, 32)
39
+  #unlink (temporary.file)
40
+
41
+} # createMatrixNameUniqifier
42
+#------------------------------------------------------------------------------------------------------------------------
43
+parsePWMfromText = function (lines.of.text)
44
+{
45
+  z = lines.of.text
46
+  stopifnot (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) == 4)
47
+
48
+    # determine the number of columns
49
+  token.count.first.line = length (strsplit (lines.of.text [1], '\t')[[1]])
50
+  column.count = token.count.first.line - 1  # subtract off the 'A:\t' token
51
+  result = matrix (nrow=4, ncol=column.count, byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:column.count))
52
+
53
+  for (line in lines.of.text) {
54
+    tokens = strsplit (line, '\\s*[:\\|]') [[1]]
55
+    nucleotide = tokens [1]
56
+    numbers.raw = tokens [2]
57
+    zz = numbers.raw
58
+    number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]]
59
+    while (nchar (number.tokens [1]) == 0)
60
+      number.tokens = number.tokens [-1]
61
+    zzz = number.tokens
62
+    numbers = as.numeric (number.tokens)
63
+    zzzz = numbers
64
+    #printf ('adding %s: %s', nucleotide, list.to.string (numbers))
65
+    result [nucleotide,] = numbers
66
+    }
67
+
68
+  return (result)
69
+
70
+} # parsePWMfromText
71
+#------------------------------------------------------------------------------------------------------------------------
72
+extractPWMfromFile = function (filename)
73
+{
74
+  text = scan (filename, sep='\n', what=character (0), quiet=TRUE)
75
+  matrix.start.lines = grep ('A\\s*[:\\|]', text)
76
+  stopifnot (length (matrix.start.lines) == 1)
77
+  #printf ('%50s: %s', filename, list.to.string (matrix.start.lines))
78
+  start.line = matrix.start.lines [1]
79
+  end.line = start.line + 3
80
+  lines = text [start.line:end.line]
81
+  pwm.matrix = parsePWMfromText (lines)
82
+  return (pwm.matrix)
83
+
84
+} # extractPWMfromFile
85
+#------------------------------------------------------------------------------------------------------------------------
86
+createPublicationRefTable = function ()
87
+{
88
+  # tbl.pubmed <<- data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'),
89
+  #                     pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'),
90
+  #                     stringsAsFactors=FALSE)
91
+
92
+  options (stringsAsFactors=FALSE)
93
+  tbl.ref = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'))
94
+  tbl.ref = cbind (tbl.ref, author=c('Scharer', 'Berger', 'Grove', 'Wei', 'Lesch', 'Zhu', 'Pompeani', 'Berger', 'De Silva', 'Badis'))
95
+  tbl.ref = cbind (tbl.ref, pmid=c('19147588','18585359','19632181','20517297','19204119','19158363','18681939','16998473','18541913','19443739'))
96
+  tbl.ref = cbind (tbl.ref, organism=c('Hsapiens', 'Mmusculus', 'Celegans', 'Mmusculus', 'Celegans', 'Scerevisiae', 'Vharveyi',
97
+                                       'Scerevisiae;Hsapiens;Mmusculus', 'Apicomplexa', 'Mmusculus'))
98
+  tbl.ref = cbind (tbl.ref, count=c(1, 168, 34, 25, 1, 89, 1, 5, 3, 104))
99
+  titles = c ('Genome-wide promoter analysis of the SOX4 transcriptional network in prostate cancer cells',
100
+              'Variation in homeodomain DNA binding revealed by high-resolution analysis of sequence preferences',
101
+              'A Multiparameter Network Reveals Extensive Divergence between C. elegans bHLH Transcription Factors',
102
+              'Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo',
103
+              'Transcriptional regulation and stabilization of left right neuronal identity in C. elegans',
104
+              'High-resolution DNA-binding specificity analysis of yeast transcription factors',
105
+              'The Vibrio harveyi master quorum-sensing regulator, LuxR...',
106
+              'Compact, universal DNA microarrays...',
107
+              'Specific DNA-binding by Apicomplexan AP2 transcription factors',
108
+              'Diversity and complexity in DNA recognition by transcription factors')
109
+  tbl.ref = cbind (tbl.ref, title=titles)
110
+
111
+  tbl.ref
112
+
113
+} # createPublicationRefTable
114
+#------------------------------------------------------------------------------------------------------------------------
115
+identifyFiles = function (filePath)
116
+{
117
+  cmd = sprintf ('find %s -name "*pwm*"', filePath)
118
+
119
+  files.raw = system (cmd, intern=TRUE)
120
+
121
+      # all legit files end in ".pwm" or ".txt"
122
+  files = c (grep (".pwm$", files.raw, value=T, ignore.case=T), 
123
+             grep (".txt$", files.raw, value=T, ignore.case=T))
124
+
125
+  reverseComplementMatrices = grep ('RC', files)
126
+  if (length (reverseComplementMatrices) > 0)
127
+    files = files [-reverseComplementMatrices]
128
+
129
+      # don't know why these were excluded.  leave them in for now
130
+  embo10.excluders = grep ('EMBO', files)
131
+  if (length (embo10.excluders) > 0)
132
+    files = files [-embo10.excluders]
133
+
134
+  cell09.excluders = grep ('Cell09', files)   # include these once the sql tables are updated
135
+  if (length (cell09.excluders) > 0)
136
+    files = files [-cell09.excluders]
137
+  secondary.excluders = grep ('secondary', files)
138
+  if (length (secondary.excluders) > 0)
139
+    files = files [-secondary.excluders]
140
+
141
+  invisible (files)
142
+
143
+} # identifyFiles
144
+#------------------------------------------------------------------------------------------------------------------------
145
+readAndParse = function (file.list)
146
+{
147
+  matrices = list ()
148
+
149
+  for (file in file.list) {
150
+    #printf ('read and parse %s', file)
151
+    text = scan (file, sep='\n', what=character (0), quiet=TRUE)
152
+    aaa <<- text
153
+    matrix.start.lines = grep ('A:', text)
154
+    stopifnot (length (matrix.start.lines) == 1)
155
+    start.line = matrix.start.lines [1]
156
+    end.line = start.line + 3
157
+    bbb <<- start.line
158
+    ccc <<- end.line
159
+    lines.of.text = text [start.line:end.line]
160
+    zzz <<- lines.of.text
161
+    pwm.matrix = parsePWMfromText (lines.of.text)
162
+    name.tokens <- strsplit(file,"/")[[1]]
163
+    token.count <- length(name.tokens)
164
+
165
+    matrix.name <- paste(name.tokens[(token.count-1):token.count], collapse="/")
166
+    matrices [[matrix.name]] = pwm.matrix
167
+    }
168
+
169
+  invisible (matrices)
170
+
171
+} # readAndParse
172
+#------------------------------------------------------------------------------------------------------------------------
173
+# eg, ./All_PWMs/GD09/Nsy-7.pwm to simply 'Nsy-7'
174
+#
175
+translateFileNameToGeneName = function (long.name)
176
+{
177
+  if (length (grep ('/', long.name)) > 0) {
178
+    tokens = strsplit (long.name, '/')[[1]]
179
+    count = length (tokens)
180
+    gene.name.raw = tokens [count]  # get the last one
181
+    }
182
+  else {
183
+    gene.name.raw = long.name
184
+    }
185
+
186
+  gene.name = strsplit (gene.name.raw, '\\.')[[1]][1]   # remove any file suffix
187
+
188
+  gene.name
189
+
190
+} # test.translateFileNameToGeneName
191
+#------------------------------------------------------------------------------------------------------------------------
192
+# and update matrix names
193
+createMetadata = function (matrices, tbl.pubRef, tbl.geneRef)
194
+{
195
+  options (stringsAsFactors=FALSE)
196
+  dataSource = 'UniPROBE'
197
+  trim = function (s) {sub (' +$', '', sub ('^ +', '', s))}
198
+  tbl.md = data.frame ()
199
+  
200
+  removers = list (Cart1=110, Cutl1=115, Hoxa7=156, Irx3=185)
201
+ 
202
+  for (m in 1: length (matrices)) {
203
+    matrix.name = names (matrices) [m]
204
+    #printf ('%d: %s', m, matrix.name)
205
+    native.name.raw = gsub ('All_PWMs/', '', matrix.name)
206
+    native.name = extractNativeNames (native.name.raw)
207
+
208
+       # extractNativeNames needs to be rethought.  but for now, just hack in some fixes
209
+    if (native.name == 'Cgd2') native.name='Cgd2_3490'   # inconsistency at uniprobe, accomodated here
210
+    if (native.name == 'PF14') native.name='PF14_0633'
211
+    if (native.name == 'Uncx4') native.name='Uncx4.1'
212
+
213
+    if (!native.name %in% tbl.geneRef$name) {
214
+      browser (text='native.name not in tbl.geneRef')
215
+      }
216
+    
217
+    experiment.folder = strsplit (native.name.raw, '/')[[1]][1]
218
+    up.id.number = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$id
219
+    # todo BUG here!
220
+    full.uniprobe.id = sprintf ('UP%05d', up.id.number)
221
+    if (length (full.uniprobe.id) != 1) {
222
+      browser (text='full.uniprobe.id has wrong length')
223
+      }
224
+
225
+    geneId = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$stdID
226
+    if (is.na (geneId) | geneId == '')
227
+       geneId = NA_character_
228
+
229
+    bindingDomain = trim (subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$domain)
230
+    organism = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$species
231
+
232
+      # a native name (a gene symbol) may have a dash, but for the long name, we want dashes to separate
233
+      # the organism-dataSource-geneIdentifier matrix name.
234
+      # so covert this here, but do not eclipse any dash-including native.names
235
+
236
+    native.name.no.dashes = gsub ('-', '_', native.name)
237
+    native.name.uniq = sprintf ('%s-%s-%s.%s', organism, dataSource, native.name.no.dashes, full.uniprobe.id)
238
+    if (native.name.uniq %in% rownames (tbl.md)) {
239
+      matrix = matrices [[m]]
240
+      uniqifier = createMatrixNameUniqifier (matrix)
241
+      #printf ('before, not unique: %s', native.name.uniq)
242
+      native.name.uniq = paste (native.name.uniq, uniqifier, sep='.')
243
+      #printf ('after, not unique: %s', native.name.uniq)
244
+      }
245
+      
246
+    sequenceCount = NA_integer_
247
+    bindingSequence = NA_character_
248
+    tfFamily = NA_character_
249
+
250
+    experimentType = 'protein binding microarray'
251
+    pubmedID = subset (tbl.pubRef, folder==experiment.folder)$pmid
252
+    #printf ('%12s: %20s', native.name, organism)
253
+
254
+    if (is.na (geneId))
255
+      geneIdType = NA
256
+    else if (organism == 'Scerevisiae')
257
+      geneIdType = 'SGD'
258
+    else if (organism %in% c ('Mmusculus', 'Hsapiens', 'Celegans'))
259
+      geneIdType = 'ENTREZ'
260
+    else
261
+      geneIdType = 'todo'
262
+
263
+    proteinId = NA_character_
264
+    proteinIdType = NA_character_
265
+
266
+    proteinId.tmp = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$uniprot
267
+    if (!is.na (proteinId.tmp) & nchar (proteinId.tmp) > 1) {
268
+      #printf ('getting uniprot id for %s: %s', native.name, proteinId.tmp)
269
+      proteinId = proteinId.tmp
270
+      proteinIdType = 'UNIPROT'
271
+      }  # found good id in the uniprot column
272
+
273
+    if (is.na (proteinId.tmp) | nchar (proteinId.tmp) == 0) {
274
+      proteinId.tmp = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$refseq_id
275
+      if (!is.na (proteinId.tmp) & nchar (proteinId.tmp) > 1) {
276
+        #printf ('getting refseq id for %s: %s', native.name, proteinId.tmp)
277
+        proteinId = proteinId.tmp
278
+        proteinIdType = 'REFSEQ'
279
+        }
280
+      } # need to look in the refseq column
281
+
282
+
283
+    bindingSequence =  subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$bindingSequence
284
+    #printf ('%12s: %40s', native.name, bindingSequence)
285
+
286
+    new.row = list (providerName=native.name.raw,
287
+                    providerId=full.uniprobe.id,
288
+                    dataSource=dataSource,
289
+                    geneSymbol=native.name,
290
+                    geneId=geneId,
291
+                    geneIdType=geneIdType,
292
+                    proteinId=proteinId,
293
+                    proteinIdType=proteinIdType,
294
+                    organism=organism,
295
+                    sequenceCount=NA,
296
+                    bindingSequence=bindingSequence,
297
+                    bindingDomain=bindingDomain,
298
+                    tfFamily=tfFamily,
299
+                    experimentType=experimentType,
300
+                    pubmedID=pubmedID)
301
+    #if (native.name == 'Cart1')  browser (text='Cart1')
302
+    if (native.name.uniq %in% rownames (tbl.md)) browser (text='dup row name')
303
+    #print (new.row)
304
+    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
305
+    if (length (native.name.uniq) != 1) browser (text='native.name.unique length != 1')
306
+    rownames (tbl.md) [m] = native.name.uniq
307
+    }
308
+
309
+  invisible (tbl.md)
310
+
311
+} # createMetadata
312
+#------------------------------------------------------------------------------------------------------------------------
313
+extractNativeNames = function (native.names.raw)
314
+{
315
+  name.count = length (native.names.raw)
316
+  result = vector ('character', name.count)
317
+
318
+  for (i in 1:name.count) {
319
+    native.name.raw = native.names.raw [i]
320
+    tokens = strsplit (native.name.raw, '/') [[1]]
321
+    count = length (tokens)
322
+    cooked.1 = native.name.raw
323
+    if (count > 1)
324
+      cooked.1 = tokens [length (tokens)]
325
+    tokens = strsplit (cooked.1, '[_\\.]')[[1]]
326
+    cooked.2 = tokens [1]
327
+    result [i] = cooked.2
328
+    }
329
+
330
+  invisible (result)
331
+
332
+} # extractNativeNames
333
+#------------------------------------------------------------------------------------------------------------------------
334
+# 'standard' is usually entrez gene ID.  for yeast it is orf. for fly ...
335
+uniprotToStandardID = function (organism, uniprot.ids)
336
+{
337
+#  uniprot.ids = unique (uniprot.ids)
338
+
339
+  if (!exists ('lst.yeast.uniprot')) {
340
+    tbl.tmp = toTable (org.Sc.sgdUNIPROT)
341
+    yeast.uniprot <<- tbl.tmp$systematic_name
342
+    names (yeast.uniprot) <<- tbl.tmp$uniprot_id
343
+    }
344
+
345
+  if (!exists ('mouse.uniprot')) {
346
+    tbl.tmp = toTable (org.Mm.egUNIPROT)
347
+    mouse.uniprot <<- tbl.tmp$gene_id
348
+    names (mouse.uniprot) <<- tbl.tmp$uniprot_id
349
+    }
350
+
351
+  if (!exists ('human.uniprot')) {
352
+    tbl.tmp = toTable (org.Hs.egUNIPROT)
353
+    human.uniprot <<- tbl.tmp$gene_id
354
+    names (human.uniprot) <<- tbl.tmp$uniprot_id
355
+    }
356
+
357
+  if (!exists ('worm.uniprot')) {
358
+    tbl.tmp = toTable (org.Ce.egUNIPROT)
359
+    worm.uniprot <<- tbl.tmp$gene_id
360
+    names (worm.uniprot) <<- tbl.tmp$uniprot_id
361
+    }
362
+
363
+  organism = unique (organism)
364
+  stopifnot (length (unique (organism)) == 1)
365
+  stopifnot (organism %in% (c ('human', 'mouse', 'yeast', 'worm')))
366
+ 
367
+  if (organism == 'human') {
368
+    ids = human.uniprot [uniprot.ids]
369
+    }
370
+  else if (organism == 'mouse') {
371
+    ids = mouse.uniprot [uniprot.ids]
372
+    }
373
+  else if (organism == 'yeast') {
374
+    ids = yeast.uniprot [uniprot.ids]
375
+    }
376
+  else if (organism == 'worm') {
377
+    ids = worm.uniprot [uniprot.ids]
378
+    }
379
+
380
+
381
+    # embarrassing but true:  could not get this to work by creating a list directly
382
+    # round-about solution:  make a 1-column data.frame, then construct a named list from that
383
+  df = data.frame (uniprot=uniprot.ids, std=as.character (ids), stringsAsFactors=FALSE)
384
+  #rownames (df) = uniprot.ids
385
+  #result = df$stdID
386
+  #names (result) = uniprot.ids
387
+  return (df)
388
+
389
+} # uniprotToStandardID
390
+#------------------------------------------------------------------------------------------------------------------------
391
+# see ~/v/snotes/log "* load uniprobe sql dump file (11 may 2012)" for info on the uniprobe mysql database 
392
+# used here.  
393
+createGeneRefTable = function ()
394
+{
395
+  if (!exists ('db'))
396
+    db <<- dbConnect (MySQL (), dbname='uniprobe')
397
+
398
+  tbl.pubmed = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'),
399
+                 pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'),
400
+                 stringsAsFactors=FALSE)
401
+       #CR09	1	Scharer 	19147588	human	Genome-wide promoter analysis of the SOX4 transcriptional 
402
+       #                                                        network in prostate cancer cells
403
+       #Cell08	168	Berger  	18585359	mouse	Variation in homeodomain DNA binding revealed by high-resolution 
404
+       #                                                        analysis of sequence preferences
405
+       #Cell09	34	Grove   	19632181	worm	A Multiparameter Network Reveals Extensive Divergence between 
406
+       #                                                        C. elegans bHLH Transcription Factors
407
+       #EMBO10	25	Wei     	20517297	mouse	Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo
408
+       #GD09	1	Lesch   	19204119	worm	Transcriptional regulation and stabilization of left right neuronal 
409
+       #                                                         identity in C. elegans
410
+       #GR09	89	Zhu     	19158363	yeast	High-resolution DNA-binding specificity analysis of yeast transcription factors
411
+       #MMB08	1	Pompeani	18681939	Vibrio	The Vibrio harveyi master quorum-sensing regulator, LuxR...
412
+       #NBT06	5	Berger  	16998473	yeast;human;mouse	Compact, universal DNA microarrays...
413
+       #PNAS08	3	De Silva	18541913	apicomplexa	Specific DNA-binding by Apicomplexan AP2 transcription factors
414
+       #SCI09	104	Badis   	19443739	mouse	Diversity and complexity in DNA recognition by transcription factors
415
+
416
+
417
+  tbl.gene = dbGetQuery (db, 'select * from gene_ids_public')
418
+  colnames (tbl.gene) = c ('id', 'name', 'species', 'pub', 'type')
419
+  tbl.genomic = dbGetQuery (db, 'select * from genomic_info') [, -c(2,3,8:11)]  # remove the longish 'description' field
420
+  tbl.pub = dbGetQuery (db, 'select * from publication_ids') [,-3]  # remove 'full_ref' for legibility
421
+  tbl = merge (tbl.gene, tbl.pub [, c (1,4)], by.x='pub', by.y='publication_id', all.x=TRUE)
422
+  tbl = merge (tbl, tbl.pubmed, by.x='folder_name', by.y='folder', all.x=TRUE)
423
+  tbl = merge (tbl, tbl.genomic, all.x=TRUE, by.x='name', by.y='gene_name')
424
+  redundant.column = grep ('species.y', colnames (tbl))
425
+  stopifnot (length (redundant.column) == 1)
426
+  tbl = tbl [, -redundant.column]
427
+  column.to.rename = grep ('species.x', colnames (tbl))
428
+  stopifnot (length (column.to.rename) == 1)
429
+  colnames (tbl) [column.to.rename] = 'species'
430
+  
431
+
432
+   # now add 'standard IDs' -- orf names for yeast, entrez geneIDs for everything else
433
+
434
+  stdID = getAllStandardIDs (tbl)
435
+  tbl = cbind (tbl, stdID)
436
+  duplicates = which (duplicated (tbl [, c (1,2)]))
437
+  if (length (duplicates) > 0)
438
+    tbl = tbl [-duplicates,]
439
+  
440
+    # fix the organism (species) name, from (e.g.) "Homo sapiens" to "Hsapiens"
441
+  tbl$species = standardizeSpeciesNames (tbl$species)
442
+  invisible (tbl)
443
+
444
+    # now add bindingSequence, from DBDs:  gene_name + seq, apparently the binding sequence when known, of 171 of 372 genes
445
+  tbl.dbds = dbGetQuery (db, 'select * from DBDs')  
446
+  dbds.list = tbl.dbds$seq
447
+  names (dbds.list) = tbl.dbds$gene_name
448
+  bindingSequence = rep (NA_character_, nrow (tbl))
449
+  names (bindingSequence) = tbl$name
450
+  shared.names = unique (intersect (tbl.dbds$gene_name, tbl$name))
451
+  bindingSequence [shared.names] = dbds.list [shared.names]
452
+  tbl = cbind (tbl, bindingSequence)
453
+  invisible (tbl)
454
+  
455
+} # createGeneRefTable
456
+#------------------------------------------------------------------------------------------------------------------------
457
+# make successive species-specific calls to 'uniprotToStandardID', assembling a new column to be added to tbl.geneRef
458
+getAllStandardIDs = function (tbl.geneRef)
459
+{
460
+  mouse.rows  = grep ('Mus musculus', tbl.geneRef$species)
461
+  yeast.rows = grep ('Saccharomyces cerevisiae', tbl.geneRef$species)
462
+  human.rows = grep ('Homo sapiens', tbl.geneRef$species)
463
+  worm.rows = grep ('Caenorhabditis elegans', tbl.geneRef$species)
464
+
465
+  stdID = rep (NA_character_, nrow (tbl.geneRef))
466
+
467
+  mouse.uids = tbl.geneRef$uniprot [mouse.rows]
468
+  yeast.uids = tbl.geneRef$uniprot [yeast.rows]
469
+  human.uids = tbl.geneRef$uniprot [human.rows]
470
+  worm.uids = tbl.geneRef$uniprot [worm.rows]
471
+
472
+    # add mouse geneIDs
473
+  tbl.mouse =  uniprotToStandardID ('mouse', mouse.uids)
474
+  stopifnot (length (mouse.rows) == nrow (tbl.mouse))
475
+  stdID [mouse.rows] = tbl.mouse$std
476
+  
477
+    # add yeast orfs
478
+  tbl.yeast =  uniprotToStandardID ('yeast', yeast.uids)
479
+  stopifnot (length (yeast.rows) == nrow (tbl.yeast))
480
+  stdID [yeast.rows] = tbl.yeast$std
481
+  
482
+    # add human geneIDs
483
+  tbl.human =  uniprotToStandardID ('human', human.uids)
484
+  stopifnot (length (human.rows) == nrow (tbl.human))
485
+  stdID [human.rows] = tbl.human$std
486
+
487
+    # add worm geneIDs
488
+  tbl.worm =  uniprotToStandardID ('worm', worm.uids)
489
+  stopifnot (length (worm.rows) == nrow (tbl.worm))
490
+  stdID [worm.rows] = tbl.worm$std
491
+
492
+  invisible (stdID)
493
+
494
+} # getAllStandardIDs
495
+#------------------------------------------------------------------------------------------------------------------------
496
+# change, e.g., "Homo sapiens" to "Hsapiens"
497
+standardizeSpeciesNames = function (names)
498
+{
499
+   fix = function (name) {
500
+     tokens = strsplit (name, ' ')[[1]]
501
+     stopifnot (length (tokens) == 2)
502
+     genus = tokens [1]
503
+     species = tokens [2]
504
+     return (paste (substr (genus, 1, 1), species, sep=''))
505
+     } 
506
+
507
+   fixed.names = as.character (sapply (names, fix))
508
+   invisible (fixed.names)
509
+
510
+} # standardizeSpeciesNames
511
+#------------------------------------------------------------------------------------------------------------------------
512
+renameMatrices = function (matrices, tbl.md)
513
+{
514
+  stopifnot (length (matrices) == nrow (tbl.md))
515
+  names (matrices) = rownames (tbl.md)
516
+  invisible (matrices)
517
+
518
+} # renameMatrices
519
+#------------------------------------------------------------------------------------------------------------------------
0 520
new file mode 100644
... ...
@@ -0,0 +1,446 @@
1
+# uniprobe/test.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library(RUnit)
4
+source("import.R")
5
+#------------------------------------------------------------------------------------------------------------------------
6
+run.tests = function (dataDir=kDataDir)
7
+{
8
+  txxa <<- test.createMatrixNameUniqifier ()   # in ../common.R
9
+  txx1 <<- test.extractNativeNames (dataDir)
10
+  txx2 <<- test.createPublicationRefTable ()
11
+  txxb <<- test.standardizeSpeciesNames ()
12
+  txx4 <<- test.createGeneRefTable ()      # read mysql database to get, uniprobe-style, pwm-specific metadata
13
+  txx3 <<- test.uniprotToStandardID ()
14
+  txx5 <<- test.getAllStandardIDs ()       # build a 'stdID' column to add to tbl.geneRef
15
+  txx5a <<- test.parsePWMfromText (dataDir)
16
+
17
+
18
+  
19
+  txx6 <<- test.extractPWMfromFile (dataDir)
20
+  txx7 <<- test.translateFileNameToGeneName ()
21
+  txx8 <<- test.readAndParse (dataDir)
22
+  txx9 <<- test.createMetadata (dataDir)
23
+  txxb <<- test.createMetadata.sox4 (dataDir)
24
+  txxb <<- test.createMetadata.cbf1 (dataDir)
25
+  txxba <<- test.createMetadata.fixTrailingSpaceInBindingDomain (dataDir)
26
+  txxc <<- test.renameMatrices ()
27
+  txxd <<- test.emptyStringProteinId (dataDir)
28
+  
29
+} # run.tests
30
+#------------------------------------------------------------------------------------------------------------------------
31
+# what tables?  what columns in each?
32
+exploreDatabase = function ()
33
+{
34
+  if (!exists ('db'))
35
+    db <<- dbConnect (MySQL (), dbname='uniprobe')
36
+  tables = dbListTables (db)
37
+  excluders = grep ('mers_', tables)
38
+  if (length (excluders) > 0)
39
+    tables = tables [-excluders]
40
+  for (table in tables) {
41
+    fields = dbListFields (db, table)
42
+    printf ('--- %s: %d', table, length (fields))
43
+    printf ('  %s', list.to.string (fields)) 
44
+    } # for table
45
+
46
+
47
+} # exploreDatabase
48
+#------------------------------------------------------------------------------------------------------------------------
49
+test.parsePWMfromText = function (dataDir)
50
+{
51
+  print ('--- test.parsePWMfromText')
52
+
53
+  sample.file = file.path(dataDir, 'All_PWMs/Cell08/Phox2a_3947.1_pwm.txt')
54
+  lines.of.text = scan (sample.file, what=character(0), sep='\n', skip=2, quiet=TRUE)
55
+  yy <<- lines.of.text
56
+  checkEquals (grep ('^A:', lines.of.text), 1)
57
+  pwm.phox2a = parsePWMfromText (lines.of.text)
58
+  checkTrue (is.matrix (pwm.phox2a))
59
+  checkEquals (dim (pwm.phox2a), c (4, 16))
60
+  xx <<- pwm.phox2a
61
+  for (c in 1:ncol (pwm.phox2a)) {
62
+    checkEqualsNumeric (sum (pwm.phox2a [, c]), 1.0, tol=0.01)
63
+    } # for c
64
+  
65
+  invisible (pwm.phox2a)
66
+
67
+} # test.parsePWMfromText
68
+#------------------------------------------------------------------------------------------------------------------------
69
+test.extractPWMfromFile = function (dataDir)
70
+{
71
+  print ('--- test.extractPWMfromFile')
72
+
73
+    # a well-behaved file, with a single unaccompanied tab separating columns
74
+  file = file.path(dataDir, 'All_PWMs/SCI09/Arid3a_pwm_primary.txt')
75
+  checkTrue(file.exists(file))
76
+  
77
+  arid2a.matrix <<- extractPWMfromFile (file)
78
+  checkEquals (rownames (arid2a.matrix), c ('A','C', 'G', 'T'))
79
+  checkTrue (all (as.integer (round (as.double (colSums (arid2a.matrix)))) == 1))
80
+
81
+    # a file with a confusion of white spaces
82
+    # "A:\t  0.247560633305795  0.264091385393575\t\t 0.204731944407109 ....
83
+    # depends on perl=T whitespace+ regex in parsePWMfromText
84
+
85
+  file = file.path(dataDir, 'All_PWMs/GD09/Nsy-7.pwm')
86
+  nsy7.matrix <<- extractPWMfromFile (file)
87
+  checkEquals (rownames (nsy7.matrix), c ('A','C', 'G', 'T'))
88
+  checkTrue (all (as.integer (round (as.double (colSums (nsy7.matrix)))) == 1))
89
+
90
+} # test.extractPWMfromFile
91
+#------------------------------------------------------------------------------------------------------------------------
92
+test.createPublicationRefTable = function ()
93
+{
94
+  print ('--- test.createPublicationRefTable')
95
+  tbl.ref = createPublicationRefTable ()
96
+  checkEquals (dim (tbl.ref), c (10, 6))
97
+  checkEquals (colnames (tbl.ref), c ('folder', 'author', 'pmid', 'organism', 'count', 'title'))
98
+
99
+     # make sure neither author nor pmid repeat
100
+  #checkEquals (length (unique (tbl.ref$author)), nrow (tbl.ref))
101
+  #checkEquals (length (unique (tbl.ref$pmid)), nrow (tbl.ref))
102
+
103
+  invisible (tbl.ref)
104
+
105
+} # test.createPublicationRefTable
106
+#------------------------------------------------------------------------------------------------------------------------
107
+test.identifyFiles = function ()
108
+{
109
+  print ('--- test.identifyFiles')
110
+
111
+    # just one legit pwm file in GD09:
112
+
113
+  files.found = identifyFiles ('All_PWMs/GD09')
114
+  checkEquals (length (files.found), 1)
115
+
116
+
117
+  files.found = identifyFiles ('.')
118
+
119
+  checkTrue (length (files.found) > 375)
120
+
121
+  invisible (files.found)
122
+
123
+
124
+} # test.identifyFiles
125
+#------------------------------------------------------------------------------------------------------------------------
126
+test.readAndParse = function (dataDir)
127
+{
128
+  print ('--- test.readAndParse')
129
+  all.files = identifyFiles (file.path(dataDir, 'All_PWMs'))
130
+  test.file = grep ('GD09', all.files, v=TRUE)
131
+  checkEquals (length (test.file), 1)
132
+  mtx = readAndParse (test.file)
133
+  checkEquals (length (mtx), 1)
134
+  checkTrue (is.matrix (mtx [[1]]))
135
+
136
+  random.sample <- c (98, 99, 251)
137
+  test.files.3 = all.files [random.sample]
138
+  mtx3 = readAndParse (test.files.3)
139
+  checkEquals (length (mtx3), 3)
140
+  expected.matrix.names <- c("PNAS08/PF14_0633.pwm", "PNAS08/PFF0200c.pwm",
141
+                             "Cell08/Pou4f3_2791.1_pwm.txt")
142
+  checkEquals(names(mtx3), expected.matrix.names)
143
+
144
+  invisible (mtx3)
145
+
146
+} # test.readAndParse
147
+#------------------------------------------------------------------------------------------------------------------------
148
+test.translateFileNameToGeneName = function ()
149
+{
150
+  print ('--- test.translateFileNameToGeneName')
151
+  checkEquals (translateFileNameToGeneName ('./All_PWMs/GD09/Nsy-7.pwm'), 'Nsy-7')
152
+  checkEquals (translateFileNameToGeneName ('Nsy-7.pwm'), 'Nsy-7')
153
+  checkEquals (translateFileNameToGeneName ('Nsy-7'), 'Nsy-7')
154
+
155
+} # test.translateFileNameToGeneName
156
+#------------------------------------------------------------------------------------------------------------------------
157
+test.createMetadata = function (dataDir)
158
+{
159
+  print ('--- test.createMetadata')
160
+  all.files = identifyFiles (file.path(dataDir,'All_PWMs'))
161
+
162
+  sample.files = all.files [c (12, 36, 259, 269, 309)]   #   sample (1:length (all.files), 5)]
163
+    # add in all entries from the NBT paper, which are a mix of organisms
164
+    # Scerevisiae;Hsapiens;Mmusculus.  want to make sure that organism is drawn from tbl.geneRef, rather than
165
+    # the old approach, which learned matrix/gene/organism relationships from tbl.pubRef
166
+  sample.files = c (sample.files, grep ('NBT', all.files, v=T))
167
+
168
+  matrices.10 = readAndParse (sample.files)
169
+  tbl.pubRef = createPublicationRefTable ()
170
+  tbl.geneRef = createGeneRefTable ()
171
+  tbl.md = createMetadata (matrices.10, tbl.pubRef, tbl.geneRef)
172
+
173
+  checkEquals (dim (tbl.md), c (10, 15))
174
+  checkEquals (tbl.md$geneId, c ("YPR104C", "YBR089C-A", "194738", "21410", "54131", "YJR060W", "179485", "5451", "YNL216W", "13653"))
175
+
176
+  checkEquals (tbl.md$geneIdType, c ("SGD", "SGD", "ENTREZ", "ENTREZ", "ENTREZ", "SGD", "ENTREZ", "ENTREZ", "SGD", "ENTREZ"))
177
+
178
+  checkEquals (tbl.md$organism, c ("Scerevisiae", "Scerevisiae", "Mmusculus", "Mmusculus", "Mmusculus", "Scerevisiae", 
179
+                                   "Celegans", "Hsapiens", "Scerevisiae", "Mmusculus"))
180
+  checkEquals (tbl.md$geneSymbol, c ("Fhl1", "Nhp6b", "Rhox11", "Tcf2", "Irf3", "Cbf1", "Ceh-22", "Oct-1", "Rap1", "Zif268"))
181
+  
182
+  checkEquals (tbl.md$proteinId,     c ("P39521", "P11633", "Q810N8", "P27889", "P70671", "P17106", "P41936", "P14859", "P11938", "P08046"))
183
+  checkEquals (tbl.md$proteinIdType, c ("UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT"))
184
+  checkEquals (tbl.md$bindingDomain, c ("Fork-head", "HMG_box", "Homeo", "Homeo", "IRF", "HLH", "Homeo", "POU", "Myb", "ZnF_C2H2"))
185
+  checkEquals (tbl.md$bindingSequence, c (NA_character_, NA_character_,
186
+                                         'PRKAYRFTPGQLWELQAVFVENQYPDALKRKELAGLLNVDEQKIKDWFNNKRAKYRK',
187
+                                         'RRNRFKWGPASQQILYQAYDRQKNPSKEEREALVEECNRAECLQRGVSPSKAHGLGSNLVTEVRVYNWFANRRKEEAF',
188
+                                         NA_character_,
189
+                                         NA_character_,
190
+                                         NA_character_,
191
+                                         NA_character_,
192
+                                         NA_character_,
193
+                                         NA_character_))
194
+
195
+
196
+  checkEquals (length (which (duplicated (tbl.md$providerName))), 0)
197
+
198
+  full.names = rownames (tbl.md)
199
+  checkEquals (length (grep ('.UP0', full.names)), nrow (tbl.md))
200
+  checkEquals (length (grep ('-UniPROBE', full.names)), nrow (tbl.md))
201
+
202
+     # should be 10 full.names, each of 3 tokens: "Scerevisiae-UniPROBE-Fhl1.UP00303"
203
+  tokens = strsplit (full.names, '-')
204
+  checkEquals (length (tokens), nrow (tbl.md))
205
+  all (sapply (tokens, function (sub.tokens) checkEquals (length (sub.tokens), 3)))
206
+
207
+  invisible (tbl.md)
208
+
209
+} # test.createMetadata
210
+#------------------------------------------------------------------------------------------------------------------------
211
+# uniprobe reports yeast Cbf1 from two studies. one is from NBT06 which is a multiple species pwm collection.  this
212
+# will be handled properly in the future (TODO) but not yet.  the more common UniPROBE practive of one-organism/one-paper
213
+# is used throughout to assign organism to pwm.  
214
+# createMetadata can return too many rows in some cases, mostly (always) 
215
+#
216
+test.createMetadata.cbf1 = function (dataDir)
217
+{
218
+  print ('--- test.createMetadata.cbf1')
219
+  all.files = identifyFiles (file.path(dataDir, 'All_PWMs'))
220
+  cbf1.files = all.files [grep ('cbf1', all.files, ignore.case=T)]
221
+  stopifnot (length (cbf1.files) == 2)   # one in GR09, one in NBT06
222
+
223
+  matrices.cbf1 = readAndParse (cbf1.files)
224
+  tbl.pubRef = createPublicationRefTable ()
225
+  tbl.geneRef = createGeneRefTable ()
226
+  tbl.md = createMetadata (matrices.cbf1, tbl.pubRef, tbl.geneRef)
227
+  checkEquals (dim (tbl.md), c (2, 15))
228
+  checkEquals (tbl.md$geneSymbol, c ('Cbf1', 'Cbf1'))
229
+  checkEquals (tbl.md$providerName, c ('GR09/Cbf1.pwm', 'NBT06/Cbf1.pwm'))
230
+
231
+  invisible (tbl.md)
232
+
233
+} # test.createMetadata.cbf1
234
+#------------------------------------------------------------------------------------------------------------------------
235
+# some of the metadata for matrices in Cell08, obtained through sql in createGeneRefTable,  ended up with a
236
+# trailing blank ('Homeo ') in the bindingDomain field.  make sure this is fixed.
237
+# currently the fix is to define and use a string 'trim' method in createMetaData ()
238
+test.createMetadata.fixTrailingSpaceInBindingDomain = function (dataDir)
239
+{
240
+  print ('--- test.createMetadata.fixTrailingSpaceInBindingDomain')
241
+  all.files = identifyFiles (file.path(dataDir,'All_PWMs'))
242
+  sample.trailingSpaceCulpritFiles = head (grep ('Irx6', all.files))
243
+  stopifnot (length (sample.trailingSpaceCulpritFiles) == 1) 
244
+  filenames = all.files [sample.trailingSpaceCulpritFiles]
245
+  matrices.tmp = readAndParse (filenames)
246
+  tbl.pubRef = createPublicationRefTable ()
247
+  tbl.geneRef = createGeneRefTable ()
248
+    # the fix is in here.  
249
+  tbl.md = createMetadata (matrices.tmp, tbl.pubRef, tbl.geneRef)
250
+  checkEquals (dim (tbl.md), c (1, 15))
251
+  checkEquals (tbl.md$bindingDomain, "Homeo")
252
+
253
+} # test.createMetadata.fixTrailingSpaceInBindingDomain
254
+#------------------------------------------------------------------------------------------------------------------------
255
+# uniprobe reports sox4 in both mouse and human.  these different organisms must be recognized for this to work.
256
+# it used to fail when organism was variously (e.g.) 'Homo sapiens' and 'Hsapiens'.
257
+# the function standardizeSpeciesNames was added to fix this, converting everything in tbl.geneRef$species
258
+#
259
+test.createMetadata.sox4 = function (dataDir)
260
+{
261
+  print ('--- test.createMetadata.sox4')
262
+  all.files = identifyFiles (file.path(dataDir, 'All_PWMs'))
263
+  sox4.files = all.files [grep ('sox4', all.files, ignore.case=T)]
264
+  stopifnot (length (sox4.files) == 2)   # one in CR09, one in SCI09
265
+
266
+  matrices.sox4 = readAndParse (sox4.files)
267
+  tbl.pubRef = createPublicationRefTable ()
268
+  tbl.geneRef = createGeneRefTable ()
269
+  tbl.md = createMetadata (matrices.sox4, tbl.pubRef, tbl.geneRef)
270
+  checkEquals (dim (tbl.md), c (2, 15))
271
+
272
+} # test.createMetadata.sox4
273
+#------------------------------------------------------------------------------------------------------------------------
274
+test.extractNativeNames = function (dataDir)
275
+{
276
+  print ('--- test.extractNativeNames')
277
+
278
+     # check a random sample, extract called one raw name at at time
279
+
280
+  checkEquals (extractNativeNames ("Cell08/Lhx8_2247.2_pwm.txt"), 'Lhx8')
281
+  checkEquals (extractNativeNames ("Cell08/Phox2a_3947.1_pwm.txt"), 'Phox2a')
282
+
283
+  checkEquals (extractNativeNames ("GR09/Srd1.pwm"), 'Srd1')
284
+  checkEquals (extractNativeNames ("Cell08/Pou3f3_3235.2_pwm.txt"), 'Pou3f3')
285
+  checkEquals (extractNativeNames ("Cell08/Msx3_3206.1_pwm.txt"), 'Msx3')
286
+
287
+    # now do the sample again in one call
288
+
289
+  names.5 = extractNativeNames (c ("Cell08/Lhx8_2247.2_pwm.txt",
290
+				   "Cell08/Phox2a_3947.1_pwm.txt",
291
+				   "GR09/Srd1.pwm",
292
+				   "Cell08/Pou3f3_3235.2_pwm.txt",
293
+				   "Cell08/Msx3_3206.1_pwm.txt"))
294
+  checkEquals (names.5, c ("Lhx8", "Phox2a", "Srd1", "Pou3f3", "Msx3"))
295
+
296
+  all.files = identifyFiles (file.path(dataDir,'All_PWMs'))
297
+  matrices.all = readAndParse (all.files)
298
+  all.raw.names = names (matrices.all)
299
+  cooked.names.all = c ()
300
+
301
+  for (raw.name in all.raw.names) {
302
+    cooked.name = extractNativeNames (raw.name)
303
+    cooked.names.all = c (cooked.names.all, cooked.name)
304
+    #printf ('%30s: %10s', raw.name, cooked.name)
305
+    } # for raw.name
306
+  
307
+  cooked.names.all.v2 = extractNativeNames (all.raw.names)
308
+  checkEquals (cooked.names.all, cooked.names.all.v2)
309
+
310
+  invisible (cooked.names.all)
311
+
312
+} # test.extractNativeNames
313
+#------------------------------------------------------------------------------------------------------------------------
314
+test.uniprotToStandardID = function ()
315
+{
316
+  print ('--- test.uniprotToStandardID')
317
+  checkEquals (uniprotToStandardID ('mouse', 'Q14A64')$std, '11298')
318
+  checkEquals (uniprotToStandardID ('human', 'P14859')$std, '5451')
319
+  checkEquals (uniprotToStandardID ('yeast', 'P17106')$std, "YJR060W")
320
+  checkEquals (uniprotToStandardID ('worm',  'P41936')$std, "179485")
321
+
322
+  yeast.uids = c ("P22149", "Q04052", "P40467", "P22035", "P17106")
323
+  checkEquals (uniprotToStandardID ('yeast', yeast.uids)$std,
324
+               c ("YGL071W", "YDR421W", "YIL130W", "YKR099W", "YJR060W"))
325
+
326
+  checkTrue (is.na (uniprotToStandardID ('mouse', 'Q8BI45')$std))
327
+  mouse.uids = c ("O70137", "O35137", "Q62431", "Q8BI45", "O35085", "O35885")
328
+  result = uniprotToStandardID ('mouse', mouse.uids)
329
+  checkEquals (result$std, c ("11694", "11695", "13496", NA, "11878", "17173"))
330
+
331
+  worm.uniprots =  c ("P41936", "Q4PIU8")
332
+  result = uniprotToStandardID ('worm', worm.uniprots)
333
+  checkEquals (result$uniprot, worm.uniprots)
334
+  checkEquals (result$std, c ('179485', NA))
335
+
336
+     # expose and solve a problem with duplicates
337
+     # "P02831"  "P97436" "Q9QZ28" "Q66JY7" "P08046" 
338
+  mouse.uids = c ("P02831", "P02831", '', '')
339
+  result = uniprotToStandardID ('mouse', mouse.uids)
340
+  checkEquals (result$uniprot, mouse.uids)
341
+  checkEquals (result$std, c ('15400', '15400', NA, NA))
342
+
343
+     # mouse Zic[123] maps to geneID 22771-3.  but 22772 is not found for Zic2.  why?
344
+  checkEquals (uniprotToStandardID ('mouse', 'P46684')$std, '22771')
345
+  #checkEquals (uniprotToStandardID ('mouse', 'Q62520')$std, '22772')    TODO!
346
+  checkEquals (uniprotToStandardID ('mouse', 'Q62521')$std, '22773')
347
+
348
+} # test.uniprotToStandardID
349
+#------------------------------------------------------------------------------------------------------------------------
350
+test.createGeneRefTable = function ()
351
+{ 
352
+  print ('--- test.createGeneRefTable')
353
+  tbl.geneRef = createGeneRefTable ()  
354
+  checkEquals (length (grep (' ', tbl.geneRef$species)), 0)   # no spaces in the species names
355
+  checkEquals (ncol (tbl.geneRef), 13)
356
+  checkEquals (sort (colnames (tbl.geneRef)),
357
+               c ("bindingSequence", "domain", "folder_name", "id", "ihop", "name", "pmid", "pub", "refseq_id", "species",  
358
+                   "stdID", "type", "uniprot"))
359
+  checkEquals (nrow (tbl.geneRef), 372)
360
+  checkEquals (length (which (duplicated (tbl.geneRef [, 1:2]))), 0)
361
+  checkEquals (length (which (!is.na (tbl.geneRef$bindingSequence))), 168)
362
+  
363
+  invisible (tbl.geneRef)
364
+
365
+} # test.createGeneRefTable
366
+#------------------------------------------------------------------------------------------------------------------------
367
+test.getAllStandardIDs = function ()
368
+{
369
+  print ('--- test.getAllStandardIDs')
370
+  if (!exists ('tbl.geneRef'))
371
+     tbl.geneRef =  createGeneRefTable ()  
372
+
373
+  stdID = getAllStandardIDs (tbl.geneRef)
374
+  checkEquals (length (stdID), nrow (tbl.geneRef))
375
+  tbl.g2 =  cbind (tbl.geneRef, stdID, stringsAsFactors=FALSE)
376
+  checkEquals (nrow (tbl.g2), nrow (tbl.geneRef))
377
+
378
+     # every row in the table for a yeast pwm should have stdID of NA or start with Y
379
+  should.be.orf.names = subset (tbl.g2, species=='Scerevisiae')$stdID
380
+  checkEquals ((length (which (is.na (should.be.orf.names))) +
381
+                length (grep ('^Y', should.be.orf.names))),
382
+               length (should.be.orf.names))
383
+    # no other rows should have stdID starting with Y
384
+  should.not.be.orf.names = subset (tbl.g2, species!='Scerevisiae')$stdID
385
+  checkEquals (length (grep ('^Y', should.not.be.orf.names)), 0)
386
+
387
+  checkTrue (length (which (is.na (tbl.g2$stdID))) < 70)
388
+
389
+ 	    # now handcheck a few from each species
390
+  checkEquals (subset (tbl.g2, name=='Cbf1')$stdID, c ("YJR060W", "YJR060W"))
391
+  checkEquals (subset (tbl.g2, name=='Yox1')$stdID, "YML027W")
392
+
393
+  invisible (tbl.g2)
394
+
395
+} # test.getAllStandardIDs
396
+#------------------------------------------------------------------------------------------------------------------------
397
+test.standardizeSpeciesNames = function ()
398
+{
399
+  print ('--- test.standardizeSpeciesNames')
400
+  old.style = c ("Saccharomyces cerevisiae", "Mus musculus", "Caenorhabditis elegans", "Cryptosporidium parvum", 
401
+                 "Vibrio harveyi", "Homo sapiens", "Plasmodium falciparum")
402
+  new.style = standardizeSpeciesNames (old.style [1])
403
+  checkEquals (new.style, 'Scerevisiae')
404
+
405
+  new.style = standardizeSpeciesNames (old.style)
406
+  checkEquals (new.style, c ("Scerevisiae", "Mmusculus", "Celegans", "Cparvum", "Vharveyi", "Hsapiens", "Pfalciparum"))
407
+
408
+} # test.standardizeSpeciesNames
409
+#------------------------------------------------------------------------------------------------------------------------
410
+test.renameMatrices = function (matrices, tbl.md, tbl.anno)
411
+{
412
+  print ('--- test.renameMatrices')
413
+
414
+} # test.renameMatrices
415
+#------------------------------------------------------------------------------------------------------------------------
416
+# Scerevisiae-UniPROBE-Gsm1.UP00301, GR09/Gsm1.pwm, UP00301, Gsm1, NA, NA, "", UNIPROT, NA,  Scerevisiae, Zn2Cys67
417
+test.emptyStringProteinId = function (dataDir)
418
+{
419
+  print ('--- test.emptyStringProteinId')
420
+  all.files = identifyFiles (file.path(dataDir, 'All_PWMs'))
421
+  filename.gsm1 = grep ('Gsm1', all.files, v=T)
422
+  mtx = readAndParse (filename.gsm1)
423
+  #browser (text='test.emptyStringProteinId')
424
+  tbl.pubRef = createPublicationRefTable ()
425
+  tbl.geneRef = createGeneRefTable ()
426
+  tbl.md = createMetadata (mtx, tbl.pubRef, tbl.geneRef)
427
+  checkEquals (nrow (tbl.md), 1)
428
+  md.list = as.list (tbl.md)
429
+  checkEquals (md.list$proteinId, 'NP_012432')
430
+  checkEquals (md.list$proteinIdType, 'REFSEQ')
431
+
432
+
433
+} # test.emptyStringProteinId 
434
+#------------------------------------------------------------------------------------------------------------------------
435
+test.createMatrixNameUniqifier = function ()
436
+{
437
+  print ('--- test.createMatrixNameUniqifier')
438
+  
439
+  data = c (8,8,8,7,2,2,5,6,5,3,10,5,8,6,9,5,8,8,4,5,8,6,9,2,1,0,5,7,7,2,4,4,3,7,7,9,9,6,1,3)
440
+  test.matrix = matrix (data=data, nrow=4, ncol=10)
441
+  uniqifier = createMatrixNameUniqifier (test.matrix)
442
+  xxx <<- uniqifier
443
+  checkEquals (uniqifier, "b42f")
444
+
445
+} # test.createMatrixNameUniqifier
446
+#------------------------------------------------------------------------------------------------------------------------