Browse code

minor print statement changes

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75484 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 09/04/2013 23:05:44
Showing1 changed files
... ...
@@ -13,6 +13,7 @@ printf <- function(...) print(noquote(sprintf(...)))
13 13
 run = function (dataDir)
14 14
 {
15 15
   dataDir <- file.path(dataDir, "uniprobe")
16
+  stopifnot(file.exists(dataDir))
16 17
 
17 18
   all.files = identifyFiles (file.path(dataDir, 'All_PWMs'))
18 19
   matrices = readAndParse (all.files)
Browse code

added and tested uniprobe to the 'importAll' operation

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75475 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 09/04/2013 20:49:43
Showing1 changed files
... ...
@@ -12,7 +12,9 @@ printf <- function(...) print(noquote(sprintf(...)))
12 12
 #------------------------------------------------------------------------------------------------------------------------
13 13
 run = function (dataDir)
14 14
 {
15
-  all.files = identifyFiles (file.path(dataDir,'All_PWMs'))
15
+  dataDir <- file.path(dataDir, "uniprobe")
16
+
17
+  all.files = identifyFiles (file.path(dataDir, 'All_PWMs'))
16 18
   matrices = readAndParse (all.files)
17 19
   tbl.pubRef = createPublicationRefTable ()
18 20
   tbl.geneRef = createGeneRefTable (dataDir)
... ...
@@ -105,6 +107,8 @@ createPublicationRefTable = function ()
105 107
 #------------------------------------------------------------------------------------------------------------------------
106 108
 identifyFiles = function (filePath)
107 109
 {
110
+  stopifnot(file.exists(filePath))
111
+  
108 112
   cmd = sprintf ('find %s -name "*pwm*"', filePath)
109 113
 
110 114
   files.raw = system (cmd, intern=TRUE)
... ...
@@ -379,7 +383,6 @@ createGeneRefTable = function (dataDir)
379 383
 {
380 384
   if (!exists ('db')){
381 385
       dbFile <- file.path(dataDir, "uniprobe.sqlite")
382
-      browser("dbFile")
383 386
       stopifnot(file.exists(dbFile))
384 387
       db <<- dbConnect (dbDriver("SQLite"), dbFile)
385 388
       }
Browse code

uniprobe importer now gets metadata from sqlite, rather than requiring a running MySQL

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75474 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 09/04/2013 19:52:26
Showing1 changed files
... ...
@@ -1,6 +1,7 @@
1 1
 # uniprobe/import.R
2 2
 #------------------------------------------------------------------------------------------------------------------------
3
-library (RMySQL)
3
+#library (RMySQL)
4
+library (RSQLite)
4 5
 library (org.Hs.eg.db)
5 6
 library (org.Mm.eg.db)
6 7
 library (org.Sc.sgd.db)
... ...
@@ -14,7 +15,7 @@ run = function (dataDir)
14 15
   all.files = identifyFiles (file.path(dataDir,'All_PWMs'))
15 16
   matrices = readAndParse (all.files)
16 17
   tbl.pubRef = createPublicationRefTable ()
17
-  tbl.geneRef = createGeneRefTable ()
18
+  tbl.geneRef = createGeneRefTable (dataDir)
18 19
   tbl.md = createMetadata (matrices, tbl.pubRef, tbl.geneRef)
19 20
   stopifnot (length (matrices) == nrow (tbl.md))
20 21
   matrices = renameMatrices (matrices, tbl.md)
... ...
@@ -28,9 +29,9 @@ run = function (dataDir)
28 29
 #------------------------------------------------------------------------------------------------------------------------
29 30
 createMatrixNameUniqifier = function (matrix)
30 31
 {
31
-  temporary.file <<- tempfile ()
32
+  temporary.file <- tempfile ()
32 33
   write (as.character (matrix), file=temporary.file)
33
-  md5sum.string <<- as.character (md5sum (temporary.file))
34
+  md5sum.string <- as.character (md5sum (temporary.file))
34 35
   stopifnot (nchar (md5sum.string) == 32)
35 36
   md5.6chars = substr (md5sum.string, 29, 32)
36 37
   #unlink (temporary.file)
... ...
@@ -51,13 +52,10 @@ parsePWMfromText = function (lines.of.text)
51 52
     tokens = strsplit (line, '\\s*[:\\|]') [[1]]
52 53
     nucleotide = tokens [1]
53 54
     numbers.raw = tokens [2]
54
-    zz = numbers.raw
55 55
     number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]]
56 56
     while (nchar (number.tokens [1]) == 0)
57 57
       number.tokens = number.tokens [-1]
58
-    zzz = number.tokens
59 58
     numbers = as.numeric (number.tokens)
60
-    zzzz = numbers
61 59
     #printf ('adding %s: %s', nucleotide, list.to.string (numbers))
62 60
     result [nucleotide,] = numbers
63 61
     }
... ...
@@ -82,10 +80,6 @@ extractPWMfromFile = function (filename)
82 80
 #------------------------------------------------------------------------------------------------------------------------
83 81
 createPublicationRefTable = function ()
84 82
 {
85
-  # tbl.pubmed <<- data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'),
86
-  #                     pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'),
87
-  #                     stringsAsFactors=FALSE)
88
-
89 83
   options (stringsAsFactors=FALSE)
90 84
   tbl.ref = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'))
91 85
   tbl.ref = cbind (tbl.ref, author=c('Scharer', 'Berger', 'Grove', 'Wei', 'Lesch', 'Zhu', 'Pompeani', 'Berger', 'De Silva', 'Badis'))
... ...
@@ -146,15 +140,11 @@ readAndParse = function (file.list)
146 140
   for (file in file.list) {
147 141
     #printf ('read and parse %s', file)
148 142
     text = scan (file, sep='\n', what=character (0), quiet=TRUE)
149
-    aaa <<- text
150 143
     matrix.start.lines = grep ('A:', text)
151 144
     stopifnot (length (matrix.start.lines) == 1)
152 145
     start.line = matrix.start.lines [1]
153 146
     end.line = start.line + 3
154
-    bbb <<- start.line
155
-    ccc <<- end.line
156 147
     lines.of.text = text [start.line:end.line]
157
-    zzz <<- lines.of.text
158 148
     pwm.matrix = parsePWMfromText (lines.of.text)
159 149
     name.tokens <- strsplit(file,"/")[[1]]
160 150
     token.count <- length(name.tokens)
... ...
@@ -295,9 +285,7 @@ createMetadata = function (matrices, tbl.pubRef, tbl.geneRef)
295 285
                     tfFamily=tfFamily,
296 286
                     experimentType=experimentType,
297 287
                     pubmedID=pubmedID)
298
-    #if (native.name == 'Cart1')  browser (text='Cart1')
299 288
     if (native.name.uniq %in% rownames (tbl.md)) browser (text='dup row name')
300
-    #print (new.row)
301 289
     tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
302 290
     if (length (native.name.uniq) != 1) browser (text='native.name.unique length != 1')
303 291
     rownames (tbl.md) [m] = native.name.uniq
... ...
@@ -387,10 +375,14 @@ uniprotToStandardID = function (organism, uniprot.ids)
387 375
 #------------------------------------------------------------------------------------------------------------------------
388 376
 # see ~/v/snotes/log "* load uniprobe sql dump file (11 may 2012)" for info on the uniprobe mysql database 
389 377
 # used here.  
390
-createGeneRefTable = function ()
378
+createGeneRefTable = function (dataDir)
391 379
 {
392
-  if (!exists ('db'))
393
-    db <<- dbConnect (MySQL (), dbname='uniprobe')
380
+  if (!exists ('db')){
381
+      dbFile <- file.path(dataDir, "uniprobe.sqlite")
382
+      browser("dbFile")
383
+      stopifnot(file.exists(dbFile))
384
+      db <<- dbConnect (dbDriver("SQLite"), dbFile)
385
+      }
394 386
 
395 387
   tbl.pubmed = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'),
396 388
                  pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'),
Browse code

removed all explicit repo paths

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75381 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 20:35:47
Showing1 changed files
... ...
@@ -9,9 +9,6 @@ library(tools)   # for md5sum
9 9
 #------------------------------------------------------------------------------------------------------------------------
10 10
 printf <- function(...) print(noquote(sprintf(...)))
11 11
 #------------------------------------------------------------------------------------------------------------------------
12
-kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/uniprobe"
13
-kDataDir <- "~/s/data/public/TFBS/uniprobe"
14
-#------------------------------------------------------------------------------------------------------------------------
15 12
 run = function (dataDir)
16 13
 {
17 14
   all.files = identifyFiles (file.path(dataDir,'All_PWMs'))
... ...
@@ -25,7 +22,7 @@ run = function (dataDir)
25 22
   serializedFile <- "uniprobe.RData"
26 23
   save (matrices, tbl.md, file=serializedFile)
27 24
   printf("saved %d matrices to %s", length(matrices), serializedFile)
28
-  printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
25
+  printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
29 26
 
30 27
 } # run
31 28
 #------------------------------------------------------------------------------------------------------------------------
Browse code

first version

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75333 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 04:53:56
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,519 @@
1
+# uniprobe/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library (RMySQL)
4
+library (org.Hs.eg.db)
5
+library (org.Mm.eg.db)
6
+library (org.Sc.sgd.db)
7
+library (org.Ce.eg.db)
8
+library(tools)   # for md5sum
9
+#------------------------------------------------------------------------------------------------------------------------
10
# printf: C-style formatted printing -- formats the arguments with sprintf,
# then prints the resulting string(s) without surrounding quotes
printf <- function(...) {
  formatted <- sprintf(...)
  print(noquote(formatted))
}
11
+#------------------------------------------------------------------------------------------------------------------------
12
# location of the uniprobe data.  the original code assigned kDataDir twice in a row,
# so only the second value was ever in effect; the first is kept here as a note.
#   alternate location:  "/shared/silo_researcher/Morgan_M/BioC/MotifDb/uniprobe"
# NOTE(review): run() takes its directory as an argument and does not read kDataDir;
# presumably this constant is only for interactive use -- confirm before relying on it.
kDataDir <- "~/s/data/public/TFBS/uniprobe"
14
+#------------------------------------------------------------------------------------------------------------------------
15
# run: top-level driver for the uniprobe import
#   dataDir: directory which contains the 'All_PWMs' folder of uniprobe matrix files
# parses every matrix file found, builds the metadata table, renames the matrices
# after the metadata rownames, then saves 'matrices' and 'tbl.md' to "uniprobe.RData"
# in the current working directory.
# stops with an error if <dataDir>/All_PWMs does not exist.
run <- function(dataDir) {
  pwm.dir <- file.path(dataDir, 'All_PWMs')

    # fail early and clearly, rather than much later with an obscure error
  if (!file.exists(pwm.dir)) {
    stop(sprintf("PWM directory not found: %s", pwm.dir))
  }

  all.files <- identifyFiles(pwm.dir)
  matrices <- readAndParse(all.files)
  tbl.pubRef <- createPublicationRefTable()
  tbl.geneRef <- createGeneRefTable()
  tbl.md <- createMetadata(matrices, tbl.pubRef, tbl.geneRef)
  stopifnot(length(matrices) == nrow(tbl.md))
  matrices <- renameMatrices(matrices, tbl.md)

  serializedFile <- "uniprobe.RData"
  save(matrices, tbl.md, file=serializedFile)
  printf("saved %d matrices to %s", length(matrices), serializedFile)
  printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)

} # run
31
+#------------------------------------------------------------------------------------------------------------------------
32
# createMatrixNameUniqifier: derive a short suffix from the contents of a matrix, used
# to tell apart two matrices which would otherwise get the same name.
#   matrix: the matrix whose values are to be fingerprinted
# returns the last 4 characters (positions 29-32) of the md5 checksum of the matrix
# values written out as text.
# unlike the original, nothing is assigned into the global environment, and the
# temporary file is removed on exit.
createMatrixNameUniqifier <- function(matrix) {
  temporary.file <- tempfile()
  on.exit(unlink(temporary.file), add=TRUE)

  write(as.character(matrix), file=temporary.file)
  md5sum.string <- as.character(tools::md5sum(temporary.file))
  stopifnot(nchar(md5sum.string) == 32)

  substr(md5sum.string, 29, 32)

} # createMatrixNameUniqifier
42
+#------------------------------------------------------------------------------------------------------------------------
43
# parsePWMfromText: convert the four text lines of a position weight matrix into a
# numeric matrix.
#   lines.of.text: character vector, one line per nucleotide, each of the form
#                  '<nucleotide>:' (or '<nucleotide>|') followed by whitespace-separated numbers
# returns a 4-row matrix with rownames A, C, G, T and columns named 1..n
# the column count is taken from the number of tab-separated fields on the first line.
parsePWMfromText <- function(lines.of.text) {
  nucleotides <- c('A', 'C', 'G', 'T')

    # sanity check: across the four letters, a total of four line matches
  lines.per.letter <- vapply(nucleotides,
                             function(token) length(grep(token, lines.of.text)),
                             integer(1))
  stopifnot(sum(lines.per.letter) == 4)

    # determine the number of columns
  token.count.first.line <- length(strsplit(lines.of.text[1], '\t')[[1]])
  column.count <- token.count.first.line - 1  # subtract off the 'A:\t' token
  if (column.count < 1) {
    stop("could not find any tab-separated matrix columns in: ", lines.of.text[1])
  }

  result <- matrix(nrow=4, ncol=column.count, byrow=TRUE,
                   dimnames=list(nucleotides, seq_len(column.count)))

  for (line in lines.of.text) {
    tokens <- strsplit(line, '\\s*[:\\|]')[[1]]
    nucleotide <- tokens[1]
    number.tokens <- strsplit(tokens[2], '\\s+', perl=TRUE)[[1]]
      # whitespace right after the separator produces a leading empty token; drop it
    number.tokens <- number.tokens[nzchar(number.tokens)]
    result[nucleotide, ] <- as.numeric(number.tokens)
  }

  result

} # parsePWMfromText
71
+#------------------------------------------------------------------------------------------------------------------------
72
# extractPWMfromFile: read one matrix file and return its parsed matrix
#   filename: path to a file containing exactly one line matching 'A' followed by
#             optional whitespace and ':' or '|'
# that line and the three lines after it are handed to parsePWMfromText
extractPWMfromFile <- function(filename) {
  file.lines <- scan(filename, sep='\n', what=character(0), quiet=TRUE)

  first.row <- grep('A\\s*[:\\|]', file.lines)
  stopifnot(length(first.row) == 1)

  parsePWMfromText(file.lines[first.row:(first.row + 3)])

} # extractPWMfromFile
85
+#------------------------------------------------------------------------------------------------------------------------
86
# createPublicationRefTable: build the hard-coded table of uniprobe publications.
# returns a data.frame with one row per publication folder and the columns
# folder, author, pmid, organism, count, title (all character, except numeric count).
# note: sets the global option stringsAsFactors to FALSE, as the original did.
createPublicationRefTable <- function() {
  options(stringsAsFactors=FALSE)

  folders <- c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09')
  authors <- c('Scharer', 'Berger', 'Grove', 'Wei', 'Lesch', 'Zhu', 'Pompeani', 'Berger', 'De Silva', 'Badis')
  pmids <- c('19147588', '18585359', '19632181', '20517297', '19204119',
             '19158363', '18681939', '16998473', '18541913', '19443739')
  organisms <- c('Hsapiens', 'Mmusculus', 'Celegans', 'Mmusculus', 'Celegans', 'Scerevisiae', 'Vharveyi',
                 'Scerevisiae;Hsapiens;Mmusculus', 'Apicomplexa', 'Mmusculus')
  counts <- c(1, 168, 34, 25, 1, 89, 1, 5, 3, 104)
  titles <- c('Genome-wide promoter analysis of the SOX4 transcriptional network in prostate cancer cells',
              'Variation in homeodomain DNA binding revealed by high-resolution analysis of sequence preferences',
              'A Multiparameter Network Reveals Extensive Divergence between C. elegans bHLH Transcription Factors',
              'Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo',
              'Transcriptional regulation and stabilization of left right neuronal identity in C. elegans',
              'High-resolution DNA-binding specificity analysis of yeast transcription factors',
              'The Vibrio harveyi master quorum-sensing regulator, LuxR...',
              'Compact, universal DNA microarrays...',
              'Specific DNA-binding by Apicomplexan AP2 transcription factors',
              'Diversity and complexity in DNA recognition by transcription factors')

  data.frame(folder=folders,
             author=authors,
             pmid=pmids,
             organism=organisms,
             count=counts,
             title=titles)

} # createPublicationRefTable
114
+#------------------------------------------------------------------------------------------------------------------------
115
# identifyFiles: find the matrix files to import beneath a directory.
#   filePath: directory to search (recursively)
# returns (invisibly) a character vector of paths, each beginning with filePath:
# every file whose name contains 'pwm' and ends in '.pwm' or '.txt' (any case),
# '.pwm' files listed first, minus any file whose path below filePath contains
# 'RC', 'EMBO', 'Cell09' or 'secondary'.
# stops with an error if filePath does not exist.
#
# differences from the original:
#   - uses list.files rather than a shell 'find', so paths with spaces or shell
#     metacharacters work, and no external program is needed
#   - the dot in the suffix patterns is escaped, so only true '.pwm'/'.txt' suffixes match
#   - the exclusion patterns are applied to the path below filePath only, so that
#     (for example) an 'RC' somewhere in the parent directory no longer excludes everything
identifyFiles <- function(filePath) {
  if (!file.exists(filePath)) {
    stop(sprintf("directory not found: %s", filePath))
  }

    # relative paths of all files (hidden ones included) with 'pwm' in their name
  candidates <- list.files(filePath, pattern='pwm', recursive=TRUE, all.files=TRUE)

    # all legit files end in ".pwm" or ".txt"
  files <- c(grep('\\.pwm$', candidates, value=TRUE, ignore.case=TRUE),
             grep('\\.txt$', candidates, value=TRUE, ignore.case=TRUE))

    # RC:         reverse complement matrices
    # EMBO:       the EMBO10 folder; the original author noted that the reason for
    #             excluding these was unknown
    # Cell09:     to be included once the sql tables are updated
    # secondary:  secondary matrices
  for (excluded in c('RC', 'EMBO', 'Cell09', 'secondary')) {
    files <- files[!grepl(excluded, files)]
  }

  invisible(file.path(filePath, files))

} # identifyFiles
144
+#------------------------------------------------------------------------------------------------------------------------
145
# readAndParse: read every file in file.list and parse the matrix found in each.
#   file.list: character vector of file paths
# returns (invisibly) a list of matrices, named by the last two components of each
# path (for instance 'GD09/Nsy-7.pwm').
# each file must have exactly one line containing 'A:'; that line and the next three
# are passed to parsePWMfromText.  errors name the offending file.
# unlike the original, no debugging variables are written to the global environment.
readAndParse <- function(file.list) {
  matrices <- list()

  for (file in file.list) {
    text <- scan(file, sep='\n', what=character(0), quiet=TRUE)

    matrix.start.lines <- grep('A:', text)
    if (length(matrix.start.lines) != 1) {
      stop(sprintf("expected exactly one 'A:' line in %s, found %d",
                   file, length(matrix.start.lines)))
    }

    start.line <- matrix.start.lines[1]
    end.line <- start.line + 3
    if (end.line > length(text)) {
      stop(sprintf("matrix in %s is truncated: fewer than 4 lines from the 'A:' line on", file))
    }

    pwm.matrix <- parsePWMfromText(text[start.line:end.line])

      # name the matrix <folder>/<file>
    name.tokens <- strsplit(file, "/")[[1]]
    token.count <- length(name.tokens)
    matrix.name <- paste(name.tokens[(token.count-1):token.count], collapse="/")
    matrices[[matrix.name]] <- pwm.matrix
  }

  invisible(matrices)

} # readAndParse
172
+#------------------------------------------------------------------------------------------------------------------------
173
# translateFileNameToGeneName: reduce a matrix file path to a bare gene name
# eg, ./All_PWMs/GD09/Nsy-7.pwm to simply 'Nsy-7'
#   long.name: a file name, with or without leading directories
# keeps the last '/'-separated component, then everything before its first '.'
translateFileNameToGeneName <- function(long.name) {
  file.part <- long.name

  if (any(grepl('/', long.name))) {
    path.parts <- strsplit(long.name, '/')[[1]]
    file.part <- path.parts[length(path.parts)]  # get the last one
  }

    # remove any file suffix
  strsplit(file.part, '\\.')[[1]][1]

} # translateFileNameToGeneName
191
+#------------------------------------------------------------------------------------------------------------------------
192
# createMetadata: build the metadata table, one row per matrix, and the unique
# names by which the matrices will be known.
#   matrices:    named list of matrices; names are '<folder>/<file>', possibly
#                prefixed with 'All_PWMs/'
#   tbl.pubRef:  publication table; columns folder and pmid are used
#   tbl.geneRef: gene table; columns name, folder_name, id, stdID, domain, species,
#                uniprot, refseq_id and bindingSequence are used
# returns (invisibly) a data.frame whose rownames have the form
#   <organism>-UniPROBE-<gene name, dashes replaced by underscores>.<UPnnnnn>
# with a checksum-derived suffix appended when that name has already been used.
#
# differences from the original:
#   - conditions which used to drop into browser() now stop with a message
#   - the matching tbl.geneRef row is looked up once per matrix, not once per field
#   - an empty 'matrices' list yields an empty data.frame rather than an error
#   - rows are collected in a list and combined once, rather than rbind-ed one by one
# note: sets the global option stringsAsFactors to FALSE, as the original did.
createMetadata <- function(matrices, tbl.pubRef, tbl.geneRef) {
  options(stringsAsFactors=FALSE)
  dataSource <- 'UniPROBE'
  trim <- function(s) {sub(' +$', '', sub('^ +', '', s))}

  rows <- vector('list', length(matrices))
  assigned.names <- character(length(matrices))

  for (m in seq_along(matrices)) {
    matrix.name <- names(matrices)[m]
    native.name.raw <- gsub('All_PWMs/', '', matrix.name)
    native.name <- extractNativeNames(native.name.raw)

       # extractNativeNames needs to be rethought.  but for now, just hack in some fixes
    if (native.name == 'Cgd2') native.name <- 'Cgd2_3490'   # inconsistency at uniprobe, accommodated here
    if (native.name == 'PF14') native.name <- 'PF14_0633'
    if (native.name == 'Uncx4') native.name <- 'Uncx4.1'

    if (!native.name %in% tbl.geneRef$name) {
      stop(sprintf("gene name '%s' (matrix '%s') not found in tbl.geneRef", native.name, matrix.name))
    }

    experiment.folder <- strsplit(native.name.raw, '/')[[1]][1]

       # the one gene table row for this gene in this publication folder
    matching.rows <- which(tbl.geneRef$name == native.name & tbl.geneRef$folder_name == experiment.folder)
    if (length(matching.rows) != 1) {
      stop(sprintf("expected exactly one tbl.geneRef row for gene '%s' in folder '%s', found %d",
                   native.name, experiment.folder, length(matching.rows)))
    }
    gene.row <- tbl.geneRef[matching.rows, , drop=FALSE]

    full.uniprobe.id <- sprintf('UP%05d', gene.row$id)

    geneId <- gene.row$stdID
    if (is.na(geneId) || geneId == '') {
      geneId <- NA_character_
    }

    bindingDomain <- trim(gene.row$domain)
    organism <- gene.row$species

      # a native name (a gene symbol) may have a dash, but for the long name, we want dashes to separate
      # the organism-dataSource-geneIdentifier matrix name.
      # so convert this here, but do not eclipse any dash-including native.names
    native.name.no.dashes <- gsub('-', '_', native.name)
    native.name.uniq <- sprintf('%s-%s-%s.%s', organism, dataSource, native.name.no.dashes, full.uniprobe.id)
    if (length(native.name.uniq) != 1) {
      stop(sprintf("could not build a single unique name for matrix '%s'", matrix.name))
    }

    names.so.far <- assigned.names[seq_len(m - 1)]
    if (native.name.uniq %in% names.so.far) {
      uniqifier <- createMatrixNameUniqifier(matrices[[m]])
      native.name.uniq <- paste(native.name.uniq, uniqifier, sep='.')
    }
    if (native.name.uniq %in% names.so.far) {
      stop(sprintf("duplicate row name '%s' for matrix '%s'", native.name.uniq, matrix.name))
    }

    tfFamily <- NA_character_
    experimentType <- 'protein binding microarray'
    pubmedID <- tbl.pubRef$pmid[which(tbl.pubRef$folder == experiment.folder)]

    if (is.na(geneId)) {
      geneIdType <- NA
    } else if (organism == 'Scerevisiae') {
      geneIdType <- 'SGD'
    } else if (organism %in% c('Mmusculus', 'Hsapiens', 'Celegans')) {
      geneIdType <- 'ENTREZ'
    } else {
      geneIdType <- 'todo'
    }

      # prefer the uniprot id; fall back to refseq only when uniprot is missing or empty
    proteinId <- NA_character_
    proteinIdType <- NA_character_

    proteinId.tmp <- gene.row$uniprot
    if (!is.na(proteinId.tmp) && nchar(proteinId.tmp) > 1) {
      proteinId <- proteinId.tmp
      proteinIdType <- 'UNIPROT'
    }

    if (is.na(proteinId.tmp) || nchar(proteinId.tmp) == 0) {
      proteinId.tmp <- gene.row$refseq_id
      if (!is.na(proteinId.tmp) && nchar(proteinId.tmp) > 1) {
        proteinId <- proteinId.tmp
        proteinIdType <- 'REFSEQ'
      }
    }

    bindingSequence <- gene.row$bindingSequence

    new.row <- list(providerName=native.name.raw,
                    providerId=full.uniprobe.id,
                    dataSource=dataSource,
                    geneSymbol=native.name,
                    geneId=geneId,
                    geneIdType=geneIdType,
                    proteinId=proteinId,
                    proteinIdType=proteinIdType,
                    organism=organism,
                    sequenceCount=NA,
                    bindingSequence=bindingSequence,
                    bindingDomain=bindingDomain,
                    tfFamily=tfFamily,
                    experimentType=experimentType,
                    pubmedID=pubmedID)

    rows[[m]] <- data.frame(new.row, stringsAsFactors=FALSE)
    assigned.names[m] <- native.name.uniq
  }

  tbl.md <- data.frame()
  if (length(rows) > 0) {
    tbl.md <- do.call(rbind, rows)
    rownames(tbl.md) <- assigned.names
  }

  invisible(tbl.md)

} # createMetadata
312
+#------------------------------------------------------------------------------------------------------------------------
313
# extractNativeNames: reduce each '<folder>/<file>' style name to its leading gene token
#   native.names.raw: character vector of names, for instance 'Cell08/Alx3_3418.2.txt'
# returns (invisibly) a character vector of the same length: for each name, the last
# '/'-separated component (when there is more than one), cut at its first '_' or '.'
# an empty input vector now gives an empty result; the original failed on it.
extractNativeNames <- function(native.names.raw) {
  extract.one <- function(native.name.raw) {
    tokens <- strsplit(native.name.raw, '/')[[1]]
    cooked.1 <- native.name.raw
    if (length(tokens) > 1) {
      cooked.1 <- tokens[length(tokens)]
    }
    strsplit(cooked.1, '[_\\.]')[[1]][1]
  }

  result <- vapply(native.names.raw, extract.one, character(1), USE.NAMES=FALSE)

  invisible(result)

} # extractNativeNames
333
+#------------------------------------------------------------------------------------------------------------------------
334
# uniprotToStandardID: translate uniprot ids into the 'standard' gene id of one organism
# 'standard' is usually entrez gene ID.  for yeast it is the systematic name.
#   organism:    'human', 'mouse', 'yeast' or 'worm' (repeated values are allowed, but
#                there must be only one distinct value)
#   uniprot.ids: character vector of uniprot ids
# returns a data.frame with columns uniprot (the ids as given) and std (the translated
# id, NA where there is no mapping), one row per input id.
#
# the four uniprot -> gene id lookup vectors are built from the org.*.db annotation
# packages on first use and cached as global variables.  the original tested for
# 'lst.yeast.uniprot', a name which is never assigned, so the yeast lookup was rebuilt
# on every call; the test now uses the name which is actually cached.
uniprotToStandardID <- function(organism, uniprot.ids) {
  if (!exists('yeast.uniprot')) {
    tbl.tmp <- toTable(org.Sc.sgdUNIPROT)
    yeast.uniprot <<- tbl.tmp$systematic_name
    names(yeast.uniprot) <<- tbl.tmp$uniprot_id
  }

  if (!exists('mouse.uniprot')) {
    tbl.tmp <- toTable(org.Mm.egUNIPROT)
    mouse.uniprot <<- tbl.tmp$gene_id
    names(mouse.uniprot) <<- tbl.tmp$uniprot_id
  }

  if (!exists('human.uniprot')) {
    tbl.tmp <- toTable(org.Hs.egUNIPROT)
    human.uniprot <<- tbl.tmp$gene_id
    names(human.uniprot) <<- tbl.tmp$uniprot_id
  }

  if (!exists('worm.uniprot')) {
    tbl.tmp <- toTable(org.Ce.egUNIPROT)
    worm.uniprot <<- tbl.tmp$gene_id
    names(worm.uniprot) <<- tbl.tmp$uniprot_id
  }

  organism <- unique(organism)
  stopifnot(length(organism) == 1)
  stopifnot(organism %in% c('human', 'mouse', 'yeast', 'worm'))

  ids <- switch(organism,
                human = human.uniprot[uniprot.ids],
                mouse = mouse.uniprot[uniprot.ids],
                yeast = yeast.uniprot[uniprot.ids],
                worm  = worm.uniprot[uniprot.ids])

    # a two-column data.frame pairs each input id with its translation
  data.frame(uniprot=uniprot.ids, std=as.character(ids), stringsAsFactors=FALSE)

} # uniprotToStandardID
390
+#------------------------------------------------------------------------------------------------------------------------
391
# createGeneRefTable: assemble the per-gene reference table from the uniprobe database
# see ~/v/snotes/log "* load uniprobe sql dump file (11 may 2012)" for info on the uniprobe mysql database
# used here.
#
# connects to the MySQL database 'uniprobe' (the connection is kept in the global
# variable 'db' and reused if a 'db' already exists), then merges the tables
# gene_ids_public, publication_ids and genomic_info with a hard-coded folder/pmid table.
# returns (invisibly) a data.frame with one row per distinct combination of the first
# two columns (NOTE(review): after the merges these appear to be name and folder_name --
# confirm), with 'species' in standardized form (e.g. "Hsapiens"), plus the added
# columns stdID and bindingSequence.
#
# differences from the original:
#   - bindingSequence is now filled in for every row whose gene name appears in the
#     DBDs table.  the original assigned by name, which in R only reaches the first
#     of several rows sharing a name, leaving the others NA.
#   - a stray 'invisible (tbl)' in the middle of the function, which had no effect,
#     has been removed
createGeneRefTable <- function() {
  if (!exists('db')) {
    db <<- dbConnect(MySQL(), dbname='uniprobe')
  }

  tbl.pubmed <- data.frame(folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'),
                  pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'),
                  stringsAsFactors=FALSE)
       # folder  count  author    pmid      organism           title
       # CR09    1      Scharer   19147588  human              Genome-wide promoter analysis of the SOX4 transcriptional
       #                                                       network in prostate cancer cells
       # Cell08  168    Berger    18585359  mouse              Variation in homeodomain DNA binding revealed by high-resolution
       #                                                       analysis of sequence preferences
       # Cell09  34     Grove     19632181  worm               A Multiparameter Network Reveals Extensive Divergence between
       #                                                       C. elegans bHLH Transcription Factors
       # EMBO10  25     Wei       20517297  mouse              Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo
       # GD09    1      Lesch     19204119  worm               Transcriptional regulation and stabilization of left right neuronal
       #                                                       identity in C. elegans
       # GR09    89     Zhu       19158363  yeast              High-resolution DNA-binding specificity analysis of yeast transcription factors
       # MMB08   1      Pompeani  18681939  Vibrio             The Vibrio harveyi master quorum-sensing regulator, LuxR...
       # NBT06   5      Berger    16998473  yeast;human;mouse  Compact, universal DNA microarrays...
       # PNAS08  3      De Silva  18541913  apicomplexa        Specific DNA-binding by Apicomplexan AP2 transcription factors
       # SCI09   104    Badis     19443739  mouse              Diversity and complexity in DNA recognition by transcription factors

  tbl.gene <- dbGetQuery(db, 'select * from gene_ids_public')
  colnames(tbl.gene) <- c('id', 'name', 'species', 'pub', 'type')
  tbl.genomic <- dbGetQuery(db, 'select * from genomic_info')[, -c(2,3,8:11)]  # remove the longish 'description' field
  tbl.pub <- dbGetQuery(db, 'select * from publication_ids')[, -3]  # remove 'full_ref' for legibility

  tbl <- merge(tbl.gene, tbl.pub[, c(1,4)], by.x='pub', by.y='publication_id', all.x=TRUE)
  tbl <- merge(tbl, tbl.pubmed, by.x='folder_name', by.y='folder', all.x=TRUE)
  tbl <- merge(tbl, tbl.genomic, all.x=TRUE, by.x='name', by.y='gene_name')

    # the last merge leaves two species columns; drop one, restore the plain name on the other
  redundant.column <- grep('species.y', colnames(tbl))
  stopifnot(length(redundant.column) == 1)
  tbl <- tbl[, -redundant.column]
  column.to.rename <- grep('species.x', colnames(tbl))
  stopifnot(length(column.to.rename) == 1)
  colnames(tbl)[column.to.rename] <- 'species'

   # now add 'standard IDs' -- orf names for yeast, entrez geneIDs for everything else
  stdID <- getAllStandardIDs(tbl)
  tbl <- cbind(tbl, stdID)
  duplicates <- which(duplicated(tbl[, c(1,2)]))
  if (length(duplicates) > 0) {
    tbl <- tbl[-duplicates,]
  }

    # fix the organism (species) name, from (e.g.) "Homo sapiens" to "Hsapiens"
  tbl$species <- standardizeSpeciesNames(tbl$species)

    # now add bindingSequence, from DBDs:  gene_name + seq, apparently the binding sequence when known, of 171 of 372 genes
    # match() gives, for every row of tbl, the first DBDs row with the same gene name (NA when there is none)
  tbl.dbds <- dbGetQuery(db, 'select * from DBDs')
  bindingSequence <- tbl.dbds$seq[match(tbl$name, tbl.dbds$gene_name)]
  tbl <- cbind(tbl, bindingSequence)

  invisible(tbl)

} # createGeneRefTable
456
+#------------------------------------------------------------------------------------------------------------------------
457
# getAllStandardIDs: make successive species-specific calls to 'uniprotToStandardID',
# assembling a new column to be added to tbl.geneRef
#   tbl.geneRef: data.frame with columns species and uniprot
# returns (invisibly) a character vector with one element per row of tbl.geneRef:
# the standard id for mouse, yeast, human and worm rows, NA for every other row
getAllStandardIDs <- function(tbl.geneRef) {
    # organism label understood by uniprotToStandardID -> pattern sought in the species column
  species.patterns <- c(mouse='Mus musculus',
                        yeast='Saccharomyces cerevisiae',
                        human='Homo sapiens',
                        worm='Caenorhabditis elegans')

  stdID <- rep(NA_character_, nrow(tbl.geneRef))

  for (organism in names(species.patterns)) {
    organism.rows <- grep(species.patterns[[organism]], tbl.geneRef$species)
    organism.uids <- tbl.geneRef$uniprot[organism.rows]

    tbl.ids <- uniprotToStandardID(organism, organism.uids)
    stopifnot(length(organism.rows) == nrow(tbl.ids))
    stdID[organism.rows] <- tbl.ids$std
  }

  invisible(stdID)

} # getAllStandardIDs
495
+#------------------------------------------------------------------------------------------------------------------------
496
# standardizeSpeciesNames: change, e.g., "Homo sapiens" to "Hsapiens"
#   names: character vector of two-word species names, the words separated by one space
# returns (invisibly) a character vector of the same length: first letter of the
# genus followed by the species.  any name which does not split into exactly two
# words is an error.
standardizeSpeciesNames <- function(names) {
  abbreviate.one <- function(full.name) {
    words <- strsplit(full.name, ' ')[[1]]
    stopifnot(length(words) == 2)
    paste0(substr(words[1], 1, 1), words[2])
  }

  fixed.names <- vapply(names, abbreviate.one, character(1), USE.NAMES=FALSE)
  invisible(fixed.names)

} # standardizeSpeciesNames
511
+#------------------------------------------------------------------------------------------------------------------------
512
# renameMatrices: give each matrix the name of the corresponding metadata row
#   matrices: list of matrices
#   tbl.md:   metadata table with exactly one row per matrix, in the same order
# returns (invisibly) the list, its names replaced by the rownames of tbl.md
renameMatrices <- function(matrices, tbl.md) {
  stopifnot(length(matrices) == nrow(tbl.md))
  invisible(setNames(matrices, rownames(tbl.md)))

} # renameMatrices
519
+#------------------------------------------------------------------------------------------------------------------------