Browse code

first version

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75345 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 13:35:38
Showing2 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,349 @@
1
+# stamlab/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library (org.Hs.eg.db)
4
+library (org.Mm.eg.db)
5
+#------------------------------------------------------------------------------------------------------------------------
6
+printf <- function(...) print(noquote(sprintf(...)))
7
+#------------------------------------------------------------------------------------------------------------------------
8
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab"
9
+kDataDir <- "~/s/data/public/TFBS/stam"
10
+#------------------------------------------------------------------------------------------------------------------------
11
+run = function (dataDir=kDataDir)
12
+{
13
+  rawMatrixList <- readRawMatrices (dataDir)
14
+  novels <- readNovelStatus (dataDir)
15
+  matrices <- extractAndNormalizeMatrices (rawMatrixList)
16
+  tbl.md <- createMetadataTable (matrices, novels)
17
+  matrices <- renameMatrices (matrices, tbl.md)
18
+
19
+  serializedFile <- "stamlab.RData"
20
+  save (matrices, tbl.md, file=serializedFile)
21
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
22
+  printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
23
+
24
+} # run
25
+#------------------------------------------------------------------------------------------------------------------------
26
+readRawMatrices = function (dataDir)
27
+{
28
+  filename <- file.path(dataDir, "de.novo.pwm")
29
+  all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
30
+  title.lines = grep ('UW.Motif', all.lines)
31
+  title.line.count <<- length (title.lines)
32
+  max = title.line.count - 1
33
+
34
+  pwms = list ()
35
+  
36
+  for (i in 1:max) {
37
+    start.line = title.lines [i]
38
+    end.line = title.lines [i+1] - 1
39
+    new.pwm = parsePwm (all.lines [start.line:end.line])
40
+    pwms = c (pwms, list (new.pwm))
41
+    } # for i
42
+
43
+  
44
+  invisible (pwms)
45
+
46
+} # readRawMatrices
47
+#------------------------------------------------------------------------------------------------------------------------
48
+readNovelStatus = function (dataDir)
49
+{
50
+  filename <- file.path(dataDir, "novels.txt")
51
+  novels = scan (filename, what=character(0), sep='\n', quiet=TRUE)
52
+  all.names = sprintf ('UW.Motif.%04d', 1:683)
53
+  status = rep (FALSE, length (all.names))
54
+  true.novels = match (novels, all.names)
55
+  status [true.novels] = TRUE
56
+  names (status) = all.names
57
+  return (status)
58
+  
59
+} # readNovelStatus
60
+#------------------------------------------------------------------------------------------------------------------------
61
+extractAndNormalizeMatrices = function (pwm.list)
62
+{
63
+  ms = sapply (pwm.list, function (element) element$matrix)
64
+  nms = normalizeMatrices (ms)
65
+  names (nms) = sapply (pwm.list, function (element) element$title)
66
+  return (nms)
67
+
68
+} # extractAndNormalizeMatrices
69
+#------------------------------------------------------------------------------------------------------------------------
70
+#  matrices = sapply (list.pwms, function (pwm) pwm$matrix)
71
+#  matrix.names = sapply (list.pwms, function (pwm) pwm$title)
72
+#  names (matrices) = matrix.names
73
+convertRawMatricesToStandard = function (tbl.rmat)
74
+{
75
+  matrix.ids = unique (tbl.rmat$id)
76
+  result =  vector ('list', length (matrix.ids))
77
+
78
+  i = 1
79
+  for (matrix.id in matrix.ids) {
80
+    tbl.sub = subset (tbl.rmat, id == matrix.id)
81
+      # sanity check this rather unusual representation of a position count matrix
82
+    base.count = as.data.frame (table (tbl.sub$base))
83
+    stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
84
+      # conservative length check.  actually expect sequence lengths of 6 - 20 bases
85
+    if  (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
86
+      printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
87
+      }
88
+    stopifnot (all (base.count$Freq == base.count$Freq [1]))
89
+    nucleotide.counts = tbl.sub$count
90
+    row.names = c('A', 'C', 'G', 'T'); 
91
+    col.names = 1:(nrow (tbl.sub) / 4)
92
+    m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
93
+    result [[i]] = m
94
+    i = i + 1
95
+    } # for matrix.id
96
+
97
+  names (result) = matrix.ids
98
+  return (result)
99
+
100
+} # convertRawMatricesToStandard 
101
+#------------------------------------------------------------------------------------------------------------------------
102
+createAnnotationTable = function ()
103
+{
104
+  tbl.matrix =  read.table ('MATRIX.txt', sep='\t', header=F, as.is=TRUE)
105
+  colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
106
+
107
+  tbl.protein = read.table ('MATRIX_PROTEIN.txt', sep='\t', header=F, as.is=TRUE)
108
+  colnames (tbl.protein) =  c ('id', 'proteinID')
109
+
110
+  tbl.species = read.table ('MATRIX_SPECIES.txt', sep='\t', header=F, as.is=TRUE)
111
+  colnames (tbl.species) = c ('id', 'speciesID')
112
+
113
+  tbl.anno = read.table ('MATRIX_ANNOTATION.txt', sep='\t', header=F, as.is=TRUE, quote="")
114
+  colnames (tbl.anno) = c ('id', 'attribute', 'value')
115
+
116
+  tbl.family  = subset (tbl.anno, attribute=='family') [, -2];   
117
+  colnames (tbl.family) = c ('id', 'family')
118
+
119
+  tbl.tax     = subset (tbl.anno, attribute=='tax_group') [,-2]; 
120
+  colnames (tbl.tax) = c ('id', 'tax')
121
+
122
+  tbl.class   = subset (tbl.anno, attribute=='class') [,-2];     
123
+  colnames (tbl.class) = c ('id', 'class')
124
+
125
+  tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];    
126
+  colnames (tbl.comment) = c ('id', 'comment')
127
+
128
+  tbl.pubmed  = subset (tbl.anno, attribute=='medline')[,-2];    
129
+  colnames (tbl.pubmed) = c ('id', 'pubmed')
130
+
131
+  tbl.type    = subset (tbl.anno, attribute=='type')[,-2];       
132
+  colnames (tbl.type) = c ('id', 'type')
133
+
134
+
135
+  tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
136
+  tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
137
+  tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
138
+  tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
139
+  tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
140
+  tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
141
+  tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
142
+
143
+  fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
144
+  tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
145
+
146
+  invisible (tbl.md)
147
+
148
+} # createAnnotationTable
149
+#------------------------------------------------------------------------------------------------------------------------
150
+# assemble these columns:
151
+#                      names=character(),                    # species-source-gene: stamlab-Hsapiens-UW.Motif.0001
152
+#                      nativeNames=character(),              # UW.Motif.0001
153
+#                      geneSymbols=character(),              # NA
154
+#                      sequenceCounts=integer(),             # NA
155
+#                      organisms=character(),                # Hsapiens
156
+#                      bindingMolecules=character(),         # NA
157
+#                      bindingMoleculeIdTypes=character(),   # NA
158
+#                      bindingDomainTypes=character(),       # NA
159
+#                      dataSources=character(),              # stamlab
160
+#                      experimentTypes=character(),          # digital genomic footprinting
161
+#                      pubmedIDs=character(),                # 22959076
162
+#                      tfFamilies=character())               # NA
163
+#
164
+# from these
165
+#
166
+createMetadataTable = function (matrices, novels)
167
+{
168
+  options (stringsAsFactors=FALSE)
169
+  tbl.md = data.frame ()
170
+  matrix.ids = names (matrices) 
171
+  
172
+  for (matrix.id in matrix.ids) {
173
+    matrix = matrices [[matrix.id]]
174
+    taxon.code = 'Hsapiens'
175
+    geneId.info = NA
176
+    new.row = list (providerName=matrix.id,
177
+                    providerId=matrix.id,
178
+                    dataSource='stamlab',
179
+                    geneSymbol=NA,
180
+                    geneId=NA,
181
+                    geneIdType=NA,
182
+                    proteinId=NA,
183
+                    proteinIdType=NA,
184
+                    organism='Hsapiens',
185
+                    sequenceCount=NA,
186
+                    bindingSequence=NA_character_,
187
+                    bindingDomain=NA,
188
+                    tfFamily=NA,
189
+                    experimentType='digital genomic footprinting',
190
+                    pubmedID='22959076')
191
+    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
192
+    full.name = sprintf ('%s-%s-%s', 'Hsapiens', 'stamlab', matrix.id)
193
+    rownames (tbl.md) [nrow (tbl.md)] = full.name
194
+    } # for i
195
+
196
+  novelPFM = rep ('knownMotif', nrow (tbl.md))
197
+  novels.ordered = novels [tbl.md$providerName]  # make sure we follow the order in the tbl
198
+  novelPFM [which (novels.ordered)] = 'novelMotif'
199
+  tbl.md$geneId = novelPFM
200
+  tbl.md$geneIdType = rep ('comment', nrow (tbl.md))
201
+
202
+  invisible (tbl.md)
203
+
204
+} # createMetadataTable
205
+#------------------------------------------------------------------------------------------------------------------------
206
+renameMatrices = function (matrices, tbl.md)
207
+{
208
+  stopifnot (length (matrices) == nrow (tbl.md))
209
+  names (matrices) = rownames (tbl.md)
210
+  invisible (matrices)
211
+
212
+} # renameMatrices
213
+#------------------------------------------------------------------------------------------------------------------------
214
+convertTaxonCode = function (ncbi.code)
215
+{
216
+  #if (is.na (ncbi.code))  
217
+  #  return (NA_character_)
218
+  ncbi.code = as.character (ncbi.code)
219
+  if (ncbi.code %in% c ('-', NA_character_, 'NA'))
220
+    return ('Vertebrata')
221
+
222
+  tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
223
+                            '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
224
+                            '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
225
+                    name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', 
226
+                            'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
227
+                            'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
228
+                            'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', 
229
+                            'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
230
+                    stringsAsFactors=FALSE)
231
+
232
+  ncbi.code = as.character (ncbi.code)
233
+  index = which (tbl$code == ncbi.code)
234
+  if (length (index) == 1)
235
+    return (tbl$name [index])
236
+  else {
237
+    write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
238
+    return (NA_character_)
239
+    }
240
+
241
+} # convertTaxonCode
242
+#------------------------------------------------------------------------------------------------------------------------
243
+# an empirical and not altogether trustworthy solution to identifying identifier types.
244
+guessProteinIdentifierType = function (moleculeName)
245
+{
246
+  if (nchar (moleculeName) == 0)
247
+    return (NA_character_)
248
+  if (is.na (moleculeName))
249
+    return (NA_character_) 
250
+
251
+  first.char = substr (moleculeName, 1, 1)
252
+
253
+  if (first.char == 'Y')
254
+    return ('SGD')
255
+
256
+  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
257
+    return ('UNIPROT')
258
+
259
+  if (length (grep ('^EAW', moleculeName)) == 1)
260
+    return ('NCBI')
261
+
262
+  if (length (grep ('^EAX', moleculeName)) == 1)
263
+    return ('NCBI')
264
+
265
+  if (length (grep ('^NP_', moleculeName)) == 1)
266
+    return ('REFSEQ')
267
+
268
+  if (length (grep ('^BAD', moleculeName)) == 1)
269
+    return ('EMBL')
270
+
271
+   return (NA_character_)
272
+
273
+} # guessProteinIdentifierType
274
+#------------------------------------------------------------------------------------------------------------------------
275
+normalizeMatrices = function (matrices)
276
+{
277
+  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
278
+  invisible (mtx.normalized)
279
+
280
+} # normalizeMatrices
281
+#------------------------------------------------------------------------------------------------------------------------
282
+assignGeneId = function (proteinId)
283
+{
284
+  if (!exists ('id.uniprot.human')) {
285
+
286
+    tbl = toTable (org.Hs.egUNIPROT)
287
+    id.uniprot.human <<- as.character (tbl$gene_id)
288
+    names (id.uniprot.human) <<- tbl$uniprot_id
289
+
290
+    tbl = toTable (org.Hs.egREFSEQ)
291
+    tbl = tbl [grep ('^NP_', tbl$accession),]
292
+    id.refseq.human <<- as.character (tbl$gene_id)
293
+    names (id.refseq.human) <<- tbl$accession
294
+
295
+    tbl = toTable (org.Mm.egUNIPROT)
296
+    id.uniprot.mouse <<- as.character (tbl$gene_id)
297
+    names (id.uniprot.mouse) <<- tbl$uniprot_id
298
+
299
+    tbl = toTable (org.Mm.egREFSEQ)
300
+    tbl = tbl [grep ('^NP_', tbl$accession),]
301
+    id.refseq.mouse <<- as.character (tbl$gene_id)
302
+    names (id.refseq.mouse) <<- tbl$accession
303
+    }
304
+
305
+  proteinId = strsplit (proteinId, '\\.')[[1]][1]   # remove any trailing '.*'
306
+
307
+  if (proteinId %in% names (id.uniprot.human))
308
+    return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
309
+
310
+  if (proteinId %in% names (id.uniprot.mouse))
311
+    return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
312
+
313
+  if (proteinId %in% names (id.refseq.human))
314
+    return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
315
+
316
+  if (proteinId %in% names (id.refseq.mouse))
317
+    return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
318
+
319
+  found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
320
+
321
+  if (found.leading.Y == 1)
322
+    return (list (geneId=proteinId, type='SGD'))
323
+
324
+  return (list (geneId=NA_character_, type=NA_character_))
325
+
326
+} # assignGeneId
327
+#------------------------------------------------------------------------------------------------------------------------
328
+parsePwm = function (text)
329
+{
330
+  #printf ('parsing pwm %s', text [1])
331
+  lines = strsplit (text, '\t')
332
+  title = lines [[1]][1]
333
+  consensus.sequence = lines [[1]][2]
334
+  line.count = length (lines)
335
+  #printf ('%s: %s', title, consensus.sequence)
336
+
337
+  result = matrix (nrow=line.count-1, ncol=4, dimnames=list(1:(line.count-1), c ('A','C','G','T')))  
338
+  row = 1
339
+  for (line in lines [2:line.count]) {
340
+    result [row,] = as.numeric (line)
341
+    row = row + 1
342
+    } # for line
343
+
344
+  result = t (result)
345
+    
346
+  return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
347
+
348
+} # parsePwm
349
+#------------------------------------------------------------------------------------------------------------------------
0 350
new file mode 100644
... ...
@@ -0,0 +1,210 @@
1
+# stamlab/test.R
2
+# notes:  3 matrices come w/o speciesID, tax = 'vertebrates'.  not our problem to fix, at least not yet.
3
+#         TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2)
4
+#------------------------------------------------------------------------------------------------------------------------
5
+library (RUnit)
6
+#------------------------------------------------------------------------------------------------------------------------
7
+source("import.R")
8
+#------------------------------------------------------------------------------------------------------------------------
9
+run.tests = function (dataDir=kDataDir)
10
+{
11
+  x.rawMatrixList <<- test.readRawMatrices (dataDir)
12
+  x.novels <<- test.readNovelStatus (dataDir)
13
+  x.matrices <<- test.extractAndNormalizeMatrices (x.rawMatrixList)
14
+  x.tbl.md <<- test.createMetadataTable (x.matrices, x.novels)
15
+  x.matrices.renamed <<- test.renameMatrices (x.matrices, x.tbl.md)
16
+
17
+} # run.tests
18
+#------------------------------------------------------------------------------------------------------------------------
19
+test.readRawMatrices = function (dataDir)
20
+{
21
+  print ('--- test.readMatrices')
22
+  list.pwms = readRawMatrices (dataDir)
23
+  checkEquals (length (list.pwms), 683)
24
+  checkEquals (names (list.pwms [[1]]), c ("title", "consensus.sequence", "matrix"))
25
+  checkEquals (rownames (list.pwms[[1]]$matrix),  c ("A", "C", "G", "T"))
26
+  invisible (list.pwms)
27
+
28
+} # test.readRawMatrices
29
+#------------------------------------------------------------------------------------------------------------------------
30
+test.readNovelStatus = function (dataDir)
31
+{
32
+  print ('--- test.readNovelStatus')
33
+  novel.status = readNovelStatus (dataDir)
34
+  checkEquals (length (novel.status), 683)
35
+  checkEquals (length (which (novel.status == TRUE)), 289)
36
+    # do a spot check around first novel in novels.txt
37
+  checkEquals (as.logical (novel.status [c ('UW.Motif.0010', 'UW.Motif.0011', 'UW.Motif.0012')]),
38
+                             c (FALSE, FALSE, TRUE))
39
+  invisible (novel.status)
40
+
41
+} # readNovelStatus
42
+#------------------------------------------------------------------------------------------------------------------------
43
+test.extractAndNormalizeMatrices = function (x.rawMatrixList)
44
+{
45
+  print ('--- test.extractAndNormalizeMatrices')
46
+  matrices.fixed <<- extractAndNormalizeMatrices (x.rawMatrixList)
47
+    # make sure a UW.Motif.0nnn name accompanies each matrix
48
+  checkEquals (length (grep ('UW.Motif.0', names (matrices.fixed))), length (matrices.fixed))
49
+    # make sure all columns in all matrices sum to 1.0
50
+  checkTrue (all (sapply (matrices.fixed, function (m) all (abs (colSums (m) - 1.0) < 1e-10))))
51
+  invisible (matrices.fixed)
52
+
53
+} # test.extractAndNormalizeMatrices
54
+#------------------------------------------------------------------------------------------------------------------------
55
+test.convertRawMatricesToStandard = function (tbl.rmat)
56
+{
57
+  print ('--- test.convertRawMatricesToStandard')
58
+   # get just the first two raw matrices
59
+
60
+  first.two.ids = head (unique (tbl.rmat$id), n=2)
61
+  rows = nrow (subset (tbl.rmat, id %in% first.two.ids))
62
+  matrices = convertRawMatricesToStandard (tbl.rmat [1:rows,])
63
+  checkEquals (length (matrices), 2)
64
+  checkEquals (names (matrices), first.two.ids)
65
+
66
+    # it will not always be true, but IS true for the first two matrices, currently "9229" and  "9231", that there
67
+    # are an equal number of nucleotides at each position.
68
+  checkTrue (all (colSums (matrices [[1]]) == 97))
69
+  checkTrue (all (colSums (matrices [[2]]) == 185))
70
+
71
+    # now run all the matrices through
72
+  matrices = convertRawMatricesToStandard (tbl.rmat)
73
+  checkEquals (length (matrices), 459)
74
+  checkEquals (names (matrices)[1:2], first.two.ids)
75
+  
76
+  invisible (matrices)
77
+  
78
+} # test.convertRawMatricesToStandard 
79
+#------------------------------------------------------------------------------------------------------------------------
80
+test.createAnnotationTable = function ()
81
+{
82
+  print ('--- test.createAnnotationTable')
83
+  tbl.anno = createAnnotationTable ()
84
+  checkEquals (dim (tbl.anno), c (513, 13))
85
+  expected = c ("fullID", "id", "category", "mID", "version", "binder", "speciesID", "proteinID", "family", "tax", "class", "pubmed", "type")
86
+  checkEquals (colnames (tbl.anno), expected)
87
+
88
+  checkEquals (head (tbl.anno$fullID),  c ("MA0001.1", "MA0003.1", "MA0004.1", "MA0005.1", "MA0006.1", "MA0006.1"))
89
+  invisible (tbl.anno)
90
+
91
+} # test.createAnnotationTable
92
+#------------------------------------------------------------------------------------------------------------------------
93
+test.createMetadataTable = function (x.matrices, x.novels)
94
+{
95
+  print ('--- test.createMetadataTable')
96
+   # try it first with just two matrices
97
+  tbl.md = createMetadataTable (x.matrices [1:12], x.novels [1:12])
98
+  checkEquals (dim (tbl.md), c (12, 15))
99
+  checkEquals (colnames (tbl.md), c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", 
100
+                                     "proteinId", "proteinIdType", "organism", "sequenceCount", "bindingSequence",
101
+                                     "bindingDomain", "tfFamily", "experimentType", "pubmedID"))
102
+   checkEquals (tbl.md$providerName [1:2], c ('UW.Motif.0001', 'UW.Motif.0002'))
103
+   checkEquals (tbl.md$providerId [1:2], c ('UW.Motif.0001', 'UW.Motif.0002'))
104
+   checkEquals (tbl.md$pubmedID [1:2], c ('22959076', '22959076'))
105
+   checkEquals (tbl.md$dataSource [1:2], c ('stamlab', 'stamlab'))
106
+   checkEquals (tbl.md$organism [1:2], c ('Hsapiens', 'Hsapiens'))
107
+   checkEquals (tbl.md$experimentType [1:2], c ('digital genomic footprinting', 'digital genomic footprinting'))
108
+   checkEquals (tbl.md$geneId, c (rep ('knownMotif', 11), 'novelMotif'))
109
+
110
+  invisible (tbl.md)
111
+
112
+} # test.createMetadataTable
113
+#------------------------------------------------------------------------------------------------------------------------
114
+test.renameMatrices = function (matrices, tbl.md)
115
+{
116
+  print("--- test.renameMatrices")
117
+  
118
+    # try it with just the first two matrices
119
+  matrix.pair = matrices [1:2]
120
+  tbl.pair = tbl.md [1:2,]
121
+  matrix.pair.renamed = renameMatrices (matrix.pair, tbl.pair)
122
+  checkEquals (names (matrix.pair.renamed), c ("Hsapiens-stamlab-UW.Motif.0001", "Hsapiens-stamlab-UW.Motif.0002"))
123
+
124
+} # test.renameMatrices
125
+#------------------------------------------------------------------------------------------------------------------------
126
+test.convertTaxonCode = function ()
127
+{
128
+  print ('--- test.convertTaxonCode')
129
+
130
+  checkEquals (convertTaxonCode ('9606'), 'Hsapiens')
131
+  checkEquals (convertTaxonCode (9606), 'Hsapiens')
132
+     # anomalous codes, which an examination of the jaspar website reveals as 'vertebrates'
133
+  checkEquals (convertTaxonCode (NA), 'Vertebrata')
134
+  checkEquals (convertTaxonCode ('NA'), 'Vertebrata')
135
+  checkEquals (convertTaxonCode (NA_character_), 'Vertebrata')
136
+  checkEquals (convertTaxonCode ('-'), 'Vertebrata')
137
+
138
+} # test.convertTaxonCode
139
+#------------------------------------------------------------------------------------------------------------------------
140
+test.guessProteinIdentifierType = function (moleculeName)
141
+{
142
+  print ('--- test.guessProteinIdentifierType')
143
+  checkEquals (guessProteinIdentifierType ('P29383'), 'UNIPROT')
144
+
145
+  all.types = sapply (x.tbl.anno$proteinID, guessProteinIdentifierType)
146
+  checkTrue (length (which (is.na (all.types))) < 12)   # got most of them.
147
+
148
+} # test.guessProteinIdentifierType
149
+#------------------------------------------------------------------------------------------------------------------------
150
+test.normalizeMatrices = function (matrices)
151
+{
152
+  print ('--- test.normalizeMatrices')
153
+
154
+  colsums = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
155
+  #checkTrue (all (colsums > 1))
156
+
157
+  matrices.norm = normalizeMatrices (matrices)
158
+
159
+  colsums = as.integer (sapply (matrices.norm, function (mtx) as.integer (mean (round (colSums (mtx))))))
160
+  checkTrue (all (colsums == 1))
161
+
162
+  invisible (matrices.norm)
163
+
164
+} # test.normalizeMatrices
165
+#------------------------------------------------------------------------------------------------------------------------
166
+test.assignGeneId = function (proteinId)
167
+{
168
+  print ('--- test.assignGeneId')
169
+  uniprot.ids = c ('Q9GRA5', 'P31314', 'AAC18941', 'O49397')
170
+  refseq.ids  = c ('NP_995315.1', 'NP_032840', 'NP_599022')
171
+  yeast.ids   = c ('YKL112W', 'YMR072W', 'YLR131C')
172
+
173
+  checkEquals (assignGeneId ('NP_995315.1'), list (geneId='4782', type='ENTREZ'))
174
+  checkEquals (assignGeneId ('NP_599022'),   list (geneId='6095', type='ENTREZ'))
175
+
176
+  checkEquals (assignGeneId ('P31314'),      list (geneId='3195', type='ENTREZ'))
177
+
178
+  checkEquals (assignGeneId ('YKL112W'),     list (geneId='YKL112W', type='SGD'))
179
+
180
+    # see how successful this is over all 513 proteinIds
181
+
182
+  tbl.anno = createAnnotationTable ()
183
+  mtx.geneId = as.data.frame (t (sapply (tbl.anno$proteinID, assignGeneId)))
184
+  tbl.types = as.data.frame (table (as.character (mtx.geneId$type), useNA='always'), stringsAsFactors=FALSE)
185
+  checkEquals (tbl.types$Var1, c ("ENTREZ", "SGD", NA))
186
+  checkEquals (tbl.types$Freq, c (141, 177, 195))
187
+
188
+} # test.assignGeneId
189
+#------------------------------------------------------------------------------------------------------------------------
190
+test.parsePwm = function ()
191
+{
192
+  print ('--- test.parsePwm')
193
+  lines = c ('UW.Motif.0006	aggaaatg',
194
+             '0.890585	0.007855	0.051323	0.050237',
195
+             '0.060732	0.004506	0.894170	0.040593',
196
+             '0.072765	0.037935	0.860704	0.028596',
197
+             '0.929585	0.024037	0.034914	0.011464',
198
+             '0.931220	0.023231	0.029078	0.016471',
199
+             '0.857934	0.044211	0.072594	0.025261',
200
+             '0.065840	0.013777	0.058013	0.862370',
201
+             '0.049937	0.036238	0.861871	0.051953')
202
+  m6 = parsePwm (lines)
203
+  checkEquals (names (m6), c ("title", "consensus.sequence", "matrix"))
204
+  pwm = m6$matrix
205
+  checkEquals (dim (pwm), c (4, 8))
206
+  checkEquals (rownames (pwm), c ('A', 'C', 'G', 'T'))
207
+  invisible (m6)
208
+
209
+} # test.parsePwm
210
+#------------------------------------------------------------------------------------------------------------------------