Browse code

tristan's working versions

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@88754 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 10/04/2014 19:09:39
Showing2 changed files

... ...
@@ -1,180 +1,131 @@
1 1
 # jaspar2014/import.R
2
-# 
3
-#------------------------------------------------------------------------------------------------------------------------
2
+#-----------------------------------------------------------------------------------
4 3
 library (org.Hs.eg.db)
5 4
 library (org.Mm.eg.db)
6 5
 #------------------------------------------------------------------------------------------------------------------------
6
+options (stringsAsFactors=FALSE)
7 7
 printf <- function(...) print(noquote(sprintf(...)))
8 8
 #------------------------------------------------------------------------------------------------------------------------
9 9
 run = function (dataDir)
10 10
 {
11
-  dataDir <- file.path(dataDir, "jaspar")
12
-  tbl.rmat = readRawMatrices (dataDir)
13
-  matrices = convertRawMatricesToStandard (tbl.rmat)
14
-  tbl.anno = createAnnotationTable (dataDir)
15
-  tbl.md = createMetadataTable (tbl.anno, matrices)
16
-  matrices = renameMatrices (matrices, tbl.md)
17
-  matrices = normalizeMatrices (matrices)
18
-  serializedFile <- "jaspar.RData"
11
+  dataDir <- file.path(dataDir,"jaspar2014")
12
+  rawMatrixList <- readRawMatrices (dataDir)
13
+  matrices <- extractMatrices (rawMatrixList)
14
+  tbl.anno <- createAnnotationTable(dataDir)
15
+  tbl.md <- createMetadataTable (tbl.anno, matrices)
16
+  matrices <- normalizeMatrices (matrices)
17
+  matrices <- renameMatrices (matrices, tbl.md)
18
+  serializedFile <- file.path(dataDir, "jaspar2014.RData")
19 19
   save (matrices, tbl.md, file=serializedFile)
20 20
   printf("saved %d matrices to %s", length(matrices), serializedFile)
21
-  printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
21
+  printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
22 22
 
23 23
 } # run
24 24
 #------------------------------------------------------------------------------------------------------------------------
25
-
26 25
 readRawMatrices = function (dataDir)
27 26
 {
28
-  file <- file.path(dataDir, 'MATRIX_DATA.txt')
29
-  #tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric'))
30
-  #colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count')
31
-  tbl.matrices = read.table (file, sep='\n', header=FALSE )
27
+    # our convention is that there is a shared "dataDir" visible to
28
+    # the importer, and that within that directory there is one
29
+    # subdirectory for each data source.
30
+    # for this example importer, that directory will be <dataDir>/test
31
+    # within which we will look for one small file "sample.pcm"
32
+    
33
+
34
+  filename <- file.path(dataDir, "matrix_data.txt")
35
+  stopifnot(file.exists(filename))
32 36
   
33
-  invisible (tbl.matrices)
37
+  all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
38
+  title.lines = grep ('^>', all.lines)
39
+  title.line.count <<- length (title.lines)
40
+  max = title.line.count - 1
34 41
 
42
+  pwms = list ()
43
+  
44
+  for (i in 1:max) {
45
+    start.line = title.lines [i]
46
+    end.line = title.lines [i+1] - 1
47
+    new.pwm = parsePwm (all.lines [start.line:end.line])
48
+    pwms = c (pwms, list (new.pwm))
49
+    } # for i
50
+  invisible (pwms)
35 51
 } # readRawMatrices
36 52
 #------------------------------------------------------------------------------------------------------------------------
37
-convertRawMatricesToStandard = function (tbl.rmat)
53
+extractMatrices = function (pwm.list)
38 54
 {
39
-  #matrix.ids = unique (tbl.rmat$id)
40
-  #result =  vector ('list', length (matrix.ids))
41
-
42
-  #i = 1
43
-  #for (matrix.id in matrix.ids) {
44
-  #  tbl.sub = subset (tbl.rmat, id == matrix.id)
45
-  #    # sanity check this rather unusual representation of a position count matrix
46
-  #  base.count = as.data.frame (table (tbl.sub$base))
47
-  #  stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
48
-      # conservative length check.  actually expect sequence lengths of 6 - 20 bases
49
-  #  if  (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
50
-  #    printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
51
-  #    }
52
-  #  stopifnot (all (base.count$Freq == base.count$Freq [1]))
53
-  # nucleotide.counts = tbl.sub$count
54
-  #  row.names = c('A', 'C', 'G', 'T'); 
55
-  #  col.names = 1:(nrow (tbl.sub) / 4)
56
-  #  m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
57
-  #  result [[i]] = m
58
-  #  i = i + 1
59
-  #  } # for matrix.id
60
-
61
-  #names (result) = matrix.ids
62
-
63
-  result<- vector("list",length=593)
64
-  matrices.id <-grep('>',tbl.matrices[[1]],value=TRUE)
65
-  names (result) = matrices.id
66
-  #Get a list of the ID's and then assign them as names 
67
-  m.d <-grep('>',tbl.matrices[[1]],value=TRUE,invert=TRUE)
68
-  data.list=list()
69
-  x = 1
70
-  y = 1
71
-  while (x < length(m.d))
72
-    {   
73
-      matrices.position <- rbind(m.d[x],m.d[x+1],m.d[x+2],m.d[x+3])
74
-      matrices.formatted <- gsub("\t"," ",matrices.position)
75
-      #matrices.formatted2 <- as.numeric(strsplit(matrices.position, "\t"))
76
-      row.names = c('A', 'C', 'G', 'T')
77
-      col.names = 1:(ncol (matrices.formatted))
78
-      #matrices.matrix <- matrix (matrices.formatted, byrow=TRUE, nrow=4,dimnames=list(row.names,))
79
-      result[[y]] <- matrices.formatted
80
-      y=y+1
81
-      x=x+4
82
-    }
83
-  browser()
84
-  return (result)
85
-
86
-} # convertRawMatricesToStandard 
55
+  matrices = sapply (pwm.list, function (element) element$matrix)
56
+  matrix.names <- sapply (pwm.list, function (element) element$title)
57
+  matrix.names <- sub("^>", "", matrix.names)
58
+  names (matrices) <- matrix.names
59
+  
60
+  return (matrices)
61
+} # extractMatrices
87 62
 #------------------------------------------------------------------------------------------------------------------------
88
-# read 'mysql' tables provide by jaspar: 
89
-#          MATRIX.txt:  9229	CORE	MA0001	1	AGL3
90
-#  MATRIX_PROTEIN.txt: 9229	P29383
91
-#  MATRIX_SPECIES.txt: 9229	3702
92
-#  MATRIX_ANNOTATION.txt: 
93
-#     9229	class	Other Alpha-Helix
94
-#     9229	comment	-
95
-#     9229	family	MADS
96
-#     9229	medline	7632923
97
-#     9229	tax_group	plants
98
-#     9229	type	SELEX
99
-createAnnotationTable = function (dataDir)
63
+createAnnotationTable <- function(dataDir)
100 64
 {
101
-  file <- file.path(dataDir, "MATRIX.txt")
102
-  tbl.matrix =  read.table (file, sep='\t', header=F, as.is=TRUE)
103
-  colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
104
-
105
-  file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
106
-  tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE)
107
-  colnames (tbl.protein) =  c ('id', 'proteinID')
108
-
109
-  file <- file.path(dataDir, "MATRIX_SPECIES.txt")
110
-  tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE)
111
-  colnames (tbl.species) = c ('id', 'speciesID')
112
-
113
-  file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
114
-  tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="")
115
-  colnames (tbl.anno) = c ('id', 'attribute', 'value')
116
-
117
-  tbl.family  = subset (tbl.anno, attribute=='family') [, -2];   
118
-  colnames (tbl.family) = c ('id', 'family')
119
-
120
-  tbl.tax     = subset (tbl.anno, attribute=='tax_group') [,-2]; 
121
-  colnames (tbl.tax) = c ('id', 'tax')
122
-
123
-  tbl.class   = subset (tbl.anno, attribute=='class') [,-2];     
124
-  colnames (tbl.class) = c ('id', 'class')
125
-
126
-  tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];    
127
-  colnames (tbl.comment) = c ('id', 'comment')
128
-
129
-  tbl.pubmed  = subset (tbl.anno, attribute=='medline')[,-2];    
130
-  colnames (tbl.pubmed) = c ('id', 'pubmed')
131
-
132
-  tbl.type    = subset (tbl.anno, attribute=='type')[,-2];       
133
-  colnames (tbl.type) = c ('id', 'type')
134
-
135
-
136
-  tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
137
-  tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
138
-  tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
139
-  tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
140
-  tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
141
-  tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
142
-  tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
143
-
144
-  fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
145
-  tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
146
-
147
-  invisible (tbl.md)
148
-
65
+    file <- file.path(dataDir, "Matrix.txt")
66
+    tbl.matrix <-  read.table(file, sep='\t', header=FALSE, as.is=TRUE)
67
+    colnames(tbl.matrix) <- c('id', 'category', 'mID', 'version', 'binder')
68
+      
69
+    file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
70
+    stopifnot(file.exists(file))
71
+    tbl.protein <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
72
+    colnames(tbl.protein) <- c('id', 'proteinID')
73
+
74
+    file <- file.path(dataDir, "MATRIX_SPECIES.txt")
75
+    stopifnot(file.exists(file))
76
+    tbl.species <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
77
+    colnames(tbl.species) <- c('id', 'speciesID')
78
+    
79
+    file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
80
+    stopifnot(file.exists(file))
81
+    tbl.anno <- read.table(file, sep='\t', header=FALSE, as.is=TRUE, quote="")
82
+    colnames(tbl.anno) <- c('id', 'attribute', 'value')
83
+
84
+    file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
85
+    tbl.family  <- subset(tbl.anno, attribute=='family') [, -2];   
86
+    colnames(tbl.family) <- c('id', 'family')
87
+       # create 5 2-column data.frames out of tbl.anno
88
+       # which can all then be merged on the "id" column
89
+    tbl.tax <- subset(tbl.anno, attribute=='tax_group') [,-2]; 
90
+    colnames(tbl.tax) <- c('id', 'tax')
91
+
92
+    tbl.class <- subset(tbl.anno, attribute=='class') [,-2];     
93
+    colnames(tbl.class) <- c('id', 'class')
94
+
95
+    tbl.comment <- subset(tbl.anno, attribute=='comment')[,-2];    
96
+    colnames(tbl.comment) <- c('id', 'comment')
97
+
98
+    tbl.pubmed  <- subset(tbl.anno, attribute=='medline')[,-2];    
99
+    colnames(tbl.pubmed) <- c('id', 'pubmed')
100
+
101
+    tbl.type    <- subset(tbl.anno, attribute=='type')[,-2];       
102
+    colnames(tbl.type) <- c('id', 'type')
103
+
104
+    tbl.md <- merge(tbl.matrix, tbl.species, all.x=TRUE)
105
+    tbl.md <- merge(tbl.md, tbl.protein, all.x=TRUE)
106
+    tbl.md <- merge(tbl.md, tbl.family, all.x=TRUE)
107
+    tbl.md <- merge(tbl.md, tbl.tax, all.x=TRUE)
108
+    tbl.md <- merge(tbl.md, tbl.class, all.x=TRUE)
109
+    tbl.md <- merge(tbl.md, tbl.pubmed, all.x=TRUE)
110
+    tbl.md <- merge(tbl.md, tbl.type, all.x=TRUE)
111
+
112
+    fullID <- paste(tbl.md$mID, tbl.md$version, sep='.')
113
+    tbl.md <- cbind(fullID, tbl.md, stringsAsFactors=FALSE)
114
+
115
+    invisible(tbl.md)
116
+    
149 117
 } # createAnnotationTable
150
-#------------------------------------------------------------------------------------------------------------------------
151
-# assemble these columns:
152
-#                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
153
-#                      nativeNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
154
-#                      geneSymbols=character(),              # ABF2, Rhox11
155
-#                      sequenceCounts=integer(),             # often NA
156
-#                      organisms=character(),                # Scerevisiae, Mmusculus
157
-#                      bindingMolecules=character(),         # YMR072W, 194738
158
-#                      bindingMoleculeIdTypes=character(),   # SGD, entrez gene
159
-#                      bindingDomainTypes=character(),       # NA, Homeo
160
-#                      dataSources=character(),              # ScerTF, UniPROBE
161
-#                      experimentTypes=character(),          # NA, protein-binding microarray
162
-#                      pubmedIDs=character(),                # 19111667, 1858359
163
-#                      tfFamilies=character())               # NA, NA
164
-#
165
-# from these
166
-#
118
+#-------------------------------------------------------------------------------
167 119
 createMetadataTable = function (tbl.anno, matrices)
168 120
 {
169 121
   options (stringsAsFactors=FALSE)
170 122
   tbl.md = data.frame ()
171 123
   matrix.ids = names (matrices) 
172
-  dataSource = 'JASPAR_CORE'
124
+  dataSource = 'JASPAR_2014'
173 125
   
174 126
   for (matrix.id in matrix.ids) {
175 127
     matrix = matrices [[matrix.id]]
176
-    stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1)
177
-    tbl.sub = subset (tbl.anno, id==matrix.id)
128
+    tbl.sub = subset (tbl.anno, fullID==substr(matrix.id,0,8))
178 129
     if (nrow (tbl.sub) > 1) {  
179 130
         # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
180 131
         # some others, a sampling:  P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
... ...
@@ -208,19 +159,77 @@ createMetadataTable = function (tbl.anno, matrices)
208 159
     NA.string.indices = grep ('NA', tbl.md$proteinId)
209 160
     if (length (NA.string.indices) > 0)
210 161
       tbl.md$proteinId [NA.string.indices] = NA
211
-     
212 162
    invisible (tbl.md)
213
-
214
-} # createMetadataTable
163
+}#createMetadataTable
215 164
 #------------------------------------------------------------------------------------------------------------------------
216 165
 renameMatrices = function (matrices, tbl.md)
217 166
 {
218 167
   stopifnot (length (matrices) == nrow (tbl.md))
219 168
   names (matrices) = rownames (tbl.md)
220 169
   invisible (matrices)
221
-
170
+  
222 171
 } # renameMatrices
223 172
 #------------------------------------------------------------------------------------------------------------------------
173
+# an empirical and not altogether trustworthy solution to identifying identifier types.
174
+guessProteinIdentifierType = function (moleculeName)
175
+{
176
+  if (nchar (moleculeName) == 0)
177
+    return (NA_character_)
178
+  if (is.na (moleculeName))
179
+    return (NA_character_) 
180
+
181
+  first.char = substr (moleculeName, 1, 1)
182
+
183
+  if (first.char == 'Y')
184
+    return ('SGD')
185
+
186
+  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
187
+    return ('UNIPROT')
188
+
189
+  if (length (grep ('^NP_', moleculeName)) == 1)
190
+    return ('REFSEQ')
191
+
192
+   return (NA_character_)
193
+
194
+} # guessProteinIdentifierType
195
+#------------------------------------------------------------------------------------------------------------------------
196
+normalizeMatrices = function (matrices)
197
+{
198
+  mtx.normalized = sapply (matrices,
199
+      function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
200
+
201
+  invisible (mtx.normalized)
202
+
203
+} # normalizeMatrices
204
+#------------------------------------------------------------------------------------------------------------------------
205
+parsePwm = function (text)
206
+{
207
+   lines = strsplit (text, '\t')
208
+   #browser()
209
+   stopifnot(length(lines)==5) # title line, one line for each base
210
+   title = lines [[1]][1]
211
+   line.count = length(lines)
212
+
213
+     # expect 4 rows, and a number of columns we can discern from
214
+     # the incoming text.
215
+  cols <- length(lines[[2]])
216
+  result <- matrix (nrow=4, ncol=cols,
217
+                   dimnames=list(c('A','C','G','T'),
218
+                                 as.character(1:cols)))
219
+  row = 1
220
+  for(i in 2:line.count){
221
+    result [row,] = as.numeric (lines[[i]])
222
+    row = row + 1
223
+    } # for i
224
+
225
+  #result = t (result)
226
+    
227
+  #return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
228
+  return (list (title=title, matrix=result))
229
+
230
+} # parsePwm
231
+#----------------------------------------------------------------------------------------------------
232
+#------------------------------------------------------------------------------------------------------------------------
224 233
 # full names:   ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', 
225 234
 #                'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
226 235
 #                'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
... ...
@@ -248,52 +257,16 @@ convertTaxonCode = function (ncbi.code)
248 257
   index = which (tbl$code == ncbi.code)
249 258
   if (length (index) == 1)
250 259
     return (tbl$name [index])
260
+  if (nchar(ncbi.code)>6)
261
+    return ('Vertebrata')
251 262
   else {
252
-    write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
263
+    browser()
264
+    write (sprintf ("unable to map organism code |%s|", ncbi.code), stderr ())
253 265
     return (NA_character_)
254 266
     }
255 267
 
256 268
 } # convertTaxonCode
257
-#------------------------------------------------------------------------------------------------------------------------
258
-# an empirical and not altogether trustworthy solution to identifying identifier types.
259
-guessProteinIdentifierType = function (moleculeName)
260
-{
261
-  if (nchar (moleculeName) == 0)
262
-    return (NA_character_)
263
-  if (is.na (moleculeName))
264
-    return (NA_character_) 
265
-
266
-  first.char = substr (moleculeName, 1, 1)
267
-
268
-  if (first.char == 'Y')
269
-    return ('SGD')
270
-
271
-  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
272
-    return ('UNIPROT')
273
-
274
-  if (length (grep ('^EAW', moleculeName)) == 1)
275
-    return ('NCBI')
276
-
277
-  if (length (grep ('^EAX', moleculeName)) == 1)
278
-    return ('NCBI')
279
-
280
-  if (length (grep ('^NP_', moleculeName)) == 1)
281
-    return ('REFSEQ')
282
-
283
-  if (length (grep ('^BAD', moleculeName)) == 1)
284
-    return ('EMBL')
285
-
286
-   return (NA_character_)
287
-
288
-} # guessProteinIdentifierType
289
-#------------------------------------------------------------------------------------------------------------------------
290
-normalizeMatrices = function (matrices)
291
-{
292
-  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
293
-  invisible (mtx.normalized)
294
-
295
-} # normalizeMatrices
296
-#------------------------------------------------------------------------------------------------------------------------
269
+#----------------------------------------------------------------------------------------------------
297 270
 assignGeneId = function (proteinId)
298 271
 {
299 272
   if (!exists ('id.uniprot.human')) {
... ...
@@ -339,4 +312,4 @@ assignGeneId = function (proteinId)
339 312
   return (list (geneId=NA_character_, type=NA_character_))
340 313
 
341 314
 } # assignGeneId
342
-#------------------------------------------------------------------------------------------------------------------------
315
+#-----------------------------------------------------------------------------
... ...
@@ -1,61 +1,45 @@
1
-# jaspar/test.R
2
-# 
1
+# jaspar2014/test.R
3 2
 #------------------------------------------------------------------------------------------------------------------------
4 3
 library (RUnit)
5 4
 source("import.R")
6 5
 #------------------------------------------------------------------------------------------------------------------------
7 6
 run.tests = function (dataDir)
8 7
 {
9
-  dataDir <- file.path(dataDir, "jaspar")
8
+  dataDir <- file.path(dataDir, "jaspar2014")
10 9
   x.tbl.rmat <<- test.readRawMatrices (dataDir)
11
-  x.matrices <<- test.convertRawMatricesToStandard (x.tbl.rmat)
10
+  x.matrices <<- test.extractMatrices (x.tbl.rmat)
12 11
   x.tbl.anno <<- test.createAnnotationTable (dataDir)
13 12
   test.assignGeneId (dataDir)
14 13
   x.tbl.md <<- test.createMetadataTable (x.tbl.anno, x.matrices)
15 14
   x.matrices.renamed <<- test.renameMatrices (x.matrices, x.tbl.md, x.tbl.anno)
16 15
   x.matrices.normalized <<- test.normalizeMatrices (x.matrices.renamed)
17 16
 
17
+
18 18
 } # run.tests
19 19
 #------------------------------------------------------------------------------------------------------------------------
20 20
 test.readRawMatrices = function (dataDir)
21 21
 {
22 22
   print ('--- test.readMatrices')
23 23
   tbl.rmat = readRawMatrices (dataDir)
24
-  checkEquals (ncol (tbl.rmat), 4)
25
-  checkEquals (colnames (tbl.rmat), c ('id', 'base', 'pos', 'count'))
26
-  checkEquals (class (tbl.rmat$id),    'character')
27
-  checkEquals (class (tbl.rmat$base),  'character')
28
-  checkEquals (class (tbl.rmat$pos),   'numeric')
29
-  checkEquals (class (tbl.rmat$count), 'numeric')
30
-
31
-  checkTrue (nrow (tbl.rmat) > 18000)   # about 450 motifs, each represeted by 4 rows (ACGT) and about 10 positions
32
-  checkTrue (length (unique (tbl.rmat$id)) > 450)
24
+  browser()
25
+  checkTrue (length (tbl.rmat) == 592)
26
+  testMatrice <-tbl.rmat[[2]]
27
+  checkTrue(testMatrice[1]==">MA0006.1 Arnt::Ahr")
28
+    
33 29
   invisible (tbl.rmat)
34 30
 
35 31
 } # test.readRawMatrices
36 32
 #------------------------------------------------------------------------------------------------------------------------
37
-test.convertRawMatricesToStandard = function (tbl.rmat)
33
+test.extractMatrices = function (tbl.rmat)
38 34
 {
39
-  print ('--- test.convertRawMatricesToStandard')
40
-   # get just the first two raw matrices
41
-
42
-  first.two.ids = head (unique (tbl.rmat$id), n=2)
43
-  rows = nrow (subset (tbl.rmat, id %in% first.two.ids))
44
-  matrices = convertRawMatricesToStandard (tbl.rmat [1:rows,])
45
-  checkEquals (length (matrices), 2)
46
-  checkEquals (names (matrices), first.two.ids)
47
-
48
-    # it will not always be true, but IS true for the first two matrices, currently "9229" and  "9231", that there
49
-    # are an equal number of nucleotides at each position.
50
-  checkTrue (all (colSums (matrices [[1]]) == 97))
51
-  checkTrue (all (colSums (matrices [[2]]) == 185))
52
-
53
-    # now run all the matrices through
54
-  matrices = convertRawMatricesToStandard (tbl.rmat)
55
-  checkEquals (length (matrices), 459)
56
-  checkEquals (names (matrices)[1:2], first.two.ids)
57
-  
58
-  invisible (matrices)
35
+  print ('--- test.convertRawMatricesToStandard') 
36
+  matrices = extractMatrices (tbl.rmat)
37
+
38
+  checkTrue(length(matrices) == length (tbl.rmat))
39
+  checkTrue(typeof(names(matrices)) == "character")
40
+  checkTrue(sum(colSums(matrices[[1]])) ==120)
41
+  checkTrue(sum(colSums(matrices[[2]])) ==144)
42
+    invisible (matrices)
59 43
   
60 44
 } # test.convertRawMatricesToStandard 
61 45
 #------------------------------------------------------------------------------------------------------------------------
... ...
@@ -63,11 +47,10 @@ test.createAnnotationTable = function (dataDir)
63 47
 {
64 48
   print ('--- test.createAnnotationTable')
65 49
   tbl.anno = createAnnotationTable (dataDir)
66
-  checkEquals (dim (tbl.anno), c (513, 13))
67 50
   expected = c ("fullID", "id", "category", "mID", "version", "binder", "speciesID", "proteinID", "family", "tax", "class", "pubmed", "type")
68 51
   checkEquals (colnames (tbl.anno), expected)
69 52
 
70
-  checkEquals (head (tbl.anno$fullID),  c ("MA0001.1", "MA0003.1", "MA0004.1", "MA0005.1", "MA0006.1", "MA0006.1"))
53
+  checkEquals (head (tbl.anno$fullID),  c ("MA0001.1", "MA0002.1", "MA0003.1", "MA0004.1", "MA0005.1", "MA0006.1"))
71 54
   invisible (tbl.anno)
72 55
 
73 56
 } # test.createAnnotationTable
... ...
@@ -81,16 +64,16 @@ test.createMetadataTable = function (tbl.anno, matrices)
81 64
   checkEquals (colnames (tbl.md), c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", 
82 65
                                      "proteinId", "proteinIdType", "organism", "sequenceCount", "bindingSequence",
83 66
                                      "bindingDomain", "tfFamily", "experimentType", "pubmedID"))
84
-  checkEquals (tbl.md$proteinId, c ('P29383', 'P05549'))
67
+  checkEquals (tbl.md$proteinId, c ('P53762', 'P30561, P53762'))
85 68
   checkEquals (tbl.md$proteinIdType, c ('UNIPROT', 'UNIPROT'))
86 69
 
87 70
     # now use the whole table
88 71
   tbl.md = createMetadataTable (tbl.anno, matrices)
89 72
   checkEquals (dim (tbl.md), c (length (matrices), 15))
90 73
     # test for proper conversion of speciesID = NA or '-' to Vertebrata
74
+
91 75
   checkEquals (which (is.na (tbl.md$organism)), integer (0))
92 76
   checkEquals (grep ('-', tbl.md$organism), integer (0))
93
-
94 77
     # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 had 'NA' for proteinID, not <NA>. fixed?
95 78
   checkEquals (grep ('NA', tbl.md$proteinId), integer (0))
96 79
   invisible (tbl.md)
... ...
@@ -105,17 +88,16 @@ test.renameMatrices = function (matrices, tbl.md, tbl.anno)
105 88
   checkEquals (dim (tbl.md), c (2, 15))
106 89
   old.matrix.names = names (matrix.pair)
107 90
   matrices.renamed = renameMatrices (matrix.pair, tbl.md)
108
-
109 91
     # test:  the old name is an id, '9229'.  find, in tbl.anno, the fullID, 'MA0001.1'.  then make sure 'MA000.1' is
110 92
     # in the new name of that same matrix
111 93
 
112 94
   for (i in 1:length (matrix.pair)) {
113
-    fullID = subset (x.tbl.anno, id==old.matrix.names [i])$fullID
95
+    fullID = subset(x.tbl.anno, fullID== substring(old.matrix.names[i],0,8))$fullID
114 96
     checkTrue (length (grep (fullID, names (matrices.renamed) [i])) == 1)
115 97
     } # for i
116 98
 
117 99
     # now try it for the whole set, with selective focused tests
118
-
100
+ 
119 101
   tbl.md = createMetadataTable (tbl.anno, matrices)
120 102
   checkEquals (nrow (tbl.md), length (matrices))
121 103
   old.matrix.names = names (matrices)
... ...
@@ -182,13 +164,7 @@ test.assignGeneId = function (dataDir, proteinId)
182 164
 
183 165
   checkEquals (assignGeneId ('YKL112W'),     list (geneId='YKL112W', type='SGD'))
184 166
 
185
-    # see how successful this is over all 513 proteinIds
186
-
187
-  tbl.anno = createAnnotationTable (dataDir)
188
-  mtx.geneId = as.data.frame (t (sapply (tbl.anno$proteinID, assignGeneId)))
189
-  tbl.types = as.data.frame (table (as.character (mtx.geneId$type), useNA='always'), stringsAsFactors=FALSE)
190
-  checkEquals (tbl.types$Var1, c ("ENTREZ", "SGD", NA))
191
-  checkEquals (tbl.types$Freq, c (142, 177, 194))
167
+ 
192 168
 
193 169
 } # test.assignGeneId
194 170
 #------------------------------------------------------------------------------------------------------------------------