git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75325 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,340 @@ |
1 |
+# jaspar/import.R |
|
2 |
+# notes: 3 matrices come w/o speciesID, tax = 'vertebrates'. not our problem to fix, at least not yet. |
|
3 |
+# TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2) |
|
4 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
5 |
+library (org.Hs.eg.db) |
|
6 |
+library (org.Mm.eg.db) |
|
7 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
8 |
+printf <- function(...) print(noquote(sprintf(...))) |
|
9 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
10 |
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/jaspar" |
|
11 |
+kDataDir <- "~/s/data/public/TFBS/jaspar/sql_tables" |
|
12 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
13 |
+run = function (dataDir) |
|
14 |
+{ |
|
15 |
+ tbl.rmat = readRawMatrices (dataDir) |
|
16 |
+ matrices = convertRawMatricesToStandard (tbl.rmat) |
|
17 |
+ tbl.anno = createAnnotationTable (dataDir) |
|
18 |
+ tbl.md = createMetadataTable (tbl.anno, matrices) |
|
19 |
+ matrices = renameMatrices (matrices, tbl.md) |
|
20 |
+ matrices = normalizeMatrices (matrices) |
|
21 |
+ serializedFile <- "jaspar.RData" |
|
22 |
+ save (matrices, tbl.md, file=serializedFile) |
|
23 |
+ printf("saved %d matrices to %s", length(matrices), serializedFile) |
|
24 |
+ printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
25 |
+ |
|
26 |
+} # run |
|
27 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
28 |
+# jaspar's sql rendering of their matrices is unusual. see below. translate them to a 4-row, n-column form in |
|
29 |
+# convertRawMatricesToStandard, below. |
|
30 |
+# the id can be mapped to other names and information via simple subset lookup in tbl.anno |
|
31 |
+# id base pos count |
|
32 |
+# 1 9229 A 1 0 |
|
33 |
+# 2 9229 A 2 3 |
|
34 |
+# 3 9229 A 3 79 |
|
35 |
+# ... |
|
36 |
+# 11 9229 C 1 94 |
|
37 |
+# 12 9229 C 2 75 |
|
38 |
+# 13 9229 C 3 4 |
|
39 |
+# ... |
|
40 |
+# 20 9229 C 10 3 |
|
41 |
+# 21 9229 G 1 1 |
|
42 |
+# 22 9229 G 2 0 |
|
43 |
+# ... |
|
44 |
+# 38 9229 T 8 81 |
|
45 |
+# 39 9229 T 9 1 |
|
46 |
+# 40 9229 T 10 6 |
|
47 |
+readRawMatrices = function (dataDir) |
|
48 |
+{ |
|
49 |
+ file <- file.path(dataDir, 'MATRIX_DATA.txt') |
|
50 |
+ tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric')) |
|
51 |
+ colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count') |
|
52 |
+ |
|
53 |
+ invisible (tbl.matrices) |
|
54 |
+ |
|
55 |
+} # readRawMatrices |
|
56 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
57 |
+convertRawMatricesToStandard = function (tbl.rmat) |
|
58 |
+{ |
|
59 |
+ matrix.ids = unique (tbl.rmat$id) |
|
60 |
+ result = vector ('list', length (matrix.ids)) |
|
61 |
+ |
|
62 |
+ i = 1 |
|
63 |
+ for (matrix.id in matrix.ids) { |
|
64 |
+ tbl.sub = subset (tbl.rmat, id == matrix.id) |
|
65 |
+ # sanity check this rather unusual representation of a position count matrix |
|
66 |
+ base.count = as.data.frame (table (tbl.sub$base)) |
|
67 |
+ stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T')) |
|
68 |
+ # conservative length check. actually expect sequence lengths of 6 - 20 bases |
|
69 |
+ if (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) { |
|
70 |
+ printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1]) |
|
71 |
+ } |
|
72 |
+ stopifnot (all (base.count$Freq == base.count$Freq [1])) |
|
73 |
+ nucleotide.counts = tbl.sub$count |
|
74 |
+ row.names = c('A', 'C', 'G', 'T'); |
|
75 |
+ col.names = 1:(nrow (tbl.sub) / 4) |
|
76 |
+ m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names)) |
|
77 |
+ result [[i]] = m |
|
78 |
+ i = i + 1 |
|
79 |
+ } # for matrix.id |
|
80 |
+ |
|
81 |
+ names (result) = matrix.ids |
|
82 |
+ return (result) |
|
83 |
+ |
|
84 |
+} # convertRawMatricesToStandard |
|
85 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
86 |
+# read 'mysql' tables provide by jaspar: |
|
87 |
+# MATRIX.txt: 9229 CORE MA0001 1 AGL3 |
|
88 |
+# MATRIX_PROTEIN.txt: 9229 P29383 |
|
89 |
+# MATRIX_SPECIES.txt: 9229 3702 |
|
90 |
+# MATRIX_ANNOTATION.txt: |
|
91 |
+# 9229 class Other Alpha-Helix |
|
92 |
+# 9229 comment - |
|
93 |
+# 9229 family MADS |
|
94 |
+# 9229 medline 7632923 |
|
95 |
+# 9229 tax_group plants |
|
96 |
+# 9229 type SELEX |
|
97 |
+createAnnotationTable = function (dataDir) |
|
98 |
+{ |
|
99 |
+ file <- file.path(dataDir, "MATRIX.txt") |
|
100 |
+ tbl.matrix = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
101 |
+ colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder') |
|
102 |
+ |
|
103 |
+ file <- file.path(dataDir, "MATRIX_PROTEIN.txt") |
|
104 |
+ tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
105 |
+ colnames (tbl.protein) = c ('id', 'proteinID') |
|
106 |
+ |
|
107 |
+ file <- file.path(dataDir, "MATRIX_SPECIES.txt") |
|
108 |
+ tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
109 |
+ colnames (tbl.species) = c ('id', 'speciesID') |
|
110 |
+ |
|
111 |
+ file <- file.path(dataDir, "MATRIX_ANNOTATION.txt") |
|
112 |
+ tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="") |
|
113 |
+ colnames (tbl.anno) = c ('id', 'attribute', 'value') |
|
114 |
+ |
|
115 |
+ tbl.family = subset (tbl.anno, attribute=='family') [, -2]; |
|
116 |
+ colnames (tbl.family) = c ('id', 'family') |
|
117 |
+ |
|
118 |
+ tbl.tax = subset (tbl.anno, attribute=='tax_group') [,-2]; |
|
119 |
+ colnames (tbl.tax) = c ('id', 'tax') |
|
120 |
+ |
|
121 |
+ tbl.class = subset (tbl.anno, attribute=='class') [,-2]; |
|
122 |
+ colnames (tbl.class) = c ('id', 'class') |
|
123 |
+ |
|
124 |
+ tbl.comment = subset (tbl.anno, attribute=='comment')[,-2]; |
|
125 |
+ colnames (tbl.comment) = c ('id', 'comment') |
|
126 |
+ |
|
127 |
+ tbl.pubmed = subset (tbl.anno, attribute=='medline')[,-2]; |
|
128 |
+ colnames (tbl.pubmed) = c ('id', 'pubmed') |
|
129 |
+ |
|
130 |
+ tbl.type = subset (tbl.anno, attribute=='type')[,-2]; |
|
131 |
+ colnames (tbl.type) = c ('id', 'type') |
|
132 |
+ |
|
133 |
+ |
|
134 |
+ tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE) |
|
135 |
+ tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE) |
|
136 |
+ tbl.md = merge (tbl.md, tbl.family, all.x=TRUE) |
|
137 |
+ tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE) |
|
138 |
+ tbl.md = merge (tbl.md, tbl.class, all.x=TRUE) |
|
139 |
+ tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE) |
|
140 |
+ tbl.md = merge (tbl.md, tbl.type, all.x=TRUE) |
|
141 |
+ |
|
142 |
+ fullID = paste (tbl.md$mID, tbl.md$version, sep='.') |
|
143 |
+ tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE) |
|
144 |
+ |
|
145 |
+ invisible (tbl.md) |
|
146 |
+ |
|
147 |
+} # createAnnotationTable |
|
148 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
149 |
+# assemble these columns: |
|
150 |
+# names=character(), # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a |
|
151 |
+# nativeNames=character(), # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
152 |
+# geneSymbols=character(), # ABF2, Rhox11 |
|
153 |
+# sequenceCounts=integer(), # often NA |
|
154 |
+# organisms=character(), # Scerevisiae, Mmusculus |
|
155 |
+# bindingMolecules=character(), # YMR072W, 194738 |
|
156 |
+# bindingMoleculeIdTypes=character(), # SGD, entrez gene |
|
157 |
+# bindingDomainTypes=character(), # NA, Homeo |
|
158 |
+# dataSources=character(), # ScerTF, UniPROBE |
|
159 |
+# experimentTypes=character(), # NA, protein-binding microarray |
|
160 |
+# pubmedIDs=character(), # 19111667, 1858359 |
|
161 |
+# tfFamilies=character()) # NA, NA |
|
162 |
+# |
|
163 |
+# from these |
|
164 |
+# |
|
165 |
+createMetadataTable = function (tbl.anno, matrices) |
|
166 |
+{ |
|
167 |
+ options (stringsAsFactors=FALSE) |
|
168 |
+ tbl.md = data.frame () |
|
169 |
+ matrix.ids = names (matrices) |
|
170 |
+ dataSource = 'JASPAR_CORE' |
|
171 |
+ |
|
172 |
+ for (matrix.id in matrix.ids) { |
|
173 |
+ matrix = matrices [[matrix.id]] |
|
174 |
+ stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1) |
|
175 |
+ tbl.sub = subset (tbl.anno, id==matrix.id) |
|
176 |
+ if (nrow (tbl.sub) > 1) { |
|
177 |
+ # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr |
|
178 |
+ # some others, a sampling: P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510 |
|
179 |
+ dimer = paste (unique (tbl.sub$proteinID), collapse=';') |
|
180 |
+ tbl.sub [1, 'proteinID'] = dimer |
|
181 |
+ } |
|
182 |
+ anno = as.list (tbl.sub [1,]) |
|
183 |
+ taxon.code = anno$speciesID |
|
184 |
+ geneId.info = assignGeneId (anno$proteinID) |
|
185 |
+ new.row = list (providerName=anno$binder, |
|
186 |
+ providerId=anno$fullID, |
|
187 |
+ dataSource=dataSource, |
|
188 |
+ geneSymbol=anno$binder, |
|
189 |
+ geneId=geneId.info$geneId, |
|
190 |
+ geneIdType=geneId.info$type, |
|
191 |
+ proteinId=anno$proteinID, |
|
192 |
+ proteinIdType=guessProteinIdentifierType (anno$proteinID), |
|
193 |
+ organism=convertTaxonCode(anno$speciesID), |
|
194 |
+ sequenceCount=as.integer (mean (colSums (matrix))), |
|
195 |
+ bindingSequence=NA_character_, |
|
196 |
+ bindingDomain=anno$class, |
|
197 |
+ tfFamily=anno$family, |
|
198 |
+ experimentType=anno$type, |
|
199 |
+ pubmedID=anno$pubmed) |
|
200 |
+ tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE)) |
|
201 |
+ full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID) |
|
202 |
+ rownames (tbl.md) [nrow (tbl.md)] = full.name |
|
203 |
+ } # for i |
|
204 |
+ |
|
205 |
+ # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA> |
|
206 |
+ NA.string.indices = grep ('NA', tbl.md$proteinId) |
|
207 |
+ if (length (NA.string.indices) > 0) |
|
208 |
+ tbl.md$proteinId [NA.string.indices] = NA |
|
209 |
+ |
|
210 |
+ invisible (tbl.md) |
|
211 |
+ |
|
212 |
+} # createMetadataTable |
|
213 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
214 |
+renameMatrices = function (matrices, tbl.md) |
|
215 |
+{ |
|
216 |
+ stopifnot (length (matrices) == nrow (tbl.md)) |
|
217 |
+ names (matrices) = rownames (tbl.md) |
|
218 |
+ invisible (matrices) |
|
219 |
+ |
|
220 |
+} # renameMatrices |
|
221 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
222 |
+# full names: ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', |
|
223 |
+# 'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam', |
|
224 |
+# 'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster', |
|
225 |
+# 'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis', |
|
226 |
+# 'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'), |
|
227 |
+convertTaxonCode = function (ncbi.code) |
|
228 |
+{ |
|
229 |
+ #if (is.na (ncbi.code)) |
|
230 |
+ # return (NA_character_) |
|
231 |
+ ncbi.code = as.character (ncbi.code) |
|
232 |
+ if (ncbi.code %in% c ('-', NA_character_, 'NA')) |
|
233 |
+ return ('Vertebrata') |
|
234 |
+ |
|
235 |
+ tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102', |
|
236 |
+ '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729', |
|
237 |
+ '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'), |
|
238 |
+ name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', |
|
239 |
+ 'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam', |
|
240 |
+ 'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster', |
|
241 |
+ 'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', |
|
242 |
+ 'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'), |
|
243 |
+ stringsAsFactors=FALSE) |
|
244 |
+ |
|
245 |
+ ncbi.code = as.character (ncbi.code) |
|
246 |
+ index = which (tbl$code == ncbi.code) |
|
247 |
+ if (length (index) == 1) |
|
248 |
+ return (tbl$name [index]) |
|
249 |
+ else { |
|
250 |
+ write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ()) |
|
251 |
+ return (NA_character_) |
|
252 |
+ } |
|
253 |
+ |
|
254 |
+} # convertTaxonCode |
|
255 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
256 |
+# an empirical and not altogether trustworthy solution to identifying identifier types. |
|
257 |
+guessProteinIdentifierType = function (moleculeName) |
|
258 |
+{ |
|
259 |
+ if (nchar (moleculeName) == 0) |
|
260 |
+ return (NA_character_) |
|
261 |
+ if (is.na (moleculeName)) |
|
262 |
+ return (NA_character_) |
|
263 |
+ |
|
264 |
+ first.char = substr (moleculeName, 1, 1) |
|
265 |
+ |
|
266 |
+ if (first.char == 'Y') |
|
267 |
+ return ('SGD') |
|
268 |
+ |
|
269 |
+ if (first.char %in% c ('P', 'Q', 'O', 'A', 'C')) |
|
270 |
+ return ('UNIPROT') |
|
271 |
+ |
|
272 |
+ if (length (grep ('^EAW', moleculeName)) == 1) |
|
273 |
+ return ('NCBI') |
|
274 |
+ |
|
275 |
+ if (length (grep ('^EAX', moleculeName)) == 1) |
|
276 |
+ return ('NCBI') |
|
277 |
+ |
|
278 |
+ if (length (grep ('^NP_', moleculeName)) == 1) |
|
279 |
+ return ('REFSEQ') |
|
280 |
+ |
|
281 |
+ if (length (grep ('^BAD', moleculeName)) == 1) |
|
282 |
+ return ('EMBL') |
|
283 |
+ |
|
284 |
+ return (NA_character_) |
|
285 |
+ |
|
286 |
+} # guessProteinIdentifierType |
|
287 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
288 |
+normalizeMatrices = function (matrices) |
|
289 |
+{ |
|
290 |
+ mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) |
|
291 |
+ invisible (mtx.normalized) |
|
292 |
+ |
|
293 |
+} # normalizeMatrices |
|
294 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
295 |
+assignGeneId = function (proteinId) |
|
296 |
+{ |
|
297 |
+ if (!exists ('id.uniprot.human')) { |
|
298 |
+ |
|
299 |
+ tbl = toTable (org.Hs.egUNIPROT) |
|
300 |
+ id.uniprot.human <<- as.character (tbl$gene_id) |
|
301 |
+ names (id.uniprot.human) <<- tbl$uniprot_id |
|
302 |
+ |
|
303 |
+ tbl = toTable (org.Hs.egREFSEQ) |
|
304 |
+ tbl = tbl [grep ('^NP_', tbl$accession),] |
|
305 |
+ id.refseq.human <<- as.character (tbl$gene_id) |
|
306 |
+ names (id.refseq.human) <<- tbl$accession |
|
307 |
+ |
|
308 |
+ tbl = toTable (org.Mm.egUNIPROT) |
|
309 |
+ id.uniprot.mouse <<- as.character (tbl$gene_id) |
|
310 |
+ names (id.uniprot.mouse) <<- tbl$uniprot_id |
|
311 |
+ |
|
312 |
+ tbl = toTable (org.Mm.egREFSEQ) |
|
313 |
+ tbl = tbl [grep ('^NP_', tbl$accession),] |
|
314 |
+ id.refseq.mouse <<- as.character (tbl$gene_id) |
|
315 |
+ names (id.refseq.mouse) <<- tbl$accession |
|
316 |
+ } |
|
317 |
+ |
|
318 |
+ proteinId = strsplit (proteinId, '\\.')[[1]][1] # remove any trailing '.*' |
|
319 |
+ |
|
320 |
+ if (proteinId %in% names (id.uniprot.human)) |
|
321 |
+ return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ')) |
|
322 |
+ |
|
323 |
+ if (proteinId %in% names (id.uniprot.mouse)) |
|
324 |
+ return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ')) |
|
325 |
+ |
|
326 |
+ if (proteinId %in% names (id.refseq.human)) |
|
327 |
+ return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ')) |
|
328 |
+ |
|
329 |
+ if (proteinId %in% names (id.refseq.mouse)) |
|
330 |
+ return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ')) |
|
331 |
+ |
|
332 |
+ found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE)) |
|
333 |
+ |
|
334 |
+ if (found.leading.Y == 1) |
|
335 |
+ return (list (geneId=proteinId, type='SGD')) |
|
336 |
+ |
|
337 |
+ return (list (geneId=NA_character_, type=NA_character_)) |
|
338 |
+ |
|
339 |
+} # assignGeneId |
|
340 |
+#------------------------------------------------------------------------------------------------------------------------ |
0 | 341 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,194 @@ |
1 |
+# jaspar/test.R |
|
2 |
+# notes: 3 matrices come w/o speciesID, tax = 'vertebrates'. not our problem to fix, at least not yet. |
|
3 |
+# TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2) |
|
4 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
5 |
+library (RUnit) |
|
6 |
+source("import.R") |
|
7 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
8 |
+run.tests = function (dataDir=kDataDir) |
|
9 |
+{ |
|
10 |
+ x.tbl.rmat <<- test.readRawMatrices (dataDir) |
|
11 |
+ x.matrices <<- test.convertRawMatricesToStandard (x.tbl.rmat) |
|
12 |
+ x.tbl.anno <<- test.createAnnotationTable (dataDir) |
|
13 |
+ test.assignGeneId (dataDir) |
|
14 |
+ x.tbl.md <<- test.createMetadataTable (x.tbl.anno, x.matrices) |
|
15 |
+ x.matrices.renamed <<- test.renameMatrices (x.matrices, x.tbl.md, x.tbl.anno) |
|
16 |
+ x.matrices.normalized <<- test.normalizeMatrices (x.matrices.renamed) |
|
17 |
+ |
|
18 |
+} # run.tests |
|
19 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
20 |
+test.readRawMatrices = function (dataDir) |
|
21 |
+{ |
|
22 |
+ print ('--- test.readMatrices') |
|
23 |
+ tbl.rmat = readRawMatrices (dataDir) |
|
24 |
+ checkEquals (ncol (tbl.rmat), 4) |
|
25 |
+ checkEquals (colnames (tbl.rmat), c ('id', 'base', 'pos', 'count')) |
|
26 |
+ checkEquals (class (tbl.rmat$id), 'character') |
|
27 |
+ checkEquals (class (tbl.rmat$base), 'character') |
|
28 |
+ checkEquals (class (tbl.rmat$pos), 'numeric') |
|
29 |
+ checkEquals (class (tbl.rmat$count), 'numeric') |
|
30 |
+ |
|
31 |
+ checkTrue (nrow (tbl.rmat) > 18000) # about 450 motifs, each represeted by 4 rows (ACGT) and about 10 positions |
|
32 |
+ checkTrue (length (unique (tbl.rmat$id)) > 450) |
|
33 |
+ invisible (tbl.rmat) |
|
34 |
+ |
|
35 |
+} # test.readRawMatrices |
|
36 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
37 |
+test.convertRawMatricesToStandard = function (tbl.rmat) |
|
38 |
+{ |
|
39 |
+ print ('--- test.convertRawMatricesToStandard') |
|
40 |
+ # get just the first two raw matrices |
|
41 |
+ |
|
42 |
+ first.two.ids = head (unique (tbl.rmat$id), n=2) |
|
43 |
+ rows = nrow (subset (tbl.rmat, id %in% first.two.ids)) |
|
44 |
+ matrices = convertRawMatricesToStandard (tbl.rmat [1:rows,]) |
|
45 |
+ checkEquals (length (matrices), 2) |
|
46 |
+ checkEquals (names (matrices), first.two.ids) |
|
47 |
+ |
|
48 |
+ # it will not always be true, but IS true for the first two matrices, currently "9229" and "9231", that there |
|
49 |
+ # are an equal number of nucleotides at each position. |
|
50 |
+ checkTrue (all (colSums (matrices [[1]]) == 97)) |
|
51 |
+ checkTrue (all (colSums (matrices [[2]]) == 185)) |
|
52 |
+ |
|
53 |
+ # now run all the matrices through |
|
54 |
+ matrices = convertRawMatricesToStandard (tbl.rmat) |
|
55 |
+ checkEquals (length (matrices), 459) |
|
56 |
+ checkEquals (names (matrices)[1:2], first.two.ids) |
|
57 |
+ |
|
58 |
+ invisible (matrices) |
|
59 |
+ |
|
60 |
+} # test.convertRawMatricesToStandard |
|
61 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
62 |
+test.createAnnotationTable = function (dataDir) |
|
63 |
+{ |
|
64 |
+ print ('--- test.createAnnotationTable') |
|
65 |
+ tbl.anno = createAnnotationTable (dataDir) |
|
66 |
+ checkEquals (dim (tbl.anno), c (513, 13)) |
|
67 |
+ expected = c ("fullID", "id", "category", "mID", "version", "binder", "speciesID", "proteinID", "family", "tax", "class", "pubmed", "type") |
|
68 |
+ checkEquals (colnames (tbl.anno), expected) |
|
69 |
+ |
|
70 |
+ checkEquals (head (tbl.anno$fullID), c ("MA0001.1", "MA0003.1", "MA0004.1", "MA0005.1", "MA0006.1", "MA0006.1")) |
|
71 |
+ invisible (tbl.anno) |
|
72 |
+ |
|
73 |
+} # test.createAnnotationTable |
|
74 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
75 |
+test.createMetadataTable = function (tbl.anno, matrices) |
|
76 |
+{ |
|
77 |
+ print ('--- test.createMetadataTable') |
|
78 |
+ # try it first with just two matrices |
|
79 |
+ tbl.md = createMetadataTable (tbl.anno, matrices [1:2]) |
|
80 |
+ checkEquals (dim (tbl.md), c (2, 15)) |
|
81 |
+ checkEquals (colnames (tbl.md), c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", |
|
82 |
+ "proteinId", "proteinIdType", "organism", "sequenceCount", "bindingSequence", |
|
83 |
+ "bindingDomain", "tfFamily", "experimentType", "pubmedID")) |
|
84 |
+ checkEquals (tbl.md$proteinId, c ('P29383', 'P05549')) |
|
85 |
+ checkEquals (tbl.md$proteinIdType, c ('UNIPROT', 'UNIPROT')) |
|
86 |
+ |
|
87 |
+ # now use the whole table |
|
88 |
+ tbl.md = createMetadataTable (tbl.anno, matrices) |
|
89 |
+ checkEquals (dim (tbl.md), c (length (matrices), 15)) |
|
90 |
+ # test for proper conversion of speciesID = NA or '-' to Vertebrata |
|
91 |
+ checkEquals (which (is.na (tbl.md$organism)), integer (0)) |
|
92 |
+ checkEquals (grep ('-', tbl.md$organism), integer (0)) |
|
93 |
+ |
|
94 |
+ # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 had 'NA' for proteinID, not <NA>. fixed? |
|
95 |
+ checkEquals (grep ('NA', tbl.md$proteinId), integer (0)) |
|
96 |
+ invisible (tbl.md) |
|
97 |
+ |
|
98 |
+} # test.createMetadataTable |
|
99 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
100 |
+test.renameMatrices = function (matrices, tbl.md, tbl.anno) |
|
101 |
+{ |
|
102 |
+ # try it with just the first two matrices |
|
103 |
+ matrix.pair = matrices [1:2] |
|
104 |
+ tbl.md = createMetadataTable (tbl.anno, matrix.pair) |
|
105 |
+ checkEquals (dim (tbl.md), c (2, 15)) |
|
106 |
+ old.matrix.names = names (matrix.pair) |
|
107 |
+ matrices.renamed = renameMatrices (matrix.pair, tbl.md) |
|
108 |
+ |
|
109 |
+ # test: the old name is an id, '9229'. find, in tbl.anno, the fullID, 'MA0001.1'. then make sure 'MA000.1' is |
|
110 |
+ # in the new name of that same matrix |
|
111 |
+ |
|
112 |
+ for (i in 1:length (matrix.pair)) { |
|
113 |
+ fullID = subset (x.tbl.anno, id==old.matrix.names [i])$fullID |
|
114 |
+ checkTrue (length (grep (fullID, names (matrices.renamed) [i])) == 1) |
|
115 |
+ } # for i |
|
116 |
+ |
|
117 |
+ # now try it for the whole set, with selective focused tests |
|
118 |
+ |
|
119 |
+ tbl.md = createMetadataTable (tbl.anno, matrices) |
|
120 |
+ checkEquals (nrow (tbl.md), length (matrices)) |
|
121 |
+ old.matrix.names = names (matrices) |
|
122 |
+ matrices.renamed = renameMatrices (matrices, tbl.md) |
|
123 |
+ |
|
124 |
+ checkEquals (nrow (tbl.md), length (matrices.renamed)) |
|
125 |
+ checkEquals (length (grep ('-MA0', names (matrices.renamed))), length (matrices.renamed)) |
|
126 |
+ |
|
127 |
+ invisible (matrices.renamed) |
|
128 |
+ |
|
129 |
+} # test.renameMatrices |
|
130 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
131 |
+test.convertTaxonCode = function () |
|
132 |
+{ |
|
133 |
+ print ('--- test.convertTaxonCode') |
|
134 |
+ |
|
135 |
+ checkEquals (convertTaxonCode ('9606'), 'Hsapiens') |
|
136 |
+ checkEquals (convertTaxonCode (9606), 'Hsapiens') |
|
137 |
+ # anomalous codes, which an examination of the jaspar website reveals as 'vertebrates' |
|
138 |
+ checkEquals (convertTaxonCode (NA), 'Vertebrata') |
|
139 |
+ checkEquals (convertTaxonCode ('NA'), 'Vertebrata') |
|
140 |
+ checkEquals (convertTaxonCode (NA_character_), 'Vertebrata') |
|
141 |
+ checkEquals (convertTaxonCode ('-'), 'Vertebrata') |
|
142 |
+ |
|
143 |
+} # test.convertTaxonCode |
|
144 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
145 |
+test.guessProteinIdentifierType = function (moleculeName) |
|
146 |
+{ |
|
147 |
+ print ('--- test.guessProteinIdentifierType') |
|
148 |
+ checkEquals (guessProteinIdentifierType ('P29383'), 'UNIPROT') |
|
149 |
+ |
|
150 |
+ all.types = sapply (x.tbl.anno$proteinID, guessProteinIdentifierType) |
|
151 |
+ checkTrue (length (which (is.na (all.types))) < 12) # got most of them. |
|
152 |
+ |
|
153 |
+} # test.guessProteinIdentifierType |
|
154 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
155 |
+test.normalizeMatrices = function (matrices) |
|
156 |
+{ |
|
157 |
+ print ('--- test.normalizeMatrices') |
|
158 |
+ |
|
159 |
+ colsums = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx)))))) |
|
160 |
+ #checkTrue (all (colsums > 1)) |
|
161 |
+ |
|
162 |
+ matrices.norm = normalizeMatrices (matrices) |
|
163 |
+ |
|
164 |
+ colsums = as.integer (sapply (matrices.norm, function (mtx) as.integer (mean (round (colSums (mtx)))))) |
|
165 |
+ checkTrue (all (colsums == 1)) |
|
166 |
+ |
|
167 |
+ invisible (matrices.norm) |
|
168 |
+ |
|
169 |
+} # test.normalizeMatrices |
|
170 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
171 |
+test.assignGeneId = function (dataDir, proteinId) |
|
172 |
+{ |
|
173 |
+ print ('--- test.assignGeneId') |
|
174 |
+ uniprot.ids = c ('Q9GRA5', 'P31314', 'AAC18941', 'O49397') |
|
175 |
+ refseq.ids = c ('NP_995315.1', 'NP_032840', 'NP_599022') |
|
176 |
+ yeast.ids = c ('YKL112W', 'YMR072W', 'YLR131C') |
|
177 |
+ |
|
178 |
+ checkEquals (assignGeneId ('NP_995315.1'), list (geneId='4782', type='ENTREZ')) |
|
179 |
+ checkEquals (assignGeneId ('NP_599022'), list (geneId='6095', type='ENTREZ')) |
|
180 |
+ |
|
181 |
+ checkEquals (assignGeneId ('P31314'), list (geneId='3195', type='ENTREZ')) |
|
182 |
+ |
|
183 |
+ checkEquals (assignGeneId ('YKL112W'), list (geneId='YKL112W', type='SGD')) |
|
184 |
+ |
|
185 |
+ # see how successful this is over all 513 proteinIds |
|
186 |
+ |
|
187 |
+ tbl.anno = createAnnotationTable (dataDir) |
|
188 |
+ mtx.geneId = as.data.frame (t (sapply (tbl.anno$proteinID, assignGeneId))) |
|
189 |
+ tbl.types = as.data.frame (table (as.character (mtx.geneId$type), useNA='always'), stringsAsFactors=FALSE) |
|
190 |
+ checkEquals (tbl.types$Var1, c ("ENTREZ", "SGD", NA)) |
|
191 |
+ checkEquals (tbl.types$Freq, c (142, 177, 194)) |
|
192 |
+ |
|
193 |
+} # test.assignGeneId |
|
194 |
+#------------------------------------------------------------------------------------------------------------------------ |