git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75333 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,519 @@ |
1 |
+# uniprobe/import.R |
|
2 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
3 |
+library (RMySQL) |
|
4 |
+library (org.Hs.eg.db) |
|
5 |
+library (org.Mm.eg.db) |
|
6 |
+library (org.Sc.sgd.db) |
|
7 |
+library (org.Ce.eg.db) |
|
8 |
+library(tools) # for md5sum |
|
9 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
10 |
+printf <- function(...) print(noquote(sprintf(...))) |
|
11 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
12 |
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/uniprobe" |
|
13 |
+kDataDir <- "~/s/data/public/TFBS/uniprobe" |
|
14 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
15 |
+run = function (dataDir) |
|
16 |
+{ |
|
17 |
+ all.files = identifyFiles (file.path(dataDir,'All_PWMs')) |
|
18 |
+ matrices = readAndParse (all.files) |
|
19 |
+ tbl.pubRef = createPublicationRefTable () |
|
20 |
+ tbl.geneRef = createGeneRefTable () |
|
21 |
+ tbl.md = createMetadata (matrices, tbl.pubRef, tbl.geneRef) |
|
22 |
+ stopifnot (length (matrices) == nrow (tbl.md)) |
|
23 |
+ matrices = renameMatrices (matrices, tbl.md) |
|
24 |
+ |
|
25 |
+ serializedFile <- "uniprobe.RData" |
|
26 |
+ save (matrices, tbl.md, file=serializedFile) |
|
27 |
+ printf("saved %d matrices to %s", length(matrices), serializedFile) |
|
28 |
+ printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
29 |
+ |
|
30 |
+} # run |
|
31 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
32 |
+createMatrixNameUniqifier = function (matrix) |
|
33 |
+{ |
|
34 |
+ temporary.file <<- tempfile () |
|
35 |
+ write (as.character (matrix), file=temporary.file) |
|
36 |
+ md5sum.string <<- as.character (md5sum (temporary.file)) |
|
37 |
+ stopifnot (nchar (md5sum.string) == 32) |
|
38 |
+ md5.6chars = substr (md5sum.string, 29, 32) |
|
39 |
+ #unlink (temporary.file) |
|
40 |
+ |
|
41 |
+} # createMatrixNameUniqifier |
|
42 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
43 |
+parsePWMfromText = function (lines.of.text) |
|
44 |
+{ |
|
45 |
+ z = lines.of.text |
|
46 |
+ stopifnot (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) == 4) |
|
47 |
+ |
|
48 |
+ # determine the number of columns |
|
49 |
+ token.count.first.line = length (strsplit (lines.of.text [1], '\t')[[1]]) |
|
50 |
+ column.count = token.count.first.line - 1 # subtract off the 'A:\t' token |
|
51 |
+ result = matrix (nrow=4, ncol=column.count, byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:column.count)) |
|
52 |
+ |
|
53 |
+ for (line in lines.of.text) { |
|
54 |
+ tokens = strsplit (line, '\\s*[:\\|]') [[1]] |
|
55 |
+ nucleotide = tokens [1] |
|
56 |
+ numbers.raw = tokens [2] |
|
57 |
+ zz = numbers.raw |
|
58 |
+ number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]] |
|
59 |
+ while (nchar (number.tokens [1]) == 0) |
|
60 |
+ number.tokens = number.tokens [-1] |
|
61 |
+ zzz = number.tokens |
|
62 |
+ numbers = as.numeric (number.tokens) |
|
63 |
+ zzzz = numbers |
|
64 |
+ #printf ('adding %s: %s', nucleotide, list.to.string (numbers)) |
|
65 |
+ result [nucleotide,] = numbers |
|
66 |
+ } |
|
67 |
+ |
|
68 |
+ return (result) |
|
69 |
+ |
|
70 |
+} # parsePWMfromText |
|
71 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
72 |
+extractPWMfromFile = function (filename) |
|
73 |
+{ |
|
74 |
+ text = scan (filename, sep='\n', what=character (0), quiet=TRUE) |
|
75 |
+ matrix.start.lines = grep ('A\\s*[:\\|]', text) |
|
76 |
+ stopifnot (length (matrix.start.lines) == 1) |
|
77 |
+ #printf ('%50s: %s', filename, list.to.string (matrix.start.lines)) |
|
78 |
+ start.line = matrix.start.lines [1] |
|
79 |
+ end.line = start.line + 3 |
|
80 |
+ lines = text [start.line:end.line] |
|
81 |
+ pwm.matrix = parsePWMfromText (lines) |
|
82 |
+ return (pwm.matrix) |
|
83 |
+ |
|
84 |
+} # extractPWMfromFile |
|
85 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
86 |
+createPublicationRefTable = function () |
|
87 |
+{ |
|
88 |
+ # tbl.pubmed <<- data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'), |
|
89 |
+ # pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'), |
|
90 |
+ # stringsAsFactors=FALSE) |
|
91 |
+ |
|
92 |
+ options (stringsAsFactors=FALSE) |
|
93 |
+ tbl.ref = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09')) |
|
94 |
+ tbl.ref = cbind (tbl.ref, author=c('Scharer', 'Berger', 'Grove', 'Wei', 'Lesch', 'Zhu', 'Pompeani', 'Berger', 'De Silva', 'Badis')) |
|
95 |
+ tbl.ref = cbind (tbl.ref, pmid=c('19147588','18585359','19632181','20517297','19204119','19158363','18681939','16998473','18541913','19443739')) |
|
96 |
+ tbl.ref = cbind (tbl.ref, organism=c('Hsapiens', 'Mmusculus', 'Celegans', 'Mmusculus', 'Celegans', 'Scerevisiae', 'Vharveyi', |
|
97 |
+ 'Scerevisiae;Hsapiens;Mmusculus', 'Apicomplexa', 'Mmusculus')) |
|
98 |
+ tbl.ref = cbind (tbl.ref, count=c(1, 168, 34, 25, 1, 89, 1, 5, 3, 104)) |
|
99 |
+ titles = c ('Genome-wide promoter analysis of the SOX4 transcriptional network in prostate cancer cells', |
|
100 |
+ 'Variation in homeodomain DNA binding revealed by high-resolution analysis of sequence preferences', |
|
101 |
+ 'A Multiparameter Network Reveals Extensive Divergence between C. elegans bHLH Transcription Factors', |
|
102 |
+ 'Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo', |
|
103 |
+ 'Transcriptional regulation and stabilization of left right neuronal identity in C. elegans', |
|
104 |
+ 'High-resolution DNA-binding specificity analysis of yeast transcription factors', |
|
105 |
+ 'The Vibrio harveyi master quorum-sensing regulator, LuxR...', |
|
106 |
+ 'Compact, universal DNA microarrays...', |
|
107 |
+ 'Specific DNA-binding by Apicomplexan AP2 transcription factors', |
|
108 |
+ 'Diversity and complexity in DNA recognition by transcription factors') |
|
109 |
+ tbl.ref = cbind (tbl.ref, title=titles) |
|
110 |
+ |
|
111 |
+ tbl.ref |
|
112 |
+ |
|
113 |
+} # createPublicationRefTable |
|
114 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
115 |
+identifyFiles = function (filePath) |
|
116 |
+{ |
|
117 |
+ cmd = sprintf ('find %s -name "*pwm*"', filePath) |
|
118 |
+ |
|
119 |
+ files.raw = system (cmd, intern=TRUE) |
|
120 |
+ |
|
121 |
+ # all legit files end in ".pwm" or ".txt" |
|
122 |
+ files = c (grep (".pwm$", files.raw, value=T, ignore.case=T), |
|
123 |
+ grep (".txt$", files.raw, value=T, ignore.case=T)) |
|
124 |
+ |
|
125 |
+ reverseComplementMatrices = grep ('RC', files) |
|
126 |
+ if (length (reverseComplementMatrices) > 0) |
|
127 |
+ files = files [-reverseComplementMatrices] |
|
128 |
+ |
|
129 |
+ # don't know why these were excluded. leave them in for now |
|
130 |
+ embo10.excluders = grep ('EMBO', files) |
|
131 |
+ if (length (embo10.excluders) > 0) |
|
132 |
+ files = files [-embo10.excluders] |
|
133 |
+ |
|
134 |
+ cell09.excluders = grep ('Cell09', files) # include these once the sql tables are updated |
|
135 |
+ if (length (cell09.excluders) > 0) |
|
136 |
+ files = files [-cell09.excluders] |
|
137 |
+ secondary.excluders = grep ('secondary', files) |
|
138 |
+ if (length (secondary.excluders) > 0) |
|
139 |
+ files = files [-secondary.excluders] |
|
140 |
+ |
|
141 |
+ invisible (files) |
|
142 |
+ |
|
143 |
+} # identifyFiles |
|
144 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
145 |
+readAndParse = function (file.list) |
|
146 |
+{ |
|
147 |
+ matrices = list () |
|
148 |
+ |
|
149 |
+ for (file in file.list) { |
|
150 |
+ #printf ('read and parse %s', file) |
|
151 |
+ text = scan (file, sep='\n', what=character (0), quiet=TRUE) |
|
152 |
+ aaa <<- text |
|
153 |
+ matrix.start.lines = grep ('A:', text) |
|
154 |
+ stopifnot (length (matrix.start.lines) == 1) |
|
155 |
+ start.line = matrix.start.lines [1] |
|
156 |
+ end.line = start.line + 3 |
|
157 |
+ bbb <<- start.line |
|
158 |
+ ccc <<- end.line |
|
159 |
+ lines.of.text = text [start.line:end.line] |
|
160 |
+ zzz <<- lines.of.text |
|
161 |
+ pwm.matrix = parsePWMfromText (lines.of.text) |
|
162 |
+ name.tokens <- strsplit(file,"/")[[1]] |
|
163 |
+ token.count <- length(name.tokens) |
|
164 |
+ |
|
165 |
+ matrix.name <- paste(name.tokens[(token.count-1):token.count], collapse="/") |
|
166 |
+ matrices [[matrix.name]] = pwm.matrix |
|
167 |
+ } |
|
168 |
+ |
|
169 |
+ invisible (matrices) |
|
170 |
+ |
|
171 |
+} # readAndParse |
|
172 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
173 |
+# eg, ./All_PWMs/GD09/Nsy-7.pwm to simply 'Nsy-7' |
|
174 |
+# |
|
175 |
+translateFileNameToGeneName = function (long.name) |
|
176 |
+{ |
|
177 |
+ if (length (grep ('/', long.name)) > 0) { |
|
178 |
+ tokens = strsplit (long.name, '/')[[1]] |
|
179 |
+ count = length (tokens) |
|
180 |
+ gene.name.raw = tokens [count] # get the last one |
|
181 |
+ } |
|
182 |
+ else { |
|
183 |
+ gene.name.raw = long.name |
|
184 |
+ } |
|
185 |
+ |
|
186 |
+ gene.name = strsplit (gene.name.raw, '\\.')[[1]][1] # remove any file suffix |
|
187 |
+ |
|
188 |
+ gene.name |
|
189 |
+ |
|
190 |
+} # test.translateFileNameToGeneName |
|
191 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
192 |
+# and update matrix names |
|
193 |
+createMetadata = function (matrices, tbl.pubRef, tbl.geneRef) |
|
194 |
+{ |
|
195 |
+ options (stringsAsFactors=FALSE) |
|
196 |
+ dataSource = 'UniPROBE' |
|
197 |
+ trim = function (s) {sub (' +$', '', sub ('^ +', '', s))} |
|
198 |
+ tbl.md = data.frame () |
|
199 |
+ |
|
200 |
+ removers = list (Cart1=110, Cutl1=115, Hoxa7=156, Irx3=185) |
|
201 |
+ |
|
202 |
+ for (m in 1: length (matrices)) { |
|
203 |
+ matrix.name = names (matrices) [m] |
|
204 |
+ #printf ('%d: %s', m, matrix.name) |
|
205 |
+ native.name.raw = gsub ('All_PWMs/', '', matrix.name) |
|
206 |
+ native.name = extractNativeNames (native.name.raw) |
|
207 |
+ |
|
208 |
+ # extractNativeNames needs to be rethought. but for now, just hack in some fixes |
|
209 |
+ if (native.name == 'Cgd2') native.name='Cgd2_3490' # inconsistency at uniprobe, accomodated here |
|
210 |
+ if (native.name == 'PF14') native.name='PF14_0633' |
|
211 |
+ if (native.name == 'Uncx4') native.name='Uncx4.1' |
|
212 |
+ |
|
213 |
+ if (!native.name %in% tbl.geneRef$name) { |
|
214 |
+ browser (text='native.name not in tbl.geneRef') |
|
215 |
+ } |
|
216 |
+ |
|
217 |
+ experiment.folder = strsplit (native.name.raw, '/')[[1]][1] |
|
218 |
+ up.id.number = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$id |
|
219 |
+ # todo BUG here! |
|
220 |
+ full.uniprobe.id = sprintf ('UP%05d', up.id.number) |
|
221 |
+ if (length (full.uniprobe.id) != 1) { |
|
222 |
+ browser (text='full.uniprobe.id has wrong length') |
|
223 |
+ } |
|
224 |
+ |
|
225 |
+ geneId = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$stdID |
|
226 |
+ if (is.na (geneId) | geneId == '') |
|
227 |
+ geneId = NA_character_ |
|
228 |
+ |
|
229 |
+ bindingDomain = trim (subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$domain) |
|
230 |
+ organism = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$species |
|
231 |
+ |
|
232 |
+ # a native name (a gene symbol) may have a dash, but for the long name, we want dashes to separate |
|
233 |
+ # the organism-dataSource-geneIdentifier matrix name. |
|
234 |
+ # so covert this here, but do not eclipse any dash-including native.names |
|
235 |
+ |
|
236 |
+ native.name.no.dashes = gsub ('-', '_', native.name) |
|
237 |
+ native.name.uniq = sprintf ('%s-%s-%s.%s', organism, dataSource, native.name.no.dashes, full.uniprobe.id) |
|
238 |
+ if (native.name.uniq %in% rownames (tbl.md)) { |
|
239 |
+ matrix = matrices [[m]] |
|
240 |
+ uniqifier = createMatrixNameUniqifier (matrix) |
|
241 |
+ #printf ('before, not unique: %s', native.name.uniq) |
|
242 |
+ native.name.uniq = paste (native.name.uniq, uniqifier, sep='.') |
|
243 |
+ #printf ('after, not unique: %s', native.name.uniq) |
|
244 |
+ } |
|
245 |
+ |
|
246 |
+ sequenceCount = NA_integer_ |
|
247 |
+ bindingSequence = NA_character_ |
|
248 |
+ tfFamily = NA_character_ |
|
249 |
+ |
|
250 |
+ experimentType = 'protein binding microarray' |
|
251 |
+ pubmedID = subset (tbl.pubRef, folder==experiment.folder)$pmid |
|
252 |
+ #printf ('%12s: %20s', native.name, organism) |
|
253 |
+ |
|
254 |
+ if (is.na (geneId)) |
|
255 |
+ geneIdType = NA |
|
256 |
+ else if (organism == 'Scerevisiae') |
|
257 |
+ geneIdType = 'SGD' |
|
258 |
+ else if (organism %in% c ('Mmusculus', 'Hsapiens', 'Celegans')) |
|
259 |
+ geneIdType = 'ENTREZ' |
|
260 |
+ else |
|
261 |
+ geneIdType = 'todo' |
|
262 |
+ |
|
263 |
+ proteinId = NA_character_ |
|
264 |
+ proteinIdType = NA_character_ |
|
265 |
+ |
|
266 |
+ proteinId.tmp = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$uniprot |
|
267 |
+ if (!is.na (proteinId.tmp) & nchar (proteinId.tmp) > 1) { |
|
268 |
+ #printf ('getting uniprot id for %s: %s', native.name, proteinId.tmp) |
|
269 |
+ proteinId = proteinId.tmp |
|
270 |
+ proteinIdType = 'UNIPROT' |
|
271 |
+ } # found good id in the uniprot column |
|
272 |
+ |
|
273 |
+ if (is.na (proteinId.tmp) | nchar (proteinId.tmp) == 0) { |
|
274 |
+ proteinId.tmp = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$refseq_id |
|
275 |
+ if (!is.na (proteinId.tmp) & nchar (proteinId.tmp) > 1) { |
|
276 |
+ #printf ('getting refseq id for %s: %s', native.name, proteinId.tmp) |
|
277 |
+ proteinId = proteinId.tmp |
|
278 |
+ proteinIdType = 'REFSEQ' |
|
279 |
+ } |
|
280 |
+ } # need to look in the refseq column |
|
281 |
+ |
|
282 |
+ |
|
283 |
+ bindingSequence = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$bindingSequence |
|
284 |
+ #printf ('%12s: %40s', native.name, bindingSequence) |
|
285 |
+ |
|
286 |
+ new.row = list (providerName=native.name.raw, |
|
287 |
+ providerId=full.uniprobe.id, |
|
288 |
+ dataSource=dataSource, |
|
289 |
+ geneSymbol=native.name, |
|
290 |
+ geneId=geneId, |
|
291 |
+ geneIdType=geneIdType, |
|
292 |
+ proteinId=proteinId, |
|
293 |
+ proteinIdType=proteinIdType, |
|
294 |
+ organism=organism, |
|
295 |
+ sequenceCount=NA, |
|
296 |
+ bindingSequence=bindingSequence, |
|
297 |
+ bindingDomain=bindingDomain, |
|
298 |
+ tfFamily=tfFamily, |
|
299 |
+ experimentType=experimentType, |
|
300 |
+ pubmedID=pubmedID) |
|
301 |
+ #if (native.name == 'Cart1') browser (text='Cart1') |
|
302 |
+ if (native.name.uniq %in% rownames (tbl.md)) browser (text='dup row name') |
|
303 |
+ #print (new.row) |
|
304 |
+ tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE)) |
|
305 |
+ if (length (native.name.uniq) != 1) browser (text='native.name.unique length != 1') |
|
306 |
+ rownames (tbl.md) [m] = native.name.uniq |
|
307 |
+ } |
|
308 |
+ |
|
309 |
+ invisible (tbl.md) |
|
310 |
+ |
|
311 |
+} # createMetadata |
|
312 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
313 |
+extractNativeNames = function (native.names.raw) |
|
314 |
+{ |
|
315 |
+ name.count = length (native.names.raw) |
|
316 |
+ result = vector ('character', name.count) |
|
317 |
+ |
|
318 |
+ for (i in 1:name.count) { |
|
319 |
+ native.name.raw = native.names.raw [i] |
|
320 |
+ tokens = strsplit (native.name.raw, '/') [[1]] |
|
321 |
+ count = length (tokens) |
|
322 |
+ cooked.1 = native.name.raw |
|
323 |
+ if (count > 1) |
|
324 |
+ cooked.1 = tokens [length (tokens)] |
|
325 |
+ tokens = strsplit (cooked.1, '[_\\.]')[[1]] |
|
326 |
+ cooked.2 = tokens [1] |
|
327 |
+ result [i] = cooked.2 |
|
328 |
+ } |
|
329 |
+ |
|
330 |
+ invisible (result) |
|
331 |
+ |
|
332 |
+} # extractNativeNames |
|
333 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
334 |
+# 'standard' is usually entrez gene ID. for yeast it is orf. for fly ... |
|
335 |
+uniprotToStandardID = function (organism, uniprot.ids) |
|
336 |
+{ |
|
337 |
+# uniprot.ids = unique (uniprot.ids) |
|
338 |
+ |
|
339 |
+ if (!exists ('lst.yeast.uniprot')) { |
|
340 |
+ tbl.tmp = toTable (org.Sc.sgdUNIPROT) |
|
341 |
+ yeast.uniprot <<- tbl.tmp$systematic_name |
|
342 |
+ names (yeast.uniprot) <<- tbl.tmp$uniprot_id |
|
343 |
+ } |
|
344 |
+ |
|
345 |
+ if (!exists ('mouse.uniprot')) { |
|
346 |
+ tbl.tmp = toTable (org.Mm.egUNIPROT) |
|
347 |
+ mouse.uniprot <<- tbl.tmp$gene_id |
|
348 |
+ names (mouse.uniprot) <<- tbl.tmp$uniprot_id |
|
349 |
+ } |
|
350 |
+ |
|
351 |
+ if (!exists ('human.uniprot')) { |
|
352 |
+ tbl.tmp = toTable (org.Hs.egUNIPROT) |
|
353 |
+ human.uniprot <<- tbl.tmp$gene_id |
|
354 |
+ names (human.uniprot) <<- tbl.tmp$uniprot_id |
|
355 |
+ } |
|
356 |
+ |
|
357 |
+ if (!exists ('worm.uniprot')) { |
|
358 |
+ tbl.tmp = toTable (org.Ce.egUNIPROT) |
|
359 |
+ worm.uniprot <<- tbl.tmp$gene_id |
|
360 |
+ names (worm.uniprot) <<- tbl.tmp$uniprot_id |
|
361 |
+ } |
|
362 |
+ |
|
363 |
+ organism = unique (organism) |
|
364 |
+ stopifnot (length (unique (organism)) == 1) |
|
365 |
+ stopifnot (organism %in% (c ('human', 'mouse', 'yeast', 'worm'))) |
|
366 |
+ |
|
367 |
+ if (organism == 'human') { |
|
368 |
+ ids = human.uniprot [uniprot.ids] |
|
369 |
+ } |
|
370 |
+ else if (organism == 'mouse') { |
|
371 |
+ ids = mouse.uniprot [uniprot.ids] |
|
372 |
+ } |
|
373 |
+ else if (organism == 'yeast') { |
|
374 |
+ ids = yeast.uniprot [uniprot.ids] |
|
375 |
+ } |
|
376 |
+ else if (organism == 'worm') { |
|
377 |
+ ids = worm.uniprot [uniprot.ids] |
|
378 |
+ } |
|
379 |
+ |
|
380 |
+ |
|
381 |
+ # embarrassing but true: could not get this to work by creating a list directly |
|
382 |
+ # round-about solution: make a 1-column data.frame, then construct a named list from that |
|
383 |
+ df = data.frame (uniprot=uniprot.ids, std=as.character (ids), stringsAsFactors=FALSE) |
|
384 |
+ #rownames (df) = uniprot.ids |
|
385 |
+ #result = df$stdID |
|
386 |
+ #names (result) = uniprot.ids |
|
387 |
+ return (df) |
|
388 |
+ |
|
389 |
+} # uniprotToStandardID |
|
390 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
391 |
+# see ~/v/snotes/log "* load uniprobe sql dump file (11 may 2012)" for info on the uniprobe mysql database |
|
392 |
+# used here. |
|
393 |
+createGeneRefTable = function () |
|
394 |
+{ |
|
395 |
+ if (!exists ('db')) |
|
396 |
+ db <<- dbConnect (MySQL (), dbname='uniprobe') |
|
397 |
+ |
|
398 |
+ tbl.pubmed = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'), |
|
399 |
+ pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'), |
|
400 |
+ stringsAsFactors=FALSE) |
|
401 |
+ #CR09 1 Scharer 19147588 human Genome-wide promoter analysis of the SOX4 transcriptional |
|
402 |
+ # network in prostate cancer cells |
|
403 |
+ #Cell08 168 Berger 18585359 mouse Variation in homeodomain DNA binding revealed by high-resolution |
|
404 |
+ # analysis of sequence preferences |
|
405 |
+ #Cell09 34 Grove 19632181 worm A Multiparameter Network Reveals Extensive Divergence between |
|
406 |
+ # C. elegans bHLH Transcription Factors |
|
407 |
+ #EMBO10 25 Wei 20517297 mouse Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo |
|
408 |
+ #GD09 1 Lesch 19204119 worm Transcriptional regulation and stabilization of left right neuronal |
|
409 |
+ # identity in C. elegans |
|
410 |
+ #GR09 89 Zhu 19158363 yeast High-resolution DNA-binding specificity analysis of yeast transcription factors |
|
411 |
+ #MMB08 1 Pompeani 18681939 Vibrio The Vibrio harveyi master quorum-sensing regulator, LuxR... |
|
412 |
+ #NBT06 5 Berger 16998473 yeast;human;mouse Compact, universal DNA microarrays... |
|
413 |
+ #PNAS08 3 De Silva 18541913 apicomplexa Specific DNA-binding by Apicomplexan AP2 transcription factors |
|
414 |
+ #SCI09 104 Badis 19443739 mouse Diversity and complexity in DNA recognition by transcription factors |
|
415 |
+ |
|
416 |
+ |
|
417 |
+ tbl.gene = dbGetQuery (db, 'select * from gene_ids_public') |
|
418 |
+ colnames (tbl.gene) = c ('id', 'name', 'species', 'pub', 'type') |
|
419 |
+ tbl.genomic = dbGetQuery (db, 'select * from genomic_info') [, -c(2,3,8:11)] # remove the longish 'description' field |
|
420 |
+ tbl.pub = dbGetQuery (db, 'select * from publication_ids') [,-3] # remove 'full_ref' for legibility |
|
421 |
+ tbl = merge (tbl.gene, tbl.pub [, c (1,4)], by.x='pub', by.y='publication_id', all.x=TRUE) |
|
422 |
+ tbl = merge (tbl, tbl.pubmed, by.x='folder_name', by.y='folder', all.x=TRUE) |
|
423 |
+ tbl = merge (tbl, tbl.genomic, all.x=TRUE, by.x='name', by.y='gene_name') |
|
424 |
+ redundant.column = grep ('species.y', colnames (tbl)) |
|
425 |
+ stopifnot (length (redundant.column) == 1) |
|
426 |
+ tbl = tbl [, -redundant.column] |
|
427 |
+ column.to.rename = grep ('species.x', colnames (tbl)) |
|
428 |
+ stopifnot (length (column.to.rename) == 1) |
|
429 |
+ colnames (tbl) [column.to.rename] = 'species' |
|
430 |
+ |
|
431 |
+ |
|
432 |
+ # now add 'standard IDs' -- orf names for yeast, entrez geneIDs for everything else |
|
433 |
+ |
|
434 |
+ stdID = getAllStandardIDs (tbl) |
|
435 |
+ tbl = cbind (tbl, stdID) |
|
436 |
+ duplicates = which (duplicated (tbl [, c (1,2)])) |
|
437 |
+ if (length (duplicates) > 0) |
|
438 |
+ tbl = tbl [-duplicates,] |
|
439 |
+ |
|
440 |
+ # fix the organism (species) name, from (e.g.) "Homo sapiens" to "Hsapiens" |
|
441 |
+ tbl$species = standardizeSpeciesNames (tbl$species) |
|
442 |
+ invisible (tbl) |
|
443 |
+ |
|
444 |
+ # now add bindingSequence, from DBDs: gene_name + seq, apparently the binding sequence when known, of 171 of 372 genes |
|
445 |
+ tbl.dbds = dbGetQuery (db, 'select * from DBDs') |
|
446 |
+ dbds.list = tbl.dbds$seq |
|
447 |
+ names (dbds.list) = tbl.dbds$gene_name |
|
448 |
+ bindingSequence = rep (NA_character_, nrow (tbl)) |
|
449 |
+ names (bindingSequence) = tbl$name |
|
450 |
+ shared.names = unique (intersect (tbl.dbds$gene_name, tbl$name)) |
|
451 |
+ bindingSequence [shared.names] = dbds.list [shared.names] |
|
452 |
+ tbl = cbind (tbl, bindingSequence) |
|
453 |
+ invisible (tbl) |
|
454 |
+ |
|
455 |
+} # createGeneRefTable |
|
456 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
457 |
+# make successive species-specific calls to 'uniprotToStandardID', assembling a new column to be added to tbl.geneRef |
|
458 |
+getAllStandardIDs = function (tbl.geneRef) |
|
459 |
+{ |
|
460 |
+ mouse.rows = grep ('Mus musculus', tbl.geneRef$species) |
|
461 |
+ yeast.rows = grep ('Saccharomyces cerevisiae', tbl.geneRef$species) |
|
462 |
+ human.rows = grep ('Homo sapiens', tbl.geneRef$species) |
|
463 |
+ worm.rows = grep ('Caenorhabditis elegans', tbl.geneRef$species) |
|
464 |
+ |
|
465 |
+ stdID = rep (NA_character_, nrow (tbl.geneRef)) |
|
466 |
+ |
|
467 |
+ mouse.uids = tbl.geneRef$uniprot [mouse.rows] |
|
468 |
+ yeast.uids = tbl.geneRef$uniprot [yeast.rows] |
|
469 |
+ human.uids = tbl.geneRef$uniprot [human.rows] |
|
470 |
+ worm.uids = tbl.geneRef$uniprot [worm.rows] |
|
471 |
+ |
|
472 |
+ # add mouse geneIDs |
|
473 |
+ tbl.mouse = uniprotToStandardID ('mouse', mouse.uids) |
|
474 |
+ stopifnot (length (mouse.rows) == nrow (tbl.mouse)) |
|
475 |
+ stdID [mouse.rows] = tbl.mouse$std |
|
476 |
+ |
|
477 |
+ # add yeast orfs |
|
478 |
+ tbl.yeast = uniprotToStandardID ('yeast', yeast.uids) |
|
479 |
+ stopifnot (length (yeast.rows) == nrow (tbl.yeast)) |
|
480 |
+ stdID [yeast.rows] = tbl.yeast$std |
|
481 |
+ |
|
482 |
+ # add human geneIDs |
|
483 |
+ tbl.human = uniprotToStandardID ('human', human.uids) |
|
484 |
+ stopifnot (length (human.rows) == nrow (tbl.human)) |
|
485 |
+ stdID [human.rows] = tbl.human$std |
|
486 |
+ |
|
487 |
+ # add worm geneIDs |
|
488 |
+ tbl.worm = uniprotToStandardID ('worm', worm.uids) |
|
489 |
+ stopifnot (length (worm.rows) == nrow (tbl.worm)) |
|
490 |
+ stdID [worm.rows] = tbl.worm$std |
|
491 |
+ |
|
492 |
+ invisible (stdID) |
|
493 |
+ |
|
494 |
+} # getAllStandardIDs |
|
495 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
496 |
+# change, e.g., "Homo sapiens" to "Hsapiens" |
|
497 |
+standardizeSpeciesNames = function (names) |
|
498 |
+{ |
|
499 |
+ fix = function (name) { |
|
500 |
+ tokens = strsplit (name, ' ')[[1]] |
|
501 |
+ stopifnot (length (tokens) == 2) |
|
502 |
+ genus = tokens [1] |
|
503 |
+ species = tokens [2] |
|
504 |
+ return (paste (substr (genus, 1, 1), species, sep='')) |
|
505 |
+ } |
|
506 |
+ |
|
507 |
+ fixed.names = as.character (sapply (names, fix)) |
|
508 |
+ invisible (fixed.names) |
|
509 |
+ |
|
510 |
+} # standardizeSpeciesNames |
|
511 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
512 |
+renameMatrices = function (matrices, tbl.md) |
|
513 |
+{ |
|
514 |
+ stopifnot (length (matrices) == nrow (tbl.md)) |
|
515 |
+ names (matrices) = rownames (tbl.md) |
|
516 |
+ invisible (matrices) |
|
517 |
+ |
|
518 |
+} # renameMatrices |
|
519 |
+#------------------------------------------------------------------------------------------------------------------------ |
0 | 520 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,446 @@ |
1 |
+# uniprobe/test.R |
|
2 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
3 |
+library(RUnit) |
|
4 |
+source("import.R") |
|
5 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
6 |
+run.tests = function (dataDir=kDataDir) |
|
7 |
+{ |
|
8 |
+ txxa <<- test.createMatrixNameUniqifier () # in ../common.R |
|
9 |
+ txx1 <<- test.extractNativeNames (dataDir) |
|
10 |
+ txx2 <<- test.createPublicationRefTable () |
|
11 |
+ txxb <<- test.standardizeSpeciesNames () |
|
12 |
+ txx4 <<- test.createGeneRefTable () # read mysql database to get, uniprobe-style, pwm-specific metadata |
|
13 |
+ txx3 <<- test.uniprotToStandardID () |
|
14 |
+ txx5 <<- test.getAllStandardIDs () # build a 'stdID' column to add to tbl.geneRef |
|
15 |
+ txx5a <<- test.parsePWMfromText (dataDir) |
|
16 |
+ |
|
17 |
+ |
|
18 |
+ |
|
19 |
+ txx6 <<- test.extractPWMfromFile (dataDir) |
|
20 |
+ txx7 <<- test.translateFileNameToGeneName () |
|
21 |
+ txx8 <<- test.readAndParse (dataDir) |
|
22 |
+ txx9 <<- test.createMetadata (dataDir) |
|
23 |
+ txxb <<- test.createMetadata.sox4 (dataDir) |
|
24 |
+ txxb <<- test.createMetadata.cbf1 (dataDir) |
|
25 |
+ txxba <<- test.createMetadata.fixTrailingSpaceInBindingDomain (dataDir) |
|
26 |
+ txxc <<- test.renameMatrices () |
|
27 |
+ txxd <<- test.emptyStringProteinId (dataDir) |
|
28 |
+ |
|
29 |
+} # run.tests |
|
30 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
31 |
+# what tables? what columns in each? |
|
32 |
+exploreDatabase = function () |
|
33 |
+{ |
|
34 |
+ if (!exists ('db')) |
|
35 |
+ db <<- dbConnect (MySQL (), dbname='uniprobe') |
|
36 |
+ tables = dbListTables (db) |
|
37 |
+ excluders = grep ('mers_', tables) |
|
38 |
+ if (length (excluders) > 0) |
|
39 |
+ tables = tables [-excluders] |
|
40 |
+ for (table in tables) { |
|
41 |
+ fields = dbListFields (db, table) |
|
42 |
+ printf ('--- %s: %d', table, length (fields)) |
|
43 |
+ printf (' %s', list.to.string (fields)) |
|
44 |
+ } # for table |
|
45 |
+ |
|
46 |
+ |
|
47 |
+} # exploreDatabase |
|
48 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
49 |
+test.parsePWMfromText = function (dataDir) |
|
50 |
+{ |
|
51 |
+ print ('--- test.parsePWMfromText') |
|
52 |
+ |
|
53 |
+ sample.file = file.path(dataDir, 'All_PWMs/Cell08/Phox2a_3947.1_pwm.txt') |
|
54 |
+ lines.of.text = scan (sample.file, what=character(0), sep='\n', skip=2, quiet=TRUE) |
|
55 |
+ yy <<- lines.of.text |
|
56 |
+ checkEquals (grep ('^A:', lines.of.text), 1) |
|
57 |
+ pwm.phox2a = parsePWMfromText (lines.of.text) |
|
58 |
+ checkTrue (is.matrix (pwm.phox2a)) |
|
59 |
+ checkEquals (dim (pwm.phox2a), c (4, 16)) |
|
60 |
+ xx <<- pwm.phox2a |
|
61 |
+ for (c in 1:ncol (pwm.phox2a)) { |
|
62 |
+ checkEqualsNumeric (sum (pwm.phox2a [, c]), 1.0, tol=0.01) |
|
63 |
+ } # for c |
|
64 |
+ |
|
65 |
+ invisible (pwm.phox2a) |
|
66 |
+ |
|
67 |
+} # test.parsePWMfromText |
|
68 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
69 |
+test.extractPWMfromFile = function (dataDir) |
|
70 |
+{ |
|
71 |
+ print ('--- test.extractPWMfromFile') |
|
72 |
+ |
|
73 |
+ # a well-behaved file, with a single unaccompanied tab separating columns |
|
74 |
+ file = file.path(dataDir, 'All_PWMs/SCI09/Arid3a_pwm_primary.txt') |
|
75 |
+ checkTrue(file.exists(file)) |
|
76 |
+ |
|
77 |
+ arid2a.matrix <<- extractPWMfromFile (file) |
|
78 |
+ checkEquals (rownames (arid2a.matrix), c ('A','C', 'G', 'T')) |
|
79 |
+ checkTrue (all (as.integer (round (as.double (colSums (arid2a.matrix)))) == 1)) |
|
80 |
+ |
|
81 |
+ # a file with a confusion of white spaces |
|
82 |
+ # "A:\t 0.247560633305795 0.264091385393575\t\t 0.204731944407109 .... |
|
83 |
+ # depends on perl=T whitespace+ regex in parsePWMfromText |
|
84 |
+ |
|
85 |
+ file = file.path(dataDir, 'All_PWMs/GD09/Nsy-7.pwm') |
|
86 |
+ nsy7.matrix <<- extractPWMfromFile (file) |
|
87 |
+ checkEquals (rownames (nsy7.matrix), c ('A','C', 'G', 'T')) |
|
88 |
+ checkTrue (all (as.integer (round (as.double (colSums (nsy7.matrix)))) == 1)) |
|
89 |
+ |
|
90 |
+} # test.extractPWMfromFile |
|
91 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
92 |
+test.createPublicationRefTable = function () |
|
93 |
+{ |
|
94 |
+ print ('--- test.createPublicationRefTable') |
|
95 |
+ tbl.ref = createPublicationRefTable () |
|
96 |
+ checkEquals (dim (tbl.ref), c (10, 6)) |
|
97 |
+ checkEquals (colnames (tbl.ref), c ('folder', 'author', 'pmid', 'organism', 'count', 'title')) |
|
98 |
+ |
|
99 |
+ # make sure neither author nor pmid repeat |
|
100 |
+ #checkEquals (length (unique (tbl.ref$author)), nrow (tbl.ref)) |
|
101 |
+ #checkEquals (length (unique (tbl.ref$pmid)), nrow (tbl.ref)) |
|
102 |
+ |
|
103 |
+ invisible (tbl.ref) |
|
104 |
+ |
|
105 |
+} # test.createPublicationRefTable |
|
106 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
107 |
+test.identifyFiles = function () |
|
108 |
+{ |
|
109 |
+ print ('--- test.identifyFiles') |
|
110 |
+ |
|
111 |
+ # just one legit pwm file in GD09: |
|
112 |
+ |
|
113 |
+ files.found = identifyFiles ('All_PWMs/GD09') |
|
114 |
+ checkEquals (length (files.found), 1) |
|
115 |
+ |
|
116 |
+ |
|
117 |
+ files.found = identifyFiles ('.') |
|
118 |
+ |
|
119 |
+ checkTrue (length (files.found) > 375) |
|
120 |
+ |
|
121 |
+ invisible (files.found) |
|
122 |
+ |
|
123 |
+ |
|
124 |
+} # test.identifyFiles |
|
125 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
126 |
+test.readAndParse = function (dataDir) |
|
127 |
+{ |
|
128 |
+ print ('--- test.readAndParse') |
|
129 |
+ all.files = identifyFiles (file.path(dataDir, 'All_PWMs')) |
|
130 |
+ test.file = grep ('GD09', all.files, v=TRUE) |
|
131 |
+ checkEquals (length (test.file), 1) |
|
132 |
+ mtx = readAndParse (test.file) |
|
133 |
+ checkEquals (length (mtx), 1) |
|
134 |
+ checkTrue (is.matrix (mtx [[1]])) |
|
135 |
+ |
|
136 |
+ random.sample <- c (98, 99, 251) |
|
137 |
+ test.files.3 = all.files [random.sample] |
|
138 |
+ mtx3 = readAndParse (test.files.3) |
|
139 |
+ checkEquals (length (mtx3), 3) |
|
140 |
+ expected.matrix.names <- c("PNAS08/PF14_0633.pwm", "PNAS08/PFF0200c.pwm", |
|
141 |
+ "Cell08/Pou4f3_2791.1_pwm.txt") |
|
142 |
+ checkEquals(names(mtx3), expected.matrix.names) |
|
143 |
+ |
|
144 |
+ invisible (mtx3) |
|
145 |
+ |
|
146 |
+} # test.readAndParse |
|
147 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
148 |
+test.translateFileNameToGeneName = function () |
|
149 |
+{ |
|
150 |
+ print ('--- test.translateFileNameToGeneName') |
|
151 |
+ checkEquals (translateFileNameToGeneName ('./All_PWMs/GD09/Nsy-7.pwm'), 'Nsy-7') |
|
152 |
+ checkEquals (translateFileNameToGeneName ('Nsy-7.pwm'), 'Nsy-7') |
|
153 |
+ checkEquals (translateFileNameToGeneName ('Nsy-7'), 'Nsy-7') |
|
154 |
+ |
|
155 |
+} # test.translateFileNameToGeneName |
|
156 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
157 |
+test.createMetadata = function (dataDir) |
|
158 |
+{ |
|
159 |
+ print ('--- test.createMetadata') |
|
160 |
+ all.files = identifyFiles (file.path(dataDir,'All_PWMs')) |
|
161 |
+ |
|
162 |
+ sample.files = all.files [c (12, 36, 259, 269, 309)] # sample (1:length (all.files), 5)] |
|
163 |
+ # add in all entries from the NBT paper, which are a mix of organisms |
|
164 |
+ # Scerevisiae;Hsapiens;Mmusculus. want to make sure that organism is drawn from tbl.geneRef, rather than |
|
165 |
+ # the old approach, which learned matrix/gene/organism relationships from tbl.pubRef |
|
166 |
+ sample.files = c (sample.files, grep ('NBT', all.files, v=T)) |
|
167 |
+ |
|
168 |
+ matrices.10 = readAndParse (sample.files) |
|
169 |
+ tbl.pubRef = createPublicationRefTable () |
|
170 |
+ tbl.geneRef = createGeneRefTable () |
|
171 |
+ tbl.md = createMetadata (matrices.10, tbl.pubRef, tbl.geneRef) |
|
172 |
+ |
|
173 |
+ checkEquals (dim (tbl.md), c (10, 15)) |
|
174 |
+ checkEquals (tbl.md$geneId, c ("YPR104C", "YBR089C-A", "194738", "21410", "54131", "YJR060W", "179485", "5451", "YNL216W", "13653")) |
|
175 |
+ |
|
176 |
+ checkEquals (tbl.md$geneIdType, c ("SGD", "SGD", "ENTREZ", "ENTREZ", "ENTREZ", "SGD", "ENTREZ", "ENTREZ", "SGD", "ENTREZ")) |
|
177 |
+ |
|
178 |
+ checkEquals (tbl.md$organism, c ("Scerevisiae", "Scerevisiae", "Mmusculus", "Mmusculus", "Mmusculus", "Scerevisiae", |
|
179 |
+ "Celegans", "Hsapiens", "Scerevisiae", "Mmusculus")) |
|
180 |
+ checkEquals (tbl.md$geneSymbol, c ("Fhl1", "Nhp6b", "Rhox11", "Tcf2", "Irf3", "Cbf1", "Ceh-22", "Oct-1", "Rap1", "Zif268")) |
|
181 |
+ |
|
182 |
+ checkEquals (tbl.md$proteinId, c ("P39521", "P11633", "Q810N8", "P27889", "P70671", "P17106", "P41936", "P14859", "P11938", "P08046")) |
|
183 |
+ checkEquals (tbl.md$proteinIdType, c ("UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT", "UNIPROT")) |
|
184 |
+ checkEquals (tbl.md$bindingDomain, c ("Fork-head", "HMG_box", "Homeo", "Homeo", "IRF", "HLH", "Homeo", "POU", "Myb", "ZnF_C2H2")) |
|
185 |
+ checkEquals (tbl.md$bindingSequence, c (NA_character_, NA_character_, |
|
186 |
+ 'PRKAYRFTPGQLWELQAVFVENQYPDALKRKELAGLLNVDEQKIKDWFNNKRAKYRK', |
|
187 |
+ 'RRNRFKWGPASQQILYQAYDRQKNPSKEEREALVEECNRAECLQRGVSPSKAHGLGSNLVTEVRVYNWFANRRKEEAF', |
|
188 |
+ NA_character_, |
|
189 |
+ NA_character_, |
|
190 |
+ NA_character_, |
|
191 |
+ NA_character_, |
|
192 |
+ NA_character_, |
|
193 |
+ NA_character_)) |
|
194 |
+ |
|
195 |
+ |
|
196 |
+ checkEquals (length (which (duplicated (tbl.md$providerName))), 0) |
|
197 |
+ |
|
198 |
+ full.names = rownames (tbl.md) |
|
199 |
+ checkEquals (length (grep ('.UP0', full.names)), nrow (tbl.md)) |
|
200 |
+ checkEquals (length (grep ('-UniPROBE', full.names)), nrow (tbl.md)) |
|
201 |
+ |
|
202 |
+ # should be 10 full.names, each of 3 tokens: "Scerevisiae-UniPROBE-Fhl1.UP00303" |
|
203 |
+ tokens = strsplit (full.names, '-') |
|
204 |
+ checkEquals (length (tokens), nrow (tbl.md)) |
|
205 |
+ all (sapply (tokens, function (sub.tokens) checkEquals (length (sub.tokens), 3))) |
|
206 |
+ |
|
207 |
+ invisible (tbl.md) |
|
208 |
+ |
|
209 |
+} # test.createMetadata |
|
210 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
211 |
+# uniprobe reports yeast Cbf1 from two studies. one is from NBT06 which is a multiple species pwm collection. this |
|
212 |
+# will be handled properly in the future (TODO) but not yet. the more common UniPROBE practive of one-organism/one-paper |
|
213 |
+# is used throughout to assign organism to pwm. |
|
214 |
+# createMetadata can return too many rows in some cases, mostly (always) |
|
215 |
+# |
|
216 |
+test.createMetadata.cbf1 = function (dataDir) |
|
217 |
+{ |
|
218 |
+ print ('--- test.createMetadata.cbf1') |
|
219 |
+ all.files = identifyFiles (file.path(dataDir, 'All_PWMs')) |
|
220 |
+ cbf1.files = all.files [grep ('cbf1', all.files, ignore.case=T)] |
|
221 |
+ stopifnot (length (cbf1.files) == 2) # one in GR09, one in NBT06 |
|
222 |
+ |
|
223 |
+ matrices.cbf1 = readAndParse (cbf1.files) |
|
224 |
+ tbl.pubRef = createPublicationRefTable () |
|
225 |
+ tbl.geneRef = createGeneRefTable () |
|
226 |
+ tbl.md = createMetadata (matrices.cbf1, tbl.pubRef, tbl.geneRef) |
|
227 |
+ checkEquals (dim (tbl.md), c (2, 15)) |
|
228 |
+ checkEquals (tbl.md$geneSymbol, c ('Cbf1', 'Cbf1')) |
|
229 |
+ checkEquals (tbl.md$providerName, c ('GR09/Cbf1.pwm', 'NBT06/Cbf1.pwm')) |
|
230 |
+ |
|
231 |
+ invisible (tbl.md) |
|
232 |
+ |
|
233 |
+} # test.createMetadata.cbf1 |
|
234 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
235 |
+# some of the metadata for matrices in Cell08, obtained through sql in createGeneRefTable, ended up with a |
|
236 |
+# trailing blank ('Homeo ') in the bindingDomain field. make sure this is fixed. |
|
237 |
+# currently the fix is to define and use a string 'trim' method in createMetaData () |
|
238 |
+test.createMetadata.fixTrailingSpaceInBindingDomain = function (dataDir) |
|
239 |
+{ |
|
240 |
+ print ('--- test.createMetadata.fixTrailingSpaceInBindingDomain') |
|
241 |
+ all.files = identifyFiles (file.path(dataDir,'All_PWMs')) |
|
242 |
+ sample.trailingSpaceCulpritFiles = head (grep ('Irx6', all.files)) |
|
243 |
+ stopifnot (length (sample.trailingSpaceCulpritFiles) == 1) |
|
244 |
+ filenames = all.files [sample.trailingSpaceCulpritFiles] |
|
245 |
+ matrices.tmp = readAndParse (filenames) |
|
246 |
+ tbl.pubRef = createPublicationRefTable () |
|
247 |
+ tbl.geneRef = createGeneRefTable () |
|
248 |
+ # the fix is in here. |
|
249 |
+ tbl.md = createMetadata (matrices.tmp, tbl.pubRef, tbl.geneRef) |
|
250 |
+ checkEquals (dim (tbl.md), c (1, 15)) |
|
251 |
+ checkEquals (tbl.md$bindingDomain, "Homeo") |
|
252 |
+ |
|
253 |
+} # test.createMetadata.fixTrailingSpaceInBindingDomain |
|
254 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
255 |
+# uniprobe reports sox4 in both mouse and human. these different organisms must be recognized for this to work. |
|
256 |
+# it used to fail when organism was variously (e.g.) 'Homo sapiens' and 'Hsapiens'. |
|
257 |
+# the function standardizeSpeciesNames was added to fix this, converting everything in tbl.geneRef$species |
|
258 |
+# |
|
259 |
+test.createMetadata.sox4 = function (dataDir) |
|
260 |
+{ |
|
261 |
+ print ('--- test.createMetadata.sox4') |
|
262 |
+ all.files = identifyFiles (file.path(dataDir, 'All_PWMs')) |
|
263 |
+ sox4.files = all.files [grep ('sox4', all.files, ignore.case=T)] |
|
264 |
+ stopifnot (length (sox4.files) == 2) # one in CR09, one in SCI09 |
|
265 |
+ |
|
266 |
+ matrices.sox4 = readAndParse (sox4.files) |
|
267 |
+ tbl.pubRef = createPublicationRefTable () |
|
268 |
+ tbl.geneRef = createGeneRefTable () |
|
269 |
+ tbl.md = createMetadata (matrices.sox4, tbl.pubRef, tbl.geneRef) |
|
270 |
+ checkEquals (dim (tbl.md), c (2, 15)) |
|
271 |
+ |
|
272 |
+} # test.createMetadata.sox4 |
|
273 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
274 |
+test.extractNativeNames = function (dataDir) |
|
275 |
+{ |
|
276 |
+ print ('--- test.extractNativeNames') |
|
277 |
+ |
|
278 |
+ # check a random sample, extract called one raw name at at time |
|
279 |
+ |
|
280 |
+ checkEquals (extractNativeNames ("Cell08/Lhx8_2247.2_pwm.txt"), 'Lhx8') |
|
281 |
+ checkEquals (extractNativeNames ("Cell08/Phox2a_3947.1_pwm.txt"), 'Phox2a') |
|
282 |
+ |
|
283 |
+ checkEquals (extractNativeNames ("GR09/Srd1.pwm"), 'Srd1') |
|
284 |
+ checkEquals (extractNativeNames ("Cell08/Pou3f3_3235.2_pwm.txt"), 'Pou3f3') |
|
285 |
+ checkEquals (extractNativeNames ("Cell08/Msx3_3206.1_pwm.txt"), 'Msx3') |
|
286 |
+ |
|
287 |
+ # now do the sample again in one call |
|
288 |
+ |
|
289 |
+ names.5 = extractNativeNames (c ("Cell08/Lhx8_2247.2_pwm.txt", |
|
290 |
+ "Cell08/Phox2a_3947.1_pwm.txt", |
|
291 |
+ "GR09/Srd1.pwm", |
|
292 |
+ "Cell08/Pou3f3_3235.2_pwm.txt", |
|
293 |
+ "Cell08/Msx3_3206.1_pwm.txt")) |
|
294 |
+ checkEquals (names.5, c ("Lhx8", "Phox2a", "Srd1", "Pou3f3", "Msx3")) |
|
295 |
+ |
|
296 |
+ all.files = identifyFiles (file.path(dataDir,'All_PWMs')) |
|
297 |
+ matrices.all = readAndParse (all.files) |
|
298 |
+ all.raw.names = names (matrices.all) |
|
299 |
+ cooked.names.all = c () |
|
300 |
+ |
|
301 |
+ for (raw.name in all.raw.names) { |
|
302 |
+ cooked.name = extractNativeNames (raw.name) |
|
303 |
+ cooked.names.all = c (cooked.names.all, cooked.name) |
|
304 |
+ #printf ('%30s: %10s', raw.name, cooked.name) |
|
305 |
+ } # for raw.name |
|
306 |
+ |
|
307 |
+ cooked.names.all.v2 = extractNativeNames (all.raw.names) |
|
308 |
+ checkEquals (cooked.names.all, cooked.names.all.v2) |
|
309 |
+ |
|
310 |
+ invisible (cooked.names.all) |
|
311 |
+ |
|
312 |
+} # test.extractNativeNames |
|
313 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
314 |
+test.uniprotToStandardID = function () |
|
315 |
+{ |
|
316 |
+ print ('--- test.uniprotToStandardID') |
|
317 |
+ checkEquals (uniprotToStandardID ('mouse', 'Q14A64')$std, '11298') |
|
318 |
+ checkEquals (uniprotToStandardID ('human', 'P14859')$std, '5451') |
|
319 |
+ checkEquals (uniprotToStandardID ('yeast', 'P17106')$std, "YJR060W") |
|
320 |
+ checkEquals (uniprotToStandardID ('worm', 'P41936')$std, "179485") |
|
321 |
+ |
|
322 |
+ yeast.uids = c ("P22149", "Q04052", "P40467", "P22035", "P17106") |
|
323 |
+ checkEquals (uniprotToStandardID ('yeast', yeast.uids)$std, |
|
324 |
+ c ("YGL071W", "YDR421W", "YIL130W", "YKR099W", "YJR060W")) |
|
325 |
+ |
|
326 |
+ checkTrue (is.na (uniprotToStandardID ('mouse', 'Q8BI45')$std)) |
|
327 |
+ mouse.uids = c ("O70137", "O35137", "Q62431", "Q8BI45", "O35085", "O35885") |
|
328 |
+ result = uniprotToStandardID ('mouse', mouse.uids) |
|
329 |
+ checkEquals (result$std, c ("11694", "11695", "13496", NA, "11878", "17173")) |
|
330 |
+ |
|
331 |
+ worm.uniprots = c ("P41936", "Q4PIU8") |
|
332 |
+ result = uniprotToStandardID ('worm', worm.uniprots) |
|
333 |
+ checkEquals (result$uniprot, worm.uniprots) |
|
334 |
+ checkEquals (result$std, c ('179485', NA)) |
|
335 |
+ |
|
336 |
+ # expose and solve a problem with duplicates |
|
337 |
+ # "P02831" "P97436" "Q9QZ28" "Q66JY7" "P08046" |
|
338 |
+ mouse.uids = c ("P02831", "P02831", '', '') |
|
339 |
+ result = uniprotToStandardID ('mouse', mouse.uids) |
|
340 |
+ checkEquals (result$uniprot, mouse.uids) |
|
341 |
+ checkEquals (result$std, c ('15400', '15400', NA, NA)) |
|
342 |
+ |
|
343 |
+ # mouse Zic[123] maps to geneID 22771-3. but 22772 is not found for Zic2. why? |
|
344 |
+ checkEquals (uniprotToStandardID ('mouse', 'P46684')$std, '22771') |
|
345 |
+ #checkEquals (uniprotToStandardID ('mouse', 'Q62520')$std, '22772') TODO! |
|
346 |
+ checkEquals (uniprotToStandardID ('mouse', 'Q62521')$std, '22773') |
|
347 |
+ |
|
348 |
+} # test.uniprotToStandardID |
|
349 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
350 |
+test.createGeneRefTable = function () |
|
351 |
+{ |
|
352 |
+ print ('--- test.createGeneRefTable') |
|
353 |
+ tbl.geneRef = createGeneRefTable () |
|
354 |
+ checkEquals (length (grep (' ', tbl.geneRef$species)), 0) # no spaces in the species names |
|
355 |
+ checkEquals (ncol (tbl.geneRef), 13) |
|
356 |
+ checkEquals (sort (colnames (tbl.geneRef)), |
|
357 |
+ c ("bindingSequence", "domain", "folder_name", "id", "ihop", "name", "pmid", "pub", "refseq_id", "species", |
|
358 |
+ "stdID", "type", "uniprot")) |
|
359 |
+ checkEquals (nrow (tbl.geneRef), 372) |
|
360 |
+ checkEquals (length (which (duplicated (tbl.geneRef [, 1:2]))), 0) |
|
361 |
+ checkEquals (length (which (!is.na (tbl.geneRef$bindingSequence))), 168) |
|
362 |
+ |
|
363 |
+ invisible (tbl.geneRef) |
|
364 |
+ |
|
365 |
+} # test.createGeneRefTable |
|
366 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
367 |
+test.getAllStandardIDs = function () |
|
368 |
+{ |
|
369 |
+ print ('--- test.getAllStandardIDs') |
|
370 |
+ if (!exists ('tbl.geneRef')) |
|
371 |
+ tbl.geneRef = createGeneRefTable () |
|
372 |
+ |
|
373 |
+ stdID = getAllStandardIDs (tbl.geneRef) |
|
374 |
+ checkEquals (length (stdID), nrow (tbl.geneRef)) |
|
375 |
+ tbl.g2 = cbind (tbl.geneRef, stdID, stringsAsFactors=FALSE) |
|
376 |
+ checkEquals (nrow (tbl.g2), nrow (tbl.geneRef)) |
|
377 |
+ |
|
378 |
+ # every row in the table for a yeast pwm should have stdID of NA or start with Y |
|
379 |
+ should.be.orf.names = subset (tbl.g2, species=='Scerevisiae')$stdID |
|
380 |
+ checkEquals ((length (which (is.na (should.be.orf.names))) + |
|
381 |
+ length (grep ('^Y', should.be.orf.names))), |
|
382 |
+ length (should.be.orf.names)) |
|
383 |
+ # no other rows should have stdID starting with Y |
|
384 |
+ should.not.be.orf.names = subset (tbl.g2, species!='Scerevisiae')$stdID |
|
385 |
+ checkEquals (length (grep ('^Y', should.not.be.orf.names)), 0) |
|
386 |
+ |
|
387 |
+ checkTrue (length (which (is.na (tbl.g2$stdID))) < 70) |
|
388 |
+ |
|
389 |
+ # now handcheck a few from each species |
|
390 |
+ checkEquals (subset (tbl.g2, name=='Cbf1')$stdID, c ("YJR060W", "YJR060W")) |
|
391 |
+ checkEquals (subset (tbl.g2, name=='Yox1')$stdID, "YML027W") |
|
392 |
+ |
|
393 |
+ invisible (tbl.g2) |
|
394 |
+ |
|
395 |
+} # test.getAllStandardIDs |
|
396 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
397 |
+test.standardizeSpeciesNames = function () |
|
398 |
+{ |
|
399 |
+ print ('--- test.standardizeSpeciesNames') |
|
400 |
+ old.style = c ("Saccharomyces cerevisiae", "Mus musculus", "Caenorhabditis elegans", "Cryptosporidium parvum", |
|
401 |
+ "Vibrio harveyi", "Homo sapiens", "Plasmodium falciparum") |
|
402 |
+ new.style = standardizeSpeciesNames (old.style [1]) |
|
403 |
+ checkEquals (new.style, 'Scerevisiae') |
|
404 |
+ |
|
405 |
+ new.style = standardizeSpeciesNames (old.style) |
|
406 |
+ checkEquals (new.style, c ("Scerevisiae", "Mmusculus", "Celegans", "Cparvum", "Vharveyi", "Hsapiens", "Pfalciparum")) |
|
407 |
+ |
|
408 |
+} # test.standardizeSpeciesNames |
|
409 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
410 |
+test.renameMatrices = function (matrices, tbl.md, tbl.anno) |
|
411 |
+{ |
|
412 |
+ print ('--- test.renameMatrices') |
|
413 |
+ |
|
414 |
+} # test.renameMatrices |
|
415 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
416 |
+# Scerevisiae-UniPROBE-Gsm1.UP00301, GR09/Gsm1.pwm, UP00301, Gsm1, NA, NA, "", UNIPROT, NA, Scerevisiae, Zn2Cys67 |
|
417 |
+test.emptyStringProteinId = function (dataDir) |
|
418 |
+{ |
|
419 |
+ print ('--- test.emptyStringProteinId') |
|
420 |
+ all.files = identifyFiles (file.path(dataDir, 'All_PWMs')) |
|
421 |
+ filename.gsm1 = grep ('Gsm1', all.files, v=T) |
|
422 |
+ mtx = readAndParse (filename.gsm1) |
|
423 |
+ #browser (text='test.emptyStringProteinId') |
|
424 |
+ tbl.pubRef = createPublicationRefTable () |
|
425 |
+ tbl.geneRef = createGeneRefTable () |
|
426 |
+ tbl.md = createMetadata (mtx, tbl.pubRef, tbl.geneRef) |
|
427 |
+ checkEquals (nrow (tbl.md), 1) |
|
428 |
+ md.list = as.list (tbl.md) |
|
429 |
+ checkEquals (md.list$proteinId, 'NP_012432') |
|
430 |
+ checkEquals (md.list$proteinIdType, 'REFSEQ') |
|
431 |
+ |
|
432 |
+ |
|
433 |
+} # test.emptyStringProteinId |
|
434 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
435 |
+test.createMatrixNameUniqifier = function () |
|
436 |
+{ |
|
437 |
+ print ('--- test.createMatrixNameUniqifier') |
|
438 |
+ |
|
439 |
+ data = c (8,8,8,7,2,2,5,6,5,3,10,5,8,6,9,5,8,8,4,5,8,6,9,2,1,0,5,7,7,2,4,4,3,7,7,9,9,6,1,3) |
|
440 |
+ test.matrix = matrix (data=data, nrow=4, ncol=10) |
|
441 |
+ uniqifier = createMatrixNameUniqifier (test.matrix) |
|
442 |
+ xxx <<- uniqifier |
|
443 |
+ checkEquals (uniqifier, "b42f") |
|
444 |
+ |
|
445 |
+} # test.createMatrixNameUniqifier |
|
446 |
+#------------------------------------------------------------------------------------------------------------------------ |