git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@86070 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -185,7 +185,7 @@ createMetadataTable = function (matrices, novels) |
185 | 185 |
bindingDomain=NA, |
186 | 186 |
tfFamily=NA, |
187 | 187 |
experimentType='digital genomic footprinting', |
188 |
- pubmedID='22959076') |
|
188 |
+ pubmedID="22955618") |
|
189 | 189 |
tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE)) |
190 | 190 |
full.name = sprintf ('%s-%s-%s', 'Hsapiens', 'stamlab', matrix.id) |
191 | 191 |
rownames (tbl.md) [nrow (tbl.md)] = full.name |
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@86014 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -195,7 +195,7 @@ createMetadataTable = function (matrices, novels) |
195 | 195 |
novels.ordered = novels [tbl.md$providerName] # make sure we follow the order in the tbl |
196 | 196 |
novelPFM [which (novels.ordered)] = 'novelMotif' |
197 | 197 |
tbl.md$geneId = novelPFM |
198 |
- tbl.md$geneIdType = rep ('comment', nrow (tbl.md)) |
|
198 |
+ tbl.md$geneIdType = rep (NA_character_, nrow (tbl.md)) |
|
199 | 199 |
|
200 | 200 |
invisible (tbl.md) |
201 | 201 |
|
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75382 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -5,10 +5,7 @@ library (org.Mm.eg.db) |
5 | 5 |
#------------------------------------------------------------------------------------------------------------------------ |
6 | 6 |
printf <- function(...) print(noquote(sprintf(...))) |
7 | 7 |
#------------------------------------------------------------------------------------------------------------------------ |
8 |
-kDataDir <- "~/s/data/public/TFBS" |
|
9 |
-kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb" |
|
10 |
-#------------------------------------------------------------------------------------------------------------------------ |
|
11 |
-run = function (dataDir=kDataDir) |
|
8 |
+run = function (dataDir) |
|
12 | 9 |
{ |
13 | 10 |
dataDir <- file.path(dataDir, "stamlab") |
14 | 11 |
rawMatrixList <- readRawMatrices (dataDir) |
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75368 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -5,11 +5,12 @@ library (org.Mm.eg.db) |
5 | 5 |
#------------------------------------------------------------------------------------------------------------------------ |
6 | 6 |
printf <- function(...) print(noquote(sprintf(...))) |
7 | 7 |
#------------------------------------------------------------------------------------------------------------------------ |
8 |
-kDataDir <- "~/s/data/public/TFBS/stam" |
|
9 |
-kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab" |
|
8 |
+kDataDir <- "~/s/data/public/TFBS" |
|
9 |
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb" |
|
10 | 10 |
#------------------------------------------------------------------------------------------------------------------------ |
11 | 11 |
run = function (dataDir=kDataDir) |
12 | 12 |
{ |
13 |
+ dataDir <- file.path(dataDir, "stamlab") |
|
13 | 14 |
rawMatrixList <- readRawMatrices (dataDir) |
14 | 15 |
novels <- readNovelStatus (dataDir) |
15 | 16 |
matrices <- extractAndNormalizeMatrices (rawMatrixList) |
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75351 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -5,8 +5,8 @@ library (org.Mm.eg.db) |
5 | 5 |
#------------------------------------------------------------------------------------------------------------------------ |
6 | 6 |
printf <- function(...) print(noquote(sprintf(...))) |
7 | 7 |
#------------------------------------------------------------------------------------------------------------------------ |
8 |
-kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab" |
|
9 | 8 |
kDataDir <- "~/s/data/public/TFBS/stam" |
9 |
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab" |
|
10 | 10 |
#------------------------------------------------------------------------------------------------------------------------ |
11 | 11 |
run = function (dataDir=kDataDir) |
12 | 12 |
{ |
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75345 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,349 @@ |
1 |
+# stamlab/import.R |
|
2 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
3 |
+library (org.Hs.eg.db) |
|
4 |
+library (org.Mm.eg.db) |
|
5 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
6 |
+printf <- function(...) print(noquote(sprintf(...))) |
|
7 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
8 |
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab" |
|
9 |
+kDataDir <- "~/s/data/public/TFBS/stam" |
|
10 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
11 |
+run = function (dataDir=kDataDir) |
|
12 |
+{ |
|
13 |
+ rawMatrixList <- readRawMatrices (dataDir) |
|
14 |
+ novels <- readNovelStatus (dataDir) |
|
15 |
+ matrices <- extractAndNormalizeMatrices (rawMatrixList) |
|
16 |
+ tbl.md <- createMetadataTable (matrices, novels) |
|
17 |
+ matrices <- renameMatrices (matrices, tbl.md) |
|
18 |
+ |
|
19 |
+ serializedFile <- "stamlab.RData" |
|
20 |
+ save (matrices, tbl.md, file=serializedFile) |
|
21 |
+ printf("saved %d matrices to %s", length(matrices), serializedFile) |
|
22 |
+ printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
23 |
+ |
|
24 |
+} # run |
|
25 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
26 |
+readRawMatrices = function (dataDir) |
|
27 |
+{ |
|
28 |
+ filename <- file.path(dataDir, "de.novo.pwm") |
|
29 |
+ all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE) |
|
30 |
+ title.lines = grep ('UW.Motif', all.lines) |
|
31 |
+ title.line.count <<- length (title.lines) |
|
32 |
+ max = title.line.count - 1 |
|
33 |
+ |
|
34 |
+ pwms = list () |
|
35 |
+ |
|
36 |
+ for (i in 1:max) { |
|
37 |
+ start.line = title.lines [i] |
|
38 |
+ end.line = title.lines [i+1] - 1 |
|
39 |
+ new.pwm = parsePwm (all.lines [start.line:end.line]) |
|
40 |
+ pwms = c (pwms, list (new.pwm)) |
|
41 |
+ } # for i |
|
42 |
+ |
|
43 |
+ |
|
44 |
+ invisible (pwms) |
|
45 |
+ |
|
46 |
+} # readRawMatrices |
|
47 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
48 |
+readNovelStatus = function (dataDir) |
|
49 |
+{ |
|
50 |
+ filename <- file.path(dataDir, "novels.txt") |
|
51 |
+ novels = scan (filename, what=character(0), sep='\n', quiet=TRUE) |
|
52 |
+ all.names = sprintf ('UW.Motif.%04d', 1:683) |
|
53 |
+ status = rep (FALSE, length (all.names)) |
|
54 |
+ true.novels = match (novels, all.names) |
|
55 |
+ status [true.novels] = TRUE |
|
56 |
+ names (status) = all.names |
|
57 |
+ return (status) |
|
58 |
+ |
|
59 |
+} # readNovelStatus |
|
60 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
61 |
+extractAndNormalizeMatrices = function (pwm.list) |
|
62 |
+{ |
|
63 |
+ ms = sapply (pwm.list, function (element) element$matrix) |
|
64 |
+ nms = normalizeMatrices (ms) |
|
65 |
+ names (nms) = sapply (pwm.list, function (element) element$title) |
|
66 |
+ return (nms) |
|
67 |
+ |
|
68 |
+} # extractAndNormalizeMatrices |
|
69 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
70 |
+# matrices = sapply (list.pwms, function (pwm) pwm$matrix) |
|
71 |
+# matrix.names = sapply (list.pwms, function (pwm) pwm$title) |
|
72 |
+# names (matrices) = matrix.names |
|
73 |
+convertRawMatricesToStandard = function (tbl.rmat) |
|
74 |
+{ |
|
75 |
+ matrix.ids = unique (tbl.rmat$id) |
|
76 |
+ result = vector ('list', length (matrix.ids)) |
|
77 |
+ |
|
78 |
+ i = 1 |
|
79 |
+ for (matrix.id in matrix.ids) { |
|
80 |
+ tbl.sub = subset (tbl.rmat, id == matrix.id) |
|
81 |
+ # sanity check this rather unusual representation of a position count matrix |
|
82 |
+ base.count = as.data.frame (table (tbl.sub$base)) |
|
83 |
+ stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T')) |
|
84 |
+ # conservative length check. actually expect sequence lengths of 6 - 20 bases |
|
85 |
+ if (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) { |
|
86 |
+ printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1]) |
|
87 |
+ } |
|
88 |
+ stopifnot (all (base.count$Freq == base.count$Freq [1])) |
|
89 |
+ nucleotide.counts = tbl.sub$count |
|
90 |
+ row.names = c('A', 'C', 'G', 'T'); |
|
91 |
+ col.names = 1:(nrow (tbl.sub) / 4) |
|
92 |
+ m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names)) |
|
93 |
+ result [[i]] = m |
|
94 |
+ i = i + 1 |
|
95 |
+ } # for matrix.id |
|
96 |
+ |
|
97 |
+ names (result) = matrix.ids |
|
98 |
+ return (result) |
|
99 |
+ |
|
100 |
+} # convertRawMatricesToStandard |
|
101 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
102 |
+createAnnotationTable = function () |
|
103 |
+{ |
|
104 |
+ tbl.matrix = read.table ('MATRIX.txt', sep='\t', header=F, as.is=TRUE) |
|
105 |
+ colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder') |
|
106 |
+ |
|
107 |
+ tbl.protein = read.table ('MATRIX_PROTEIN.txt', sep='\t', header=F, as.is=TRUE) |
|
108 |
+ colnames (tbl.protein) = c ('id', 'proteinID') |
|
109 |
+ |
|
110 |
+ tbl.species = read.table ('MATRIX_SPECIES.txt', sep='\t', header=F, as.is=TRUE) |
|
111 |
+ colnames (tbl.species) = c ('id', 'speciesID') |
|
112 |
+ |
|
113 |
+ tbl.anno = read.table ('MATRIX_ANNOTATION.txt', sep='\t', header=F, as.is=TRUE, quote="") |
|
114 |
+ colnames (tbl.anno) = c ('id', 'attribute', 'value') |
|
115 |
+ |
|
116 |
+ tbl.family = subset (tbl.anno, attribute=='family') [, -2]; |
|
117 |
+ colnames (tbl.family) = c ('id', 'family') |
|
118 |
+ |
|
119 |
+ tbl.tax = subset (tbl.anno, attribute=='tax_group') [,-2]; |
|
120 |
+ colnames (tbl.tax) = c ('id', 'tax') |
|
121 |
+ |
|
122 |
+ tbl.class = subset (tbl.anno, attribute=='class') [,-2]; |
|
123 |
+ colnames (tbl.class) = c ('id', 'class') |
|
124 |
+ |
|
125 |
+ tbl.comment = subset (tbl.anno, attribute=='comment')[,-2]; |
|
126 |
+ colnames (tbl.comment) = c ('id', 'comment') |
|
127 |
+ |
|
128 |
+ tbl.pubmed = subset (tbl.anno, attribute=='medline')[,-2]; |
|
129 |
+ colnames (tbl.pubmed) = c ('id', 'pubmed') |
|
130 |
+ |
|
131 |
+ tbl.type = subset (tbl.anno, attribute=='type')[,-2]; |
|
132 |
+ colnames (tbl.type) = c ('id', 'type') |
|
133 |
+ |
|
134 |
+ |
|
135 |
+ tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE) |
|
136 |
+ tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE) |
|
137 |
+ tbl.md = merge (tbl.md, tbl.family, all.x=TRUE) |
|
138 |
+ tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE) |
|
139 |
+ tbl.md = merge (tbl.md, tbl.class, all.x=TRUE) |
|
140 |
+ tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE) |
|
141 |
+ tbl.md = merge (tbl.md, tbl.type, all.x=TRUE) |
|
142 |
+ |
|
143 |
+ fullID = paste (tbl.md$mID, tbl.md$version, sep='.') |
|
144 |
+ tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE) |
|
145 |
+ |
|
146 |
+ invisible (tbl.md) |
|
147 |
+ |
|
148 |
+} # createAnnotationTable |
|
149 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
150 |
+# assemble these columns: |
|
151 |
+# names=character(), # species-source-gene: stamlab-Hsapiens-UW.Motif.0001 |
|
152 |
+# nativeNames=character(), # UW.Motif.0001 |
|
153 |
+# geneSymbols=character(), # NA |
|
154 |
+# sequenceCounts=integer(), # NA |
|
155 |
+# organisms=character(), # Hsapiens |
|
156 |
+# bindingMolecules=character(), # NA |
|
157 |
+# bindingMoleculeIdTypes=character(), # NA |
|
158 |
+# bindingDomainTypes=character(), # NA |
|
159 |
+# dataSources=character(), # stamlab |
|
160 |
+# experimentTypes=character(), # digital genomic footprinting |
|
161 |
+# pubmedIDs=character(), # 22959076 |
|
162 |
+# tfFamilies=character()) # NA |
|
163 |
+# |
|
164 |
+# from these |
|
165 |
+# |
|
166 |
+createMetadataTable = function (matrices, novels) |
|
167 |
+{ |
|
168 |
+ options (stringsAsFactors=FALSE) |
|
169 |
+ tbl.md = data.frame () |
|
170 |
+ matrix.ids = names (matrices) |
|
171 |
+ |
|
172 |
+ for (matrix.id in matrix.ids) { |
|
173 |
+ matrix = matrices [[matrix.id]] |
|
174 |
+ taxon.code = 'Hsapiens' |
|
175 |
+ geneId.info = NA |
|
176 |
+ new.row = list (providerName=matrix.id, |
|
177 |
+ providerId=matrix.id, |
|
178 |
+ dataSource='stamlab', |
|
179 |
+ geneSymbol=NA, |
|
180 |
+ geneId=NA, |
|
181 |
+ geneIdType=NA, |
|
182 |
+ proteinId=NA, |
|
183 |
+ proteinIdType=NA, |
|
184 |
+ organism='Hsapiens', |
|
185 |
+ sequenceCount=NA, |
|
186 |
+ bindingSequence=NA_character_, |
|
187 |
+ bindingDomain=NA, |
|
188 |
+ tfFamily=NA, |
|
189 |
+ experimentType='digital genomic footprinting', |
|
190 |
+ pubmedID='22959076') |
|
191 |
+ tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE)) |
|
192 |
+ full.name = sprintf ('%s-%s-%s', 'Hsapiens', 'stamlab', matrix.id) |
|
193 |
+ rownames (tbl.md) [nrow (tbl.md)] = full.name |
|
194 |
+ } # for i |
|
195 |
+ |
|
196 |
+ novelPFM = rep ('knownMotif', nrow (tbl.md)) |
|
197 |
+ novels.ordered = novels [tbl.md$providerName] # make sure we follow the order in the tbl |
|
198 |
+ novelPFM [which (novels.ordered)] = 'novelMotif' |
|
199 |
+ tbl.md$geneId = novelPFM |
|
200 |
+ tbl.md$geneIdType = rep ('comment', nrow (tbl.md)) |
|
201 |
+ |
|
202 |
+ invisible (tbl.md) |
|
203 |
+ |
|
204 |
+} # createMetadataTable |
|
205 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
206 |
+renameMatrices = function (matrices, tbl.md) |
|
207 |
+{ |
|
208 |
+ stopifnot (length (matrices) == nrow (tbl.md)) |
|
209 |
+ names (matrices) = rownames (tbl.md) |
|
210 |
+ invisible (matrices) |
|
211 |
+ |
|
212 |
+} # renameMatrices |
|
213 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
214 |
+convertTaxonCode = function (ncbi.code) |
|
215 |
+{ |
|
216 |
+ #if (is.na (ncbi.code)) |
|
217 |
+ # return (NA_character_) |
|
218 |
+ ncbi.code = as.character (ncbi.code) |
|
219 |
+ if (ncbi.code %in% c ('-', NA_character_, 'NA')) |
|
220 |
+ return ('Vertebrata') |
|
221 |
+ |
|
222 |
+ tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102', |
|
223 |
+ '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729', |
|
224 |
+ '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'), |
|
225 |
+ name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', |
|
226 |
+ 'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam', |
|
227 |
+ 'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster', |
|
228 |
+ 'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', |
|
229 |
+ 'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'), |
|
230 |
+ stringsAsFactors=FALSE) |
|
231 |
+ |
|
232 |
+ ncbi.code = as.character (ncbi.code) |
|
233 |
+ index = which (tbl$code == ncbi.code) |
|
234 |
+ if (length (index) == 1) |
|
235 |
+ return (tbl$name [index]) |
|
236 |
+ else { |
|
237 |
+ write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ()) |
|
238 |
+ return (NA_character_) |
|
239 |
+ } |
|
240 |
+ |
|
241 |
+} # convertTaxonCode |
|
242 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
243 |
+# an empirical and not altogether trustworthy solution to identifying identifier types. |
|
244 |
+guessProteinIdentifierType = function (moleculeName) |
|
245 |
+{ |
|
246 |
+ if (nchar (moleculeName) == 0) |
|
247 |
+ return (NA_character_) |
|
248 |
+ if (is.na (moleculeName)) |
|
249 |
+ return (NA_character_) |
|
250 |
+ |
|
251 |
+ first.char = substr (moleculeName, 1, 1) |
|
252 |
+ |
|
253 |
+ if (first.char == 'Y') |
|
254 |
+ return ('SGD') |
|
255 |
+ |
|
256 |
+ if (first.char %in% c ('P', 'Q', 'O', 'A', 'C')) |
|
257 |
+ return ('UNIPROT') |
|
258 |
+ |
|
259 |
+ if (length (grep ('^EAW', moleculeName)) == 1) |
|
260 |
+ return ('NCBI') |
|
261 |
+ |
|
262 |
+ if (length (grep ('^EAX', moleculeName)) == 1) |
|
263 |
+ return ('NCBI') |
|
264 |
+ |
|
265 |
+ if (length (grep ('^NP_', moleculeName)) == 1) |
|
266 |
+ return ('REFSEQ') |
|
267 |
+ |
|
268 |
+ if (length (grep ('^BAD', moleculeName)) == 1) |
|
269 |
+ return ('EMBL') |
|
270 |
+ |
|
271 |
+ return (NA_character_) |
|
272 |
+ |
|
273 |
+} # guessProteinIdentifierType |
|
274 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
275 |
+normalizeMatrices = function (matrices) |
|
276 |
+{ |
|
277 |
+ mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) |
|
278 |
+ invisible (mtx.normalized) |
|
279 |
+ |
|
280 |
+} # normalizeMatrices |
|
281 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
282 |
+assignGeneId = function (proteinId) |
|
283 |
+{ |
|
284 |
+ if (!exists ('id.uniprot.human')) { |
|
285 |
+ |
|
286 |
+ tbl = toTable (org.Hs.egUNIPROT) |
|
287 |
+ id.uniprot.human <<- as.character (tbl$gene_id) |
|
288 |
+ names (id.uniprot.human) <<- tbl$uniprot_id |
|
289 |
+ |
|
290 |
+ tbl = toTable (org.Hs.egREFSEQ) |
|
291 |
+ tbl = tbl [grep ('^NP_', tbl$accession),] |
|
292 |
+ id.refseq.human <<- as.character (tbl$gene_id) |
|
293 |
+ names (id.refseq.human) <<- tbl$accession |
|
294 |
+ |
|
295 |
+ tbl = toTable (org.Mm.egUNIPROT) |
|
296 |
+ id.uniprot.mouse <<- as.character (tbl$gene_id) |
|
297 |
+ names (id.uniprot.mouse) <<- tbl$uniprot_id |
|
298 |
+ |
|
299 |
+ tbl = toTable (org.Mm.egREFSEQ) |
|
300 |
+ tbl = tbl [grep ('^NP_', tbl$accession),] |
|
301 |
+ id.refseq.mouse <<- as.character (tbl$gene_id) |
|
302 |
+ names (id.refseq.mouse) <<- tbl$accession |
|
303 |
+ } |
|
304 |
+ |
|
305 |
+ proteinId = strsplit (proteinId, '\\.')[[1]][1] # remove any trailing '.*' |
|
306 |
+ |
|
307 |
+ if (proteinId %in% names (id.uniprot.human)) |
|
308 |
+ return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ')) |
|
309 |
+ |
|
310 |
+ if (proteinId %in% names (id.uniprot.mouse)) |
|
311 |
+ return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ')) |
|
312 |
+ |
|
313 |
+ if (proteinId %in% names (id.refseq.human)) |
|
314 |
+ return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ')) |
|
315 |
+ |
|
316 |
+ if (proteinId %in% names (id.refseq.mouse)) |
|
317 |
+ return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ')) |
|
318 |
+ |
|
319 |
+ found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE)) |
|
320 |
+ |
|
321 |
+ if (found.leading.Y == 1) |
|
322 |
+ return (list (geneId=proteinId, type='SGD')) |
|
323 |
+ |
|
324 |
+ return (list (geneId=NA_character_, type=NA_character_)) |
|
325 |
+ |
|
326 |
+} # assignGeneId |
|
327 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
328 |
+parsePwm = function (text) |
|
329 |
+{ |
|
330 |
+ #printf ('parsing pwm %s', text [1]) |
|
331 |
+ lines = strsplit (text, '\t') |
|
332 |
+ title = lines [[1]][1] |
|
333 |
+ consensus.sequence = lines [[1]][2] |
|
334 |
+ line.count = length (lines) |
|
335 |
+ #printf ('%s: %s', title, consensus.sequence) |
|
336 |
+ |
|
337 |
+ result = matrix (nrow=line.count-1, ncol=4, dimnames=list(1:(line.count-1), c ('A','C','G','T'))) |
|
338 |
+ row = 1 |
|
339 |
+ for (line in lines [2:line.count]) { |
|
340 |
+ result [row,] = as.numeric (line) |
|
341 |
+ row = row + 1 |
|
342 |
+ } # for line |
|
343 |
+ |
|
344 |
+ result = t (result) |
|
345 |
+ |
|
346 |
+ return (list (title=title, consensus.sequence=consensus.sequence, matrix=result)) |
|
347 |
+ |
|
348 |
+} # parsePwm |
|
349 |
+#------------------------------------------------------------------------------------------------------------------------ |