git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@107889 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -62,7 +62,7 @@ extractMatrices = function (pwm.list) |
62 | 62 |
#------------------------------------------------------------------------------------------------------------------------ |
63 | 63 |
createAnnotationTable <- function(dataDir) |
64 | 64 |
{ |
65 |
- file <- file.path(dataDir, "Matrix.txt") |
|
65 |
+ file <- file.path(dataDir, "MATRIX.txt") |
|
66 | 66 |
tbl.matrix <- read.table(file, sep='\t', header=FALSE, as.is=TRUE) |
67 | 67 |
colnames(tbl.matrix) <- c('id', 'category', 'mID', 'version', 'binder') |
68 | 68 |
|
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@107858 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -32,7 +32,7 @@ readRawMatrices = function (dataDir) |
32 | 32 |
|
33 | 33 |
|
34 | 34 |
filename <- file.path(dataDir, "matrix_data.txt") |
35 |
- stopifnot(file.exists(filename)) |
|
35 |
+ stopifnot(file.exists(filename)) |
|
36 | 36 |
|
37 | 37 |
all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE) |
38 | 38 |
title.lines = grep ('^>', all.lines) |
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@88754 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,180 +1,131 @@ |
1 | 1 |
# jaspar2014/import.R |
2 |
-# |
|
3 |
-#------------------------------------------------------------------------------------------------------------------------ |
|
2 |
+#----------------------------------------------------------------------------------- |
|
4 | 3 |
library (org.Hs.eg.db) |
5 | 4 |
library (org.Mm.eg.db) |
6 | 5 |
#------------------------------------------------------------------------------------------------------------------------ |
6 |
+options (stringsAsFactors=FALSE) |
|
7 | 7 |
printf <- function(...) print(noquote(sprintf(...))) |
8 | 8 |
#------------------------------------------------------------------------------------------------------------------------ |
9 | 9 |
run = function (dataDir) |
10 | 10 |
{ |
11 |
- dataDir <- file.path(dataDir, "jaspar") |
|
12 |
- tbl.rmat = readRawMatrices (dataDir) |
|
13 |
- matrices = convertRawMatricesToStandard (tbl.rmat) |
|
14 |
- tbl.anno = createAnnotationTable (dataDir) |
|
15 |
- tbl.md = createMetadataTable (tbl.anno, matrices) |
|
16 |
- matrices = renameMatrices (matrices, tbl.md) |
|
17 |
- matrices = normalizeMatrices (matrices) |
|
18 |
- serializedFile <- "jaspar.RData" |
|
11 |
+ dataDir <- file.path(dataDir,"jaspar2014") |
|
12 |
+ rawMatrixList <- readRawMatrices (dataDir) |
|
13 |
+ matrices <- extractMatrices (rawMatrixList) |
|
14 |
+ tbl.anno <- createAnnotationTable(dataDir) |
|
15 |
+ tbl.md <- createMetadataTable (tbl.anno, matrices) |
|
16 |
+ matrices <- normalizeMatrices (matrices) |
|
17 |
+ matrices <- renameMatrices (matrices, tbl.md) |
|
18 |
+ serializedFile <- file.path(dataDir, "jaspar2014.RData") |
|
19 | 19 |
save (matrices, tbl.md, file=serializedFile) |
20 | 20 |
printf("saved %d matrices to %s", length(matrices), serializedFile) |
21 |
- printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
21 |
+ printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
22 | 22 |
|
23 | 23 |
} # run |
24 | 24 |
#------------------------------------------------------------------------------------------------------------------------ |
25 |
- |
|
26 | 25 |
readRawMatrices = function (dataDir) |
27 | 26 |
{ |
28 |
- file <- file.path(dataDir, 'MATRIX_DATA.txt') |
|
29 |
- #tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric')) |
|
30 |
- #colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count') |
|
31 |
- tbl.matrices = read.table (file, sep='\n', header=FALSE ) |
|
27 |
+ # our convention is that there is a shared "dataDir" visible to |
|
28 |
+ # the importer, and that within that directory there is one |
|
29 |
+ # subdirectory for each data source. |
|
30 |
+ # for this example importer, that directory will be <dataDir>/test |
|
31 |
+ # within which we will look for one small file "sample.pcm" |
|
32 |
+ |
|
33 |
+ |
|
34 |
+ filename <- file.path(dataDir, "matrix_data.txt") |
|
35 |
+ stopifnot(file.exists(filename)) |
|
32 | 36 |
|
33 |
- invisible (tbl.matrices) |
|
37 |
+ all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE) |
|
38 |
+ title.lines = grep ('^>', all.lines) |
|
39 |
+ title.line.count <<- length (title.lines) |
|
40 |
+ max = title.line.count - 1 |
|
34 | 41 |
|
42 |
+ pwms = list () |
|
43 |
+ |
|
44 |
+ for (i in 1:max) { |
|
45 |
+ start.line = title.lines [i] |
|
46 |
+ end.line = title.lines [i+1] - 1 |
|
47 |
+ new.pwm = parsePwm (all.lines [start.line:end.line]) |
|
48 |
+ pwms = c (pwms, list (new.pwm)) |
|
49 |
+ } # for i |
|
50 |
+ invisible (pwms) |
|
35 | 51 |
} # readRawMatrices |
36 | 52 |
#------------------------------------------------------------------------------------------------------------------------ |
37 |
-convertRawMatricesToStandard = function (tbl.rmat) |
|
53 |
+extractMatrices = function (pwm.list) |
|
38 | 54 |
{ |
39 |
- #matrix.ids = unique (tbl.rmat$id) |
|
40 |
- #result = vector ('list', length (matrix.ids)) |
|
41 |
- |
|
42 |
- #i = 1 |
|
43 |
- #for (matrix.id in matrix.ids) { |
|
44 |
- # tbl.sub = subset (tbl.rmat, id == matrix.id) |
|
45 |
- # # sanity check this rather unusual representation of a position count matrix |
|
46 |
- # base.count = as.data.frame (table (tbl.sub$base)) |
|
47 |
- # stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T')) |
|
48 |
- # conservative length check. actually expect sequence lengths of 6 - 20 bases |
|
49 |
- # if (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) { |
|
50 |
- # printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1]) |
|
51 |
- # } |
|
52 |
- # stopifnot (all (base.count$Freq == base.count$Freq [1])) |
|
53 |
- # nucleotide.counts = tbl.sub$count |
|
54 |
- # row.names = c('A', 'C', 'G', 'T'); |
|
55 |
- # col.names = 1:(nrow (tbl.sub) / 4) |
|
56 |
- # m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names)) |
|
57 |
- # result [[i]] = m |
|
58 |
- # i = i + 1 |
|
59 |
- # } # for matrix.id |
|
60 |
- |
|
61 |
- #names (result) = matrix.ids |
|
62 |
- |
|
63 |
- result<- vector("list",length=593) |
|
64 |
- matrices.id <-grep('>',tbl.matrices[[1]],value=TRUE) |
|
65 |
- names (result) = matrices.id |
|
66 |
- #Get a list of the ID's and then assign them as names |
|
67 |
- m.d <-grep('>',tbl.matrices[[1]],value=TRUE,invert=TRUE) |
|
68 |
- data.list=list() |
|
69 |
- x = 1 |
|
70 |
- y = 1 |
|
71 |
- while (x < length(m.d)) |
|
72 |
- { |
|
73 |
- matrices.position <- rbind(m.d[x],m.d[x+1],m.d[x+2],m.d[x+3]) |
|
74 |
- matrices.formatted <- gsub("\t"," ",matrices.position) |
|
75 |
- #matrices.formatted2 <- as.numeric(strsplit(matrices.position, "\t")) |
|
76 |
- row.names = c('A', 'C', 'G', 'T') |
|
77 |
- col.names = 1:(ncol (matrices.formatted)) |
|
78 |
- #matrices.matrix <- matrix (matrices.formatted, byrow=TRUE, nrow=4,dimnames=list(row.names,)) |
|
79 |
- result[[y]] <- matrices.formatted |
|
80 |
- y=y+1 |
|
81 |
- x=x+4 |
|
82 |
- } |
|
83 |
- browser() |
|
84 |
- return (result) |
|
85 |
- |
|
86 |
-} # convertRawMatricesToStandard |
|
55 |
+ matrices = sapply (pwm.list, function (element) element$matrix) |
|
56 |
+ matrix.names <- sapply (pwm.list, function (element) element$title) |
|
57 |
+ matrix.names <- sub("^>", "", matrix.names) |
|
58 |
+ names (matrices) <- matrix.names |
|
59 |
+ |
|
60 |
+ return (matrices) |
|
61 |
+} # extractMatrices |
|
87 | 62 |
#------------------------------------------------------------------------------------------------------------------------ |
88 |
-# read 'mysql' tables provide by jaspar: |
|
89 |
-# MATRIX.txt: 9229 CORE MA0001 1 AGL3 |
|
90 |
-# MATRIX_PROTEIN.txt: 9229 P29383 |
|
91 |
-# MATRIX_SPECIES.txt: 9229 3702 |
|
92 |
-# MATRIX_ANNOTATION.txt: |
|
93 |
-# 9229 class Other Alpha-Helix |
|
94 |
-# 9229 comment - |
|
95 |
-# 9229 family MADS |
|
96 |
-# 9229 medline 7632923 |
|
97 |
-# 9229 tax_group plants |
|
98 |
-# 9229 type SELEX |
|
99 |
-createAnnotationTable = function (dataDir) |
|
63 |
+createAnnotationTable <- function(dataDir) |
|
100 | 64 |
{ |
101 |
- file <- file.path(dataDir, "MATRIX.txt") |
|
102 |
- tbl.matrix = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
103 |
- colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder') |
|
104 |
- |
|
105 |
- file <- file.path(dataDir, "MATRIX_PROTEIN.txt") |
|
106 |
- tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
107 |
- colnames (tbl.protein) = c ('id', 'proteinID') |
|
108 |
- |
|
109 |
- file <- file.path(dataDir, "MATRIX_SPECIES.txt") |
|
110 |
- tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
111 |
- colnames (tbl.species) = c ('id', 'speciesID') |
|
112 |
- |
|
113 |
- file <- file.path(dataDir, "MATRIX_ANNOTATION.txt") |
|
114 |
- tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="") |
|
115 |
- colnames (tbl.anno) = c ('id', 'attribute', 'value') |
|
116 |
- |
|
117 |
- tbl.family = subset (tbl.anno, attribute=='family') [, -2]; |
|
118 |
- colnames (tbl.family) = c ('id', 'family') |
|
119 |
- |
|
120 |
- tbl.tax = subset (tbl.anno, attribute=='tax_group') [,-2]; |
|
121 |
- colnames (tbl.tax) = c ('id', 'tax') |
|
122 |
- |
|
123 |
- tbl.class = subset (tbl.anno, attribute=='class') [,-2]; |
|
124 |
- colnames (tbl.class) = c ('id', 'class') |
|
125 |
- |
|
126 |
- tbl.comment = subset (tbl.anno, attribute=='comment')[,-2]; |
|
127 |
- colnames (tbl.comment) = c ('id', 'comment') |
|
128 |
- |
|
129 |
- tbl.pubmed = subset (tbl.anno, attribute=='medline')[,-2]; |
|
130 |
- colnames (tbl.pubmed) = c ('id', 'pubmed') |
|
131 |
- |
|
132 |
- tbl.type = subset (tbl.anno, attribute=='type')[,-2]; |
|
133 |
- colnames (tbl.type) = c ('id', 'type') |
|
134 |
- |
|
135 |
- |
|
136 |
- tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE) |
|
137 |
- tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE) |
|
138 |
- tbl.md = merge (tbl.md, tbl.family, all.x=TRUE) |
|
139 |
- tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE) |
|
140 |
- tbl.md = merge (tbl.md, tbl.class, all.x=TRUE) |
|
141 |
- tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE) |
|
142 |
- tbl.md = merge (tbl.md, tbl.type, all.x=TRUE) |
|
143 |
- |
|
144 |
- fullID = paste (tbl.md$mID, tbl.md$version, sep='.') |
|
145 |
- tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE) |
|
146 |
- |
|
147 |
- invisible (tbl.md) |
|
148 |
- |
|
65 |
+ file <- file.path(dataDir, "Matrix.txt") |
|
66 |
+ tbl.matrix <- read.table(file, sep='\t', header=FALSE, as.is=TRUE) |
|
67 |
+ colnames(tbl.matrix) <- c('id', 'category', 'mID', 'version', 'binder') |
|
68 |
+ |
|
69 |
+ file <- file.path(dataDir, "MATRIX_PROTEIN.txt") |
|
70 |
+ stopifnot(file.exists(file)) |
|
71 |
+ tbl.protein <- read.table(file, sep='\t', header=FALSE, as.is=TRUE) |
|
72 |
+ colnames(tbl.protein) <- c('id', 'proteinID') |
|
73 |
+ |
|
74 |
+ file <- file.path(dataDir, "MATRIX_SPECIES.txt") |
|
75 |
+ stopifnot(file.exists(file)) |
|
76 |
+ tbl.species <- read.table(file, sep='\t', header=FALSE, as.is=TRUE) |
|
77 |
+ colnames(tbl.species) <- c('id', 'speciesID') |
|
78 |
+ |
|
79 |
+ file <- file.path(dataDir, "MATRIX_ANNOTATION.txt") |
|
80 |
+ stopifnot(file.exists(file)) |
|
81 |
+ tbl.anno <- read.table(file, sep='\t', header=FALSE, as.is=TRUE, quote="") |
|
82 |
+ colnames(tbl.anno) <- c('id', 'attribute', 'value') |
|
83 |
+ |
|
84 |
+ file <- file.path(dataDir, "MATRIX_ANNOTATION.txt") |
|
85 |
+ tbl.family <- subset(tbl.anno, attribute=='family') [, -2]; |
|
86 |
+ colnames(tbl.family) <- c('id', 'family') |
|
87 |
+ # create 5 2-column data.frames out of tbl.anno |
|
88 |
+ # which can all then be merged on the "id" column |
|
89 |
+ tbl.tax <- subset(tbl.anno, attribute=='tax_group') [,-2]; |
|
90 |
+ colnames(tbl.tax) <- c('id', 'tax') |
|
91 |
+ |
|
92 |
+ tbl.class <- subset(tbl.anno, attribute=='class') [,-2]; |
|
93 |
+ colnames(tbl.class) <- c('id', 'class') |
|
94 |
+ |
|
95 |
+ tbl.comment <- subset(tbl.anno, attribute=='comment')[,-2]; |
|
96 |
+ colnames(tbl.comment) <- c('id', 'comment') |
|
97 |
+ |
|
98 |
+ tbl.pubmed <- subset(tbl.anno, attribute=='medline')[,-2]; |
|
99 |
+ colnames(tbl.pubmed) <- c('id', 'pubmed') |
|
100 |
+ |
|
101 |
+ tbl.type <- subset(tbl.anno, attribute=='type')[,-2]; |
|
102 |
+ colnames(tbl.type) <- c('id', 'type') |
|
103 |
+ |
|
104 |
+ tbl.md <- merge(tbl.matrix, tbl.species, all.x=TRUE) |
|
105 |
+ tbl.md <- merge(tbl.md, tbl.protein, all.x=TRUE) |
|
106 |
+ tbl.md <- merge(tbl.md, tbl.family, all.x=TRUE) |
|
107 |
+ tbl.md <- merge(tbl.md, tbl.tax, all.x=TRUE) |
|
108 |
+ tbl.md <- merge(tbl.md, tbl.class, all.x=TRUE) |
|
109 |
+ tbl.md <- merge(tbl.md, tbl.pubmed, all.x=TRUE) |
|
110 |
+ tbl.md <- merge(tbl.md, tbl.type, all.x=TRUE) |
|
111 |
+ |
|
112 |
+ fullID <- paste(tbl.md$mID, tbl.md$version, sep='.') |
|
113 |
+ tbl.md <- cbind(fullID, tbl.md, stringsAsFactors=FALSE) |
|
114 |
+ |
|
115 |
+ invisible(tbl.md) |
|
116 |
+ |
|
149 | 117 |
} # createAnnotationTable |
150 |
-#------------------------------------------------------------------------------------------------------------------------ |
|
151 |
-# assemble these columns: |
|
152 |
-# names=character(), # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a |
|
153 |
-# nativeNames=character(), # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
154 |
-# geneSymbols=character(), # ABF2, Rhox11 |
|
155 |
-# sequenceCounts=integer(), # often NA |
|
156 |
-# organisms=character(), # Scerevisiae, Mmusculus |
|
157 |
-# bindingMolecules=character(), # YMR072W, 194738 |
|
158 |
-# bindingMoleculeIdTypes=character(), # SGD, entrez gene |
|
159 |
-# bindingDomainTypes=character(), # NA, Homeo |
|
160 |
-# dataSources=character(), # ScerTF, UniPROBE |
|
161 |
-# experimentTypes=character(), # NA, protein-binding microarray |
|
162 |
-# pubmedIDs=character(), # 19111667, 1858359 |
|
163 |
-# tfFamilies=character()) # NA, NA |
|
164 |
-# |
|
165 |
-# from these |
|
166 |
-# |
|
118 |
+#------------------------------------------------------------------------------- |
|
167 | 119 |
createMetadataTable = function (tbl.anno, matrices) |
168 | 120 |
{ |
169 | 121 |
options (stringsAsFactors=FALSE) |
170 | 122 |
tbl.md = data.frame () |
171 | 123 |
matrix.ids = names (matrices) |
172 |
- dataSource = 'JASPAR_CORE' |
|
124 |
+ dataSource = 'JASPAR_2014' |
|
173 | 125 |
|
174 | 126 |
for (matrix.id in matrix.ids) { |
175 | 127 |
matrix = matrices [[matrix.id]] |
176 |
- stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1) |
|
177 |
- tbl.sub = subset (tbl.anno, id==matrix.id) |
|
128 |
+ tbl.sub = subset (tbl.anno, fullID==substr(matrix.id,0,8)) |
|
178 | 129 |
if (nrow (tbl.sub) > 1) { |
179 | 130 |
# the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr |
180 | 131 |
# some others, a sampling: P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510 |
... | ... |
@@ -208,19 +159,77 @@ createMetadataTable = function (tbl.anno, matrices) |
208 | 159 |
NA.string.indices = grep ('NA', tbl.md$proteinId) |
209 | 160 |
if (length (NA.string.indices) > 0) |
210 | 161 |
tbl.md$proteinId [NA.string.indices] = NA |
211 |
- |
|
212 | 162 |
invisible (tbl.md) |
213 |
- |
|
214 |
-} # createMetadataTable |
|
163 |
+}#createMetadataTable |
|
215 | 164 |
#------------------------------------------------------------------------------------------------------------------------ |
216 | 165 |
renameMatrices = function (matrices, tbl.md) |
217 | 166 |
{ |
218 | 167 |
stopifnot (length (matrices) == nrow (tbl.md)) |
219 | 168 |
names (matrices) = rownames (tbl.md) |
220 | 169 |
invisible (matrices) |
221 |
- |
|
170 |
+ |
|
222 | 171 |
} # renameMatrices |
223 | 172 |
#------------------------------------------------------------------------------------------------------------------------ |
173 |
+# an empirical and not altogether trustworthy solution to identifying identifier types. |
|
174 |
+guessProteinIdentifierType = function (moleculeName) |
|
175 |
+{ |
|
176 |
+ if (nchar (moleculeName) == 0) |
|
177 |
+ return (NA_character_) |
|
178 |
+ if (is.na (moleculeName)) |
|
179 |
+ return (NA_character_) |
|
180 |
+ |
|
181 |
+ first.char = substr (moleculeName, 1, 1) |
|
182 |
+ |
|
183 |
+ if (first.char == 'Y') |
|
184 |
+ return ('SGD') |
|
185 |
+ |
|
186 |
+ if (first.char %in% c ('P', 'Q', 'O', 'A', 'C')) |
|
187 |
+ return ('UNIPROT') |
|
188 |
+ |
|
189 |
+ if (length (grep ('^NP_', moleculeName)) == 1) |
|
190 |
+ return ('REFSEQ') |
|
191 |
+ |
|
192 |
+ return (NA_character_) |
|
193 |
+ |
|
194 |
+} # guessProteinIdentifierType |
|
195 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
196 |
+normalizeMatrices = function (matrices) |
|
197 |
+{ |
|
198 |
+ mtx.normalized = sapply (matrices, |
|
199 |
+ function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) |
|
200 |
+ |
|
201 |
+ invisible (mtx.normalized) |
|
202 |
+ |
|
203 |
+} # normalizeMatrices |
|
204 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
205 |
+parsePwm = function (text) |
|
206 |
+{ |
|
207 |
+ lines = strsplit (text, '\t') |
|
208 |
+ #browser() |
|
209 |
+ stopifnot(length(lines)==5) # title line, one line for each base |
|
210 |
+ title = lines [[1]][1] |
|
211 |
+ line.count = length(lines) |
|
212 |
+ |
|
213 |
+ # expect 4 rows, and a number of columns we can discern from |
|
214 |
+ # the incoming text. |
|
215 |
+ cols <- length(lines[[2]]) |
|
216 |
+ result <- matrix (nrow=4, ncol=cols, |
|
217 |
+ dimnames=list(c('A','C','G','T'), |
|
218 |
+ as.character(1:cols))) |
|
219 |
+ row = 1 |
|
220 |
+ for(i in 2:line.count){ |
|
221 |
+ result [row,] = as.numeric (lines[[i]]) |
|
222 |
+ row = row + 1 |
|
223 |
+ } # for i |
|
224 |
+ |
|
225 |
+ #result = t (result) |
|
226 |
+ |
|
227 |
+ #return (list (title=title, consensus.sequence=consensus.sequence, matrix=result)) |
|
228 |
+ return (list (title=title, matrix=result)) |
|
229 |
+ |
|
230 |
+} # parsePwm |
|
231 |
+#---------------------------------------------------------------------------------------------------- |
|
232 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
224 | 233 |
# full names: ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', |
225 | 234 |
# 'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam', |
226 | 235 |
# 'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster', |
... | ... |
@@ -248,52 +257,16 @@ convertTaxonCode = function (ncbi.code) |
248 | 257 |
index = which (tbl$code == ncbi.code) |
249 | 258 |
if (length (index) == 1) |
250 | 259 |
return (tbl$name [index]) |
260 |
+ if (nchar(ncbi.code)>6) |
|
261 |
+ return ('Vertebrata') |
|
251 | 262 |
else { |
252 |
- write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ()) |
|
263 |
+ browser() |
|
264 |
+ write (sprintf ("unable to map organism code |%s|", ncbi.code), stderr ()) |
|
253 | 265 |
return (NA_character_) |
254 | 266 |
} |
255 | 267 |
|
256 | 268 |
} # convertTaxonCode |
257 |
-#------------------------------------------------------------------------------------------------------------------------ |
|
258 |
-# an empirical and not altogether trustworthy solution to identifying identifier types. |
|
259 |
-guessProteinIdentifierType = function (moleculeName) |
|
260 |
-{ |
|
261 |
- if (nchar (moleculeName) == 0) |
|
262 |
- return (NA_character_) |
|
263 |
- if (is.na (moleculeName)) |
|
264 |
- return (NA_character_) |
|
265 |
- |
|
266 |
- first.char = substr (moleculeName, 1, 1) |
|
267 |
- |
|
268 |
- if (first.char == 'Y') |
|
269 |
- return ('SGD') |
|
270 |
- |
|
271 |
- if (first.char %in% c ('P', 'Q', 'O', 'A', 'C')) |
|
272 |
- return ('UNIPROT') |
|
273 |
- |
|
274 |
- if (length (grep ('^EAW', moleculeName)) == 1) |
|
275 |
- return ('NCBI') |
|
276 |
- |
|
277 |
- if (length (grep ('^EAX', moleculeName)) == 1) |
|
278 |
- return ('NCBI') |
|
279 |
- |
|
280 |
- if (length (grep ('^NP_', moleculeName)) == 1) |
|
281 |
- return ('REFSEQ') |
|
282 |
- |
|
283 |
- if (length (grep ('^BAD', moleculeName)) == 1) |
|
284 |
- return ('EMBL') |
|
285 |
- |
|
286 |
- return (NA_character_) |
|
287 |
- |
|
288 |
-} # guessProteinIdentifierType |
|
289 |
-#------------------------------------------------------------------------------------------------------------------------ |
|
290 |
-normalizeMatrices = function (matrices) |
|
291 |
-{ |
|
292 |
- mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) |
|
293 |
- invisible (mtx.normalized) |
|
294 |
- |
|
295 |
-} # normalizeMatrices |
|
296 |
-#------------------------------------------------------------------------------------------------------------------------ |
|
269 |
+#---------------------------------------------------------------------------------------------------- |
|
297 | 270 |
assignGeneId = function (proteinId) |
298 | 271 |
{ |
299 | 272 |
if (!exists ('id.uniprot.human')) { |
... | ... |
@@ -339,4 +312,4 @@ assignGeneId = function (proteinId) |
339 | 312 |
return (list (geneId=NA_character_, type=NA_character_)) |
340 | 313 |
|
341 | 314 |
} # assignGeneId |
342 |
-#------------------------------------------------------------------------------------------------------------------------ |
|
315 |
+#----------------------------------------------------------------------------- |
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@88753 bc3139a8-67e5-0310-9ffc-ced21a209358
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@86588 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,342 @@ |
1 |
+# jaspar/import.R |
|
2 |
+# |
|
3 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
4 |
+library (org.Hs.eg.db) |
|
5 |
+library (org.Mm.eg.db) |
|
6 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
7 |
+printf <- function(...) print(noquote(sprintf(...))) |
|
8 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
9 |
+run = function (dataDir) |
|
10 |
+{ |
|
11 |
+ dataDir <- file.path(dataDir, "jaspar") |
|
12 |
+ tbl.rmat = readRawMatrices (dataDir) |
|
13 |
+ matrices = convertRawMatricesToStandard (tbl.rmat) |
|
14 |
+ tbl.anno = createAnnotationTable (dataDir) |
|
15 |
+ tbl.md = createMetadataTable (tbl.anno, matrices) |
|
16 |
+ matrices = renameMatrices (matrices, tbl.md) |
|
17 |
+ matrices = normalizeMatrices (matrices) |
|
18 |
+ serializedFile <- "jaspar.RData" |
|
19 |
+ save (matrices, tbl.md, file=serializedFile) |
|
20 |
+ printf("saved %d matrices to %s", length(matrices), serializedFile) |
|
21 |
+ printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
22 |
+ |
|
23 |
+} # run |
|
24 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
25 |
+ |
|
26 |
+readRawMatrices = function (dataDir) |
|
27 |
+{ |
|
28 |
+ file <- file.path(dataDir, 'MATRIX_DATA.txt') |
|
29 |
+ #tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric')) |
|
30 |
+ #colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count') |
|
31 |
+ tbl.matrices = read.table (file, sep='\n', header=FALSE ) |
|
32 |
+ |
|
33 |
+ invisible (tbl.matrices) |
|
34 |
+ |
|
35 |
+} # readRawMatrices |
|
36 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
37 |
+convertRawMatricesToStandard = function (tbl.rmat) |
|
38 |
+{ |
|
39 |
+ #matrix.ids = unique (tbl.rmat$id) |
|
40 |
+ #result = vector ('list', length (matrix.ids)) |
|
41 |
+ |
|
42 |
+ #i = 1 |
|
43 |
+ #for (matrix.id in matrix.ids) { |
|
44 |
+ # tbl.sub = subset (tbl.rmat, id == matrix.id) |
|
45 |
+ # # sanity check this rather unusual representation of a position count matrix |
|
46 |
+ # base.count = as.data.frame (table (tbl.sub$base)) |
|
47 |
+ # stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T')) |
|
48 |
+ # conservative length check. actually expect sequence lengths of 6 - 20 bases |
|
49 |
+ # if (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) { |
|
50 |
+ # printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1]) |
|
51 |
+ # } |
|
52 |
+ # stopifnot (all (base.count$Freq == base.count$Freq [1])) |
|
53 |
+ # nucleotide.counts = tbl.sub$count |
|
54 |
+ # row.names = c('A', 'C', 'G', 'T'); |
|
55 |
+ # col.names = 1:(nrow (tbl.sub) / 4) |
|
56 |
+ # m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names)) |
|
57 |
+ # result [[i]] = m |
|
58 |
+ # i = i + 1 |
|
59 |
+ # } # for matrix.id |
|
60 |
+ |
|
61 |
+ #names (result) = matrix.ids |
|
62 |
+ |
|
63 |
+ result<- vector("list",length=593) |
|
64 |
+ matrices.id <-grep('>',tbl.matrices[[1]],value=TRUE) |
|
65 |
+ names (result) = matrices.id |
|
66 |
+ #Get a list of the ID's and then assign them as names |
|
67 |
+ m.d <-grep('>',tbl.matrices[[1]],value=TRUE,invert=TRUE) |
|
68 |
+ data.list=list() |
|
69 |
+ x = 1 |
|
70 |
+ y = 1 |
|
71 |
+ while (x < length(m.d)) |
|
72 |
+ { |
|
73 |
+ matrices.position <- rbind(m.d[x],m.d[x+1],m.d[x+2],m.d[x+3]) |
|
74 |
+ matrices.formatted <- gsub("\t"," ",matrices.position) |
|
75 |
+ #matrices.formatted2 <- as.numeric(strsplit(matrices.position, "\t")) |
|
76 |
+ row.names = c('A', 'C', 'G', 'T') |
|
77 |
+ col.names = 1:(ncol (matrices.formatted)) |
|
78 |
+ #matrices.matrix <- matrix (matrices.formatted, byrow=TRUE, nrow=4,dimnames=list(row.names,)) |
|
79 |
+ result[[y]] <- matrices.formatted |
|
80 |
+ y=y+1 |
|
81 |
+ x=x+4 |
|
82 |
+ } |
|
83 |
+ browser() |
|
84 |
+ return (result) |
|
85 |
+ |
|
86 |
+} # convertRawMatricesToStandard |
|
87 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
88 |
+# read 'mysql' tables provide by jaspar: |
|
89 |
+# MATRIX.txt: 9229 CORE MA0001 1 AGL3 |
|
90 |
+# MATRIX_PROTEIN.txt: 9229 P29383 |
|
91 |
+# MATRIX_SPECIES.txt: 9229 3702 |
|
92 |
+# MATRIX_ANNOTATION.txt: |
|
93 |
+# 9229 class Other Alpha-Helix |
|
94 |
+# 9229 comment - |
|
95 |
+# 9229 family MADS |
|
96 |
+# 9229 medline 7632923 |
|
97 |
+# 9229 tax_group plants |
|
98 |
+# 9229 type SELEX |
|
99 |
+createAnnotationTable = function (dataDir) |
|
100 |
+{ |
|
101 |
+ file <- file.path(dataDir, "MATRIX.txt") |
|
102 |
+ tbl.matrix = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
103 |
+ colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder') |
|
104 |
+ |
|
105 |
+ file <- file.path(dataDir, "MATRIX_PROTEIN.txt") |
|
106 |
+ tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
107 |
+ colnames (tbl.protein) = c ('id', 'proteinID') |
|
108 |
+ |
|
109 |
+ file <- file.path(dataDir, "MATRIX_SPECIES.txt") |
|
110 |
+ tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE) |
|
111 |
+ colnames (tbl.species) = c ('id', 'speciesID') |
|
112 |
+ |
|
113 |
+ file <- file.path(dataDir, "MATRIX_ANNOTATION.txt") |
|
114 |
+ tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="") |
|
115 |
+ colnames (tbl.anno) = c ('id', 'attribute', 'value') |
|
116 |
+ |
|
117 |
+ tbl.family = subset (tbl.anno, attribute=='family') [, -2]; |
|
118 |
+ colnames (tbl.family) = c ('id', 'family') |
|
119 |
+ |
|
120 |
+ tbl.tax = subset (tbl.anno, attribute=='tax_group') [,-2]; |
|
121 |
+ colnames (tbl.tax) = c ('id', 'tax') |
|
122 |
+ |
|
123 |
+ tbl.class = subset (tbl.anno, attribute=='class') [,-2]; |
|
124 |
+ colnames (tbl.class) = c ('id', 'class') |
|
125 |
+ |
|
126 |
+ tbl.comment = subset (tbl.anno, attribute=='comment')[,-2]; |
|
127 |
+ colnames (tbl.comment) = c ('id', 'comment') |
|
128 |
+ |
|
129 |
+ tbl.pubmed = subset (tbl.anno, attribute=='medline')[,-2]; |
|
130 |
+ colnames (tbl.pubmed) = c ('id', 'pubmed') |
|
131 |
+ |
|
132 |
+ tbl.type = subset (tbl.anno, attribute=='type')[,-2]; |
|
133 |
+ colnames (tbl.type) = c ('id', 'type') |
|
134 |
+ |
|
135 |
+ |
|
136 |
+ tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE) |
|
137 |
+ tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE) |
|
138 |
+ tbl.md = merge (tbl.md, tbl.family, all.x=TRUE) |
|
139 |
+ tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE) |
|
140 |
+ tbl.md = merge (tbl.md, tbl.class, all.x=TRUE) |
|
141 |
+ tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE) |
|
142 |
+ tbl.md = merge (tbl.md, tbl.type, all.x=TRUE) |
|
143 |
+ |
|
144 |
+ fullID = paste (tbl.md$mID, tbl.md$version, sep='.') |
|
145 |
+ tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE) |
|
146 |
+ |
|
147 |
+ invisible (tbl.md) |
|
148 |
+ |
|
149 |
+} # createAnnotationTable |
|
150 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
151 |
+# assemble these columns: |
|
152 |
+# names=character(), # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a |
|
153 |
+# nativeNames=character(), # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt |
|
154 |
+# geneSymbols=character(), # ABF2, Rhox11 |
|
155 |
+# sequenceCounts=integer(), # often NA |
|
156 |
+# organisms=character(), # Scerevisiae, Mmusculus |
|
157 |
+# bindingMolecules=character(), # YMR072W, 194738 |
|
158 |
+# bindingMoleculeIdTypes=character(), # SGD, entrez gene |
|
159 |
+# bindingDomainTypes=character(), # NA, Homeo |
|
160 |
+# dataSources=character(), # ScerTF, UniPROBE |
|
161 |
+# experimentTypes=character(), # NA, protein-binding microarray |
|
162 |
+# pubmedIDs=character(), # 19111667, 1858359 |
|
163 |
+# tfFamilies=character()) # NA, NA |
|
164 |
+# |
|
165 |
+# from these |
|
166 |
+# |
|
167 |
+createMetadataTable = function (tbl.anno, matrices) |
|
168 |
+{ |
|
169 |
+ options (stringsAsFactors=FALSE) |
|
170 |
+ tbl.md = data.frame () |
|
171 |
+ matrix.ids = names (matrices) |
|
172 |
+ dataSource = 'JASPAR_CORE' |
|
173 |
+ |
|
174 |
+ for (matrix.id in matrix.ids) { |
|
175 |
+ matrix = matrices [[matrix.id]] |
|
176 |
+ stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1) |
|
177 |
+ tbl.sub = subset (tbl.anno, id==matrix.id) |
|
178 |
+ if (nrow (tbl.sub) > 1) { |
|
179 |
+ # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr |
|
180 |
+ # some others, a sampling: P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510 |
|
181 |
+ dimer = paste (unique (tbl.sub$proteinID), collapse=';') |
|
182 |
+ tbl.sub [1, 'proteinID'] = dimer |
|
183 |
+ } |
|
184 |
+ anno = as.list (tbl.sub [1,]) |
|
185 |
+ taxon.code = anno$speciesID |
|
186 |
+ geneId.info = assignGeneId (anno$proteinID) |
|
187 |
+ new.row = list (providerName=anno$binder, |
|
188 |
+ providerId=anno$fullID, |
|
189 |
+ dataSource=dataSource, |
|
190 |
+ geneSymbol=anno$binder, |
|
191 |
+ geneId=geneId.info$geneId, |
|
192 |
+ geneIdType=geneId.info$type, |
|
193 |
+ proteinId=anno$proteinID, |
|
194 |
+ proteinIdType=guessProteinIdentifierType (anno$proteinID), |
|
195 |
+ organism=convertTaxonCode(anno$speciesID), |
|
196 |
+ sequenceCount=as.integer (mean (colSums (matrix))), |
|
197 |
+ bindingSequence=NA_character_, |
|
198 |
+ bindingDomain=anno$class, |
|
199 |
+ tfFamily=anno$family, |
|
200 |
+ experimentType=anno$type, |
|
201 |
+ pubmedID=anno$pubmed) |
|
202 |
+ tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE)) |
|
203 |
+ full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID) |
|
204 |
+ rownames (tbl.md) [nrow (tbl.md)] = full.name |
|
205 |
+ } # for i |
|
206 |
+ |
|
207 |
+ # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA> |
|
208 |
+ NA.string.indices = grep ('NA', tbl.md$proteinId) |
|
209 |
+ if (length (NA.string.indices) > 0) |
|
210 |
+ tbl.md$proteinId [NA.string.indices] = NA |
|
211 |
+ |
|
212 |
+ invisible (tbl.md) |
|
213 |
+ |
|
214 |
+} # createMetadataTable |
|
215 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
216 |
+renameMatrices = function (matrices, tbl.md) |
|
217 |
+{ |
|
218 |
+ stopifnot (length (matrices) == nrow (tbl.md)) |
|
219 |
+ names (matrices) = rownames (tbl.md) |
|
220 |
+ invisible (matrices) |
|
221 |
+ |
|
222 |
+} # renameMatrices |
|
223 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
224 |
+# full names: ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', |
|
225 |
+# 'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam', |
|
226 |
+# 'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster', |
|
227 |
+# 'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis', |
|
228 |
+# 'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'), |
|
229 |
+convertTaxonCode = function (ncbi.code) |
|
230 |
+{ |
|
231 |
+ #if (is.na (ncbi.code)) |
|
232 |
+ # return (NA_character_) |
|
233 |
+ ncbi.code = as.character (ncbi.code) |
|
234 |
+ if (ncbi.code %in% c ('-', NA_character_, 'NA')) |
|
235 |
+ return ('Vertebrata') |
|
236 |
+ |
|
237 |
+ tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102', |
|
238 |
+ '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729', |
|
239 |
+ '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'), |
|
240 |
+ name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', |
|
241 |
+ 'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam', |
|
242 |
+ 'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster', |
|
243 |
+ 'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', |
|
244 |
+ 'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'), |
|
245 |
+ stringsAsFactors=FALSE) |
|
246 |
+ |
|
247 |
+ ncbi.code = as.character (ncbi.code) |
|
248 |
+ index = which (tbl$code == ncbi.code) |
|
249 |
+ if (length (index) == 1) |
|
250 |
+ return (tbl$name [index]) |
|
251 |
+ else { |
|
252 |
+ write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ()) |
|
253 |
+ return (NA_character_) |
|
254 |
+ } |
|
255 |
+ |
|
256 |
+} # convertTaxonCode |
|
257 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
258 |
+# an empirical and not altogether trustworthy solution to identifying identifier types. |
|
259 |
+guessProteinIdentifierType = function (moleculeName) |
|
260 |
+{ |
|
261 |
+ if (nchar (moleculeName) == 0) |
|
262 |
+ return (NA_character_) |
|
263 |
+ if (is.na (moleculeName)) |
|
264 |
+ return (NA_character_) |
|
265 |
+ |
|
266 |
+ first.char = substr (moleculeName, 1, 1) |
|
267 |
+ |
|
268 |
+ if (first.char == 'Y') |
|
269 |
+ return ('SGD') |
|
270 |
+ |
|
271 |
+ if (first.char %in% c ('P', 'Q', 'O', 'A', 'C')) |
|
272 |
+ return ('UNIPROT') |
|
273 |
+ |
|
274 |
+ if (length (grep ('^EAW', moleculeName)) == 1) |
|
275 |
+ return ('NCBI') |
|
276 |
+ |
|
277 |
+ if (length (grep ('^EAX', moleculeName)) == 1) |
|
278 |
+ return ('NCBI') |
|
279 |
+ |
|
280 |
+ if (length (grep ('^NP_', moleculeName)) == 1) |
|
281 |
+ return ('REFSEQ') |
|
282 |
+ |
|
283 |
+ if (length (grep ('^BAD', moleculeName)) == 1) |
|
284 |
+ return ('EMBL') |
|
285 |
+ |
|
286 |
+ return (NA_character_) |
|
287 |
+ |
|
288 |
+} # guessProteinIdentifierType |
|
289 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
290 |
+normalizeMatrices = function (matrices) |
|
291 |
+{ |
|
292 |
+ mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) |
|
293 |
+ invisible (mtx.normalized) |
|
294 |
+ |
|
295 |
+} # normalizeMatrices |
|
296 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
297 |
+assignGeneId = function (proteinId) |
|
298 |
+{ |
|
299 |
+ if (!exists ('id.uniprot.human')) { |
|
300 |
+ |
|
301 |
+ tbl = toTable (org.Hs.egUNIPROT) |
|
302 |
+ id.uniprot.human <<- as.character (tbl$gene_id) |
|
303 |
+ names (id.uniprot.human) <<- tbl$uniprot_id |
|
304 |
+ |
|
305 |
+ tbl = toTable (org.Hs.egREFSEQ) |
|
306 |
+ tbl = tbl [grep ('^NP_', tbl$accession),] |
|
307 |
+ id.refseq.human <<- as.character (tbl$gene_id) |
|
308 |
+ names (id.refseq.human) <<- tbl$accession |
|
309 |
+ |
|
310 |
+ tbl = toTable (org.Mm.egUNIPROT) |
|
311 |
+ id.uniprot.mouse <<- as.character (tbl$gene_id) |
|
312 |
+ names (id.uniprot.mouse) <<- tbl$uniprot_id |
|
313 |
+ |
|
314 |
+ tbl = toTable (org.Mm.egREFSEQ) |
|
315 |
+ tbl = tbl [grep ('^NP_', tbl$accession),] |
|
316 |
+ id.refseq.mouse <<- as.character (tbl$gene_id) |
|
317 |
+ names (id.refseq.mouse) <<- tbl$accession |
|
318 |
+ } |
|
319 |
+ |
|
320 |
+ proteinId = strsplit (proteinId, '\\.')[[1]][1] # remove any trailing '.*' |
|
321 |
+ |
|
322 |
+ if (proteinId %in% names (id.uniprot.human)) |
|
323 |
+ return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ')) |
|
324 |
+ |
|
325 |
+ if (proteinId %in% names (id.uniprot.mouse)) |
|
326 |
+ return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ')) |
|
327 |
+ |
|
328 |
+ if (proteinId %in% names (id.refseq.human)) |
|
329 |
+ return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ')) |
|
330 |
+ |
|
331 |
+ if (proteinId %in% names (id.refseq.mouse)) |
|
332 |
+ return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ')) |
|
333 |
+ |
|
334 |
+ found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE)) |
|
335 |
+ |
|
336 |
+ if (found.leading.Y == 1) |
|
337 |
+ return (list (geneId=proteinId, type='SGD')) |
|
338 |
+ |
|
339 |
+ return (list (geneId=NA_character_, type=NA_character_)) |
|
340 |
+ |
|
341 |
+} # assignGeneId |
|
342 |
+#------------------------------------------------------------------------------------------------------------------------ |