Browse code

Matrix.txt needs to be MATRIX.txt - discovered when running this on linux which, in contrast to macos, is case sensitive in filenames

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@107889 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 28/08/2015 18:54:35
Showing1 changed files
... ...
@@ -62,7 +62,7 @@ extractMatrices = function (pwm.list)
62 62
 #------------------------------------------------------------------------------------------------------------------------
63 63
 createAnnotationTable <- function(dataDir)
64 64
 {
65
-    file <- file.path(dataDir, "Matrix.txt")
65
+    file <- file.path(dataDir, "MATRIX.txt")
66 66
     tbl.matrix <-  read.table(file, sep='\t', header=FALSE, as.is=TRUE)
67 67
     colnames(tbl.matrix) <- c('id', 'category', 'mID', 'version', 'binder')
68 68
       
Browse code

cisbp now successfully added: unit tets pass, man page for package updated

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@107858 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 28/08/2015 04:39:12
Showing1 changed files
... ...
@@ -32,7 +32,7 @@ readRawMatrices = function (dataDir)
32 32
     
33 33
 
34 34
   filename <- file.path(dataDir, "matrix_data.txt")
35
-  stopifnot(file.exists(filename))
35
+    stopifnot(file.exists(filename))
36 36
   
37 37
   all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
38 38
   title.lines = grep ('^>', all.lines)
Browse code

tristan's working versions

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@88754 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 10/04/2014 19:09:39
Showing1 changed files
... ...
@@ -1,180 +1,131 @@
1 1
 # jaspar2014/import.R
2
-# 
3
-#------------------------------------------------------------------------------------------------------------------------
2
+#-----------------------------------------------------------------------------------
4 3
 library (org.Hs.eg.db)
5 4
 library (org.Mm.eg.db)
6 5
 #------------------------------------------------------------------------------------------------------------------------
6
+options (stringsAsFactors=FALSE)
7 7
 printf <- function(...) print(noquote(sprintf(...)))
8 8
 #------------------------------------------------------------------------------------------------------------------------
9 9
 run = function (dataDir)
10 10
 {
11
-  dataDir <- file.path(dataDir, "jaspar")
12
-  tbl.rmat = readRawMatrices (dataDir)
13
-  matrices = convertRawMatricesToStandard (tbl.rmat)
14
-  tbl.anno = createAnnotationTable (dataDir)
15
-  tbl.md = createMetadataTable (tbl.anno, matrices)
16
-  matrices = renameMatrices (matrices, tbl.md)
17
-  matrices = normalizeMatrices (matrices)
18
-  serializedFile <- "jaspar.RData"
11
+  dataDir <- file.path(dataDir,"jaspar2014")
12
+  rawMatrixList <- readRawMatrices (dataDir)
13
+  matrices <- extractMatrices (rawMatrixList)
14
+  tbl.anno <- createAnnotationTable(dataDir)
15
+  tbl.md <- createMetadataTable (tbl.anno, matrices)
16
+  matrices <- normalizeMatrices (matrices)
17
+  matrices <- renameMatrices (matrices, tbl.md)
18
+  serializedFile <- file.path(dataDir, "jaspar2014.RData")
19 19
   save (matrices, tbl.md, file=serializedFile)
20 20
   printf("saved %d matrices to %s", length(matrices), serializedFile)
21
-  printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
21
+  printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
22 22
 
23 23
 } # run
24 24
 #------------------------------------------------------------------------------------------------------------------------
25
-
26 25
 readRawMatrices = function (dataDir)
27 26
 {
28
-  file <- file.path(dataDir, 'MATRIX_DATA.txt')
29
-  #tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric'))
30
-  #colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count')
31
-  tbl.matrices = read.table (file, sep='\n', header=FALSE )
27
+    # our convention is that there is a shared "dataDir" visible to
28
+    # the importer, and that within that directory there is one
29
+    # subdirectory for each data source.
30
+    # for this example importer, that directory will be <dataDir>/test
31
+    # within which we will look for one small file "sample.pcm"
32
+    
33
+
34
+  filename <- file.path(dataDir, "matrix_data.txt")
35
+  stopifnot(file.exists(filename))
32 36
   
33
-  invisible (tbl.matrices)
37
+  all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
38
+  title.lines = grep ('^>', all.lines)
39
+  title.line.count <<- length (title.lines)
40
+  max = title.line.count - 1
34 41
 
42
+  pwms = list ()
43
+  
44
+  for (i in 1:max) {
45
+    start.line = title.lines [i]
46
+    end.line = title.lines [i+1] - 1
47
+    new.pwm = parsePwm (all.lines [start.line:end.line])
48
+    pwms = c (pwms, list (new.pwm))
49
+    } # for i
50
+  invisible (pwms)
35 51
 } # readRawMatrices
36 52
 #------------------------------------------------------------------------------------------------------------------------
37
-convertRawMatricesToStandard = function (tbl.rmat)
53
+extractMatrices = function (pwm.list)
38 54
 {
39
-  #matrix.ids = unique (tbl.rmat$id)
40
-  #result =  vector ('list', length (matrix.ids))
41
-
42
-  #i = 1
43
-  #for (matrix.id in matrix.ids) {
44
-  #  tbl.sub = subset (tbl.rmat, id == matrix.id)
45
-  #    # sanity check this rather unusual representation of a position count matrix
46
-  #  base.count = as.data.frame (table (tbl.sub$base))
47
-  #  stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
48
-      # conservative length check.  actually expect sequence lengths of 6 - 20 bases
49
-  #  if  (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
50
-  #    printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
51
-  #    }
52
-  #  stopifnot (all (base.count$Freq == base.count$Freq [1]))
53
-  # nucleotide.counts = tbl.sub$count
54
-  #  row.names = c('A', 'C', 'G', 'T'); 
55
-  #  col.names = 1:(nrow (tbl.sub) / 4)
56
-  #  m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
57
-  #  result [[i]] = m
58
-  #  i = i + 1
59
-  #  } # for matrix.id
60
-
61
-  #names (result) = matrix.ids
62
-
63
-  result<- vector("list",length=593)
64
-  matrices.id <-grep('>',tbl.matrices[[1]],value=TRUE)
65
-  names (result) = matrices.id
66
-  #Get a list of the ID's and then assign them as names 
67
-  m.d <-grep('>',tbl.matrices[[1]],value=TRUE,invert=TRUE)
68
-  data.list=list()
69
-  x = 1
70
-  y = 1
71
-  while (x < length(m.d))
72
-    {   
73
-      matrices.position <- rbind(m.d[x],m.d[x+1],m.d[x+2],m.d[x+3])
74
-      matrices.formatted <- gsub("\t"," ",matrices.position)
75
-      #matrices.formatted2 <- as.numeric(strsplit(matrices.position, "\t"))
76
-      row.names = c('A', 'C', 'G', 'T')
77
-      col.names = 1:(ncol (matrices.formatted))
78
-      #matrices.matrix <- matrix (matrices.formatted, byrow=TRUE, nrow=4,dimnames=list(row.names,))
79
-      result[[y]] <- matrices.formatted
80
-      y=y+1
81
-      x=x+4
82
-    }
83
-  browser()
84
-  return (result)
85
-
86
-} # convertRawMatricesToStandard 
55
+  matrices = sapply (pwm.list, function (element) element$matrix)
56
+  matrix.names <- sapply (pwm.list, function (element) element$title)
57
+  matrix.names <- sub("^>", "", matrix.names)
58
+  names (matrices) <- matrix.names
59
+  
60
+  return (matrices)
61
+} # extractMatrices
87 62
 #------------------------------------------------------------------------------------------------------------------------
88
-# read 'mysql' tables provide by jaspar: 
89
-#          MATRIX.txt:  9229	CORE	MA0001	1	AGL3
90
-#  MATRIX_PROTEIN.txt: 9229	P29383
91
-#  MATRIX_SPECIES.txt: 9229	3702
92
-#  MATRIX_ANNOTATION.txt: 
93
-#     9229	class	Other Alpha-Helix
94
-#     9229	comment	-
95
-#     9229	family	MADS
96
-#     9229	medline	7632923
97
-#     9229	tax_group	plants
98
-#     9229	type	SELEX
99
-createAnnotationTable = function (dataDir)
63
+createAnnotationTable <- function(dataDir)
100 64
 {
101
-  file <- file.path(dataDir, "MATRIX.txt")
102
-  tbl.matrix =  read.table (file, sep='\t', header=F, as.is=TRUE)
103
-  colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
104
-
105
-  file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
106
-  tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE)
107
-  colnames (tbl.protein) =  c ('id', 'proteinID')
108
-
109
-  file <- file.path(dataDir, "MATRIX_SPECIES.txt")
110
-  tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE)
111
-  colnames (tbl.species) = c ('id', 'speciesID')
112
-
113
-  file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
114
-  tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="")
115
-  colnames (tbl.anno) = c ('id', 'attribute', 'value')
116
-
117
-  tbl.family  = subset (tbl.anno, attribute=='family') [, -2];   
118
-  colnames (tbl.family) = c ('id', 'family')
119
-
120
-  tbl.tax     = subset (tbl.anno, attribute=='tax_group') [,-2]; 
121
-  colnames (tbl.tax) = c ('id', 'tax')
122
-
123
-  tbl.class   = subset (tbl.anno, attribute=='class') [,-2];     
124
-  colnames (tbl.class) = c ('id', 'class')
125
-
126
-  tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];    
127
-  colnames (tbl.comment) = c ('id', 'comment')
128
-
129
-  tbl.pubmed  = subset (tbl.anno, attribute=='medline')[,-2];    
130
-  colnames (tbl.pubmed) = c ('id', 'pubmed')
131
-
132
-  tbl.type    = subset (tbl.anno, attribute=='type')[,-2];       
133
-  colnames (tbl.type) = c ('id', 'type')
134
-
135
-
136
-  tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
137
-  tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
138
-  tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
139
-  tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
140
-  tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
141
-  tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
142
-  tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
143
-
144
-  fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
145
-  tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
146
-
147
-  invisible (tbl.md)
148
-
65
+    file <- file.path(dataDir, "Matrix.txt")
66
+    tbl.matrix <-  read.table(file, sep='\t', header=FALSE, as.is=TRUE)
67
+    colnames(tbl.matrix) <- c('id', 'category', 'mID', 'version', 'binder')
68
+      
69
+    file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
70
+    stopifnot(file.exists(file))
71
+    tbl.protein <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
72
+    colnames(tbl.protein) <- c('id', 'proteinID')
73
+
74
+    file <- file.path(dataDir, "MATRIX_SPECIES.txt")
75
+    stopifnot(file.exists(file))
76
+    tbl.species <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
77
+    colnames(tbl.species) <- c('id', 'speciesID')
78
+    
79
+    file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
80
+    stopifnot(file.exists(file))
81
+    tbl.anno <- read.table(file, sep='\t', header=FALSE, as.is=TRUE, quote="")
82
+    colnames(tbl.anno) <- c('id', 'attribute', 'value')
83
+
84
+    file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
85
+    tbl.family  <- subset(tbl.anno, attribute=='family') [, -2];   
86
+    colnames(tbl.family) <- c('id', 'family')
87
+       # create 5 2-column data.frames out of tbl.anno
88
+       # which can all then be merged on the "id" column
89
+    tbl.tax <- subset(tbl.anno, attribute=='tax_group') [,-2]; 
90
+    colnames(tbl.tax) <- c('id', 'tax')
91
+
92
+    tbl.class <- subset(tbl.anno, attribute=='class') [,-2];     
93
+    colnames(tbl.class) <- c('id', 'class')
94
+
95
+    tbl.comment <- subset(tbl.anno, attribute=='comment')[,-2];    
96
+    colnames(tbl.comment) <- c('id', 'comment')
97
+
98
+    tbl.pubmed  <- subset(tbl.anno, attribute=='medline')[,-2];    
99
+    colnames(tbl.pubmed) <- c('id', 'pubmed')
100
+
101
+    tbl.type    <- subset(tbl.anno, attribute=='type')[,-2];       
102
+    colnames(tbl.type) <- c('id', 'type')
103
+
104
+    tbl.md <- merge(tbl.matrix, tbl.species, all.x=TRUE)
105
+    tbl.md <- merge(tbl.md, tbl.protein, all.x=TRUE)
106
+    tbl.md <- merge(tbl.md, tbl.family, all.x=TRUE)
107
+    tbl.md <- merge(tbl.md, tbl.tax, all.x=TRUE)
108
+    tbl.md <- merge(tbl.md, tbl.class, all.x=TRUE)
109
+    tbl.md <- merge(tbl.md, tbl.pubmed, all.x=TRUE)
110
+    tbl.md <- merge(tbl.md, tbl.type, all.x=TRUE)
111
+
112
+    fullID <- paste(tbl.md$mID, tbl.md$version, sep='.')
113
+    tbl.md <- cbind(fullID, tbl.md, stringsAsFactors=FALSE)
114
+
115
+    invisible(tbl.md)
116
+    
149 117
 } # createAnnotationTable
150
-#------------------------------------------------------------------------------------------------------------------------
151
-# assemble these columns:
152
-#                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
153
-#                      nativeNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
154
-#                      geneSymbols=character(),              # ABF2, Rhox11
155
-#                      sequenceCounts=integer(),             # often NA
156
-#                      organisms=character(),                # Scerevisiae, Mmusculus
157
-#                      bindingMolecules=character(),         # YMR072W, 194738
158
-#                      bindingMoleculeIdTypes=character(),   # SGD, entrez gene
159
-#                      bindingDomainTypes=character(),       # NA, Homeo
160
-#                      dataSources=character(),              # ScerTF, UniPROBE
161
-#                      experimentTypes=character(),          # NA, protein-binding microarray
162
-#                      pubmedIDs=character(),                # 19111667, 1858359
163
-#                      tfFamilies=character())               # NA, NA
164
-#
165
-# from these
166
-#
118
+#-------------------------------------------------------------------------------
167 119
 createMetadataTable = function (tbl.anno, matrices)
168 120
 {
169 121
   options (stringsAsFactors=FALSE)
170 122
   tbl.md = data.frame ()
171 123
   matrix.ids = names (matrices) 
172
-  dataSource = 'JASPAR_CORE'
124
+  dataSource = 'JASPAR_2014'
173 125
   
174 126
   for (matrix.id in matrix.ids) {
175 127
     matrix = matrices [[matrix.id]]
176
-    stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1)
177
-    tbl.sub = subset (tbl.anno, id==matrix.id)
128
+    tbl.sub = subset (tbl.anno, fullID==substr(matrix.id,0,8))
178 129
     if (nrow (tbl.sub) > 1) {  
179 130
         # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
180 131
         # some others, a sampling:  P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
... ...
@@ -208,19 +159,77 @@ createMetadataTable = function (tbl.anno, matrices)
208 159
     NA.string.indices = grep ('NA', tbl.md$proteinId)
209 160
     if (length (NA.string.indices) > 0)
210 161
       tbl.md$proteinId [NA.string.indices] = NA
211
-     
212 162
    invisible (tbl.md)
213
-
214
-} # createMetadataTable
163
+}#createMetadataTable
215 164
 #------------------------------------------------------------------------------------------------------------------------
216 165
 renameMatrices = function (matrices, tbl.md)
217 166
 {
218 167
   stopifnot (length (matrices) == nrow (tbl.md))
219 168
   names (matrices) = rownames (tbl.md)
220 169
   invisible (matrices)
221
-
170
+  
222 171
 } # renameMatrices
223 172
 #------------------------------------------------------------------------------------------------------------------------
173
+# an empirical and not altogether trustworthy solution to identifying identifier types.
174
+guessProteinIdentifierType = function (moleculeName)
175
+{
176
+  if (nchar (moleculeName) == 0)
177
+    return (NA_character_)
178
+  if (is.na (moleculeName))
179
+    return (NA_character_) 
180
+
181
+  first.char = substr (moleculeName, 1, 1)
182
+
183
+  if (first.char == 'Y')
184
+    return ('SGD')
185
+
186
+  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
187
+    return ('UNIPROT')
188
+
189
+  if (length (grep ('^NP_', moleculeName)) == 1)
190
+    return ('REFSEQ')
191
+
192
+   return (NA_character_)
193
+
194
+} # guessProteinIdentifierType
195
+#------------------------------------------------------------------------------------------------------------------------
196
+normalizeMatrices = function (matrices)
197
+{
198
+  mtx.normalized = sapply (matrices,
199
+      function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
200
+
201
+  invisible (mtx.normalized)
202
+
203
+} # normalizeMatrices
204
+#------------------------------------------------------------------------------------------------------------------------
205
+parsePwm = function (text)
206
+{
207
+   lines = strsplit (text, '\t')
208
+   #browser()
209
+   stopifnot(length(lines)==5) # title line, one line for each base
210
+   title = lines [[1]][1]
211
+   line.count = length(lines)
212
+
213
+     # expect 4 rows, and a number of columns we can discern from
214
+     # the incoming text.
215
+  cols <- length(lines[[2]])
216
+  result <- matrix (nrow=4, ncol=cols,
217
+                   dimnames=list(c('A','C','G','T'),
218
+                                 as.character(1:cols)))
219
+  row = 1
220
+  for(i in 2:line.count){
221
+    result [row,] = as.numeric (lines[[i]])
222
+    row = row + 1
223
+    } # for i
224
+
225
+  #result = t (result)
226
+    
227
+  #return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
228
+  return (list (title=title, matrix=result))
229
+
230
+} # parsePwm
231
+#----------------------------------------------------------------------------------------------------
232
+#------------------------------------------------------------------------------------------------------------------------
224 233
 # full names:   ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', 
225 234
 #                'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
226 235
 #                'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
... ...
@@ -248,52 +257,16 @@ convertTaxonCode = function (ncbi.code)
248 257
   index = which (tbl$code == ncbi.code)
249 258
   if (length (index) == 1)
250 259
     return (tbl$name [index])
260
+  if (nchar(ncbi.code)>6)
261
+    return ('Vertebrata')
251 262
   else {
252
-    write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
263
+    browser()
264
+    write (sprintf ("unable to map organism code |%s|", ncbi.code), stderr ())
253 265
     return (NA_character_)
254 266
     }
255 267
 
256 268
 } # convertTaxonCode
257
-#------------------------------------------------------------------------------------------------------------------------
258
-# an empirical and not altogether trustworthy solution to identifying identifier types.
259
-guessProteinIdentifierType = function (moleculeName)
260
-{
261
-  if (nchar (moleculeName) == 0)
262
-    return (NA_character_)
263
-  if (is.na (moleculeName))
264
-    return (NA_character_) 
265
-
266
-  first.char = substr (moleculeName, 1, 1)
267
-
268
-  if (first.char == 'Y')
269
-    return ('SGD')
270
-
271
-  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
272
-    return ('UNIPROT')
273
-
274
-  if (length (grep ('^EAW', moleculeName)) == 1)
275
-    return ('NCBI')
276
-
277
-  if (length (grep ('^EAX', moleculeName)) == 1)
278
-    return ('NCBI')
279
-
280
-  if (length (grep ('^NP_', moleculeName)) == 1)
281
-    return ('REFSEQ')
282
-
283
-  if (length (grep ('^BAD', moleculeName)) == 1)
284
-    return ('EMBL')
285
-
286
-   return (NA_character_)
287
-
288
-} # guessProteinIdentifierType
289
-#------------------------------------------------------------------------------------------------------------------------
290
-normalizeMatrices = function (matrices)
291
-{
292
-  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
293
-  invisible (mtx.normalized)
294
-
295
-} # normalizeMatrices
296
-#------------------------------------------------------------------------------------------------------------------------
269
+#----------------------------------------------------------------------------------------------------
297 270
 assignGeneId = function (proteinId)
298 271
 {
299 272
   if (!exists ('id.uniprot.human')) {
... ...
@@ -339,4 +312,4 @@ assignGeneId = function (proteinId)
339 312
   return (list (geneId=NA_character_, type=NA_character_))
340 313
 
341 314
 } # assignGeneId
342
-#------------------------------------------------------------------------------------------------------------------------
315
+#-----------------------------------------------------------------------------
Browse code

bringing this up to date before overwriting it with Tristan's version

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@88753 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 10/04/2014 19:07:57
Showing1 changed files
... ...
@@ -1,4 +1,4 @@
1
-# jaspar/import.R
1
+# jaspar2014/import.R
2 2
 # 
3 3
 #------------------------------------------------------------------------------------------------------------------------
4 4
 library (org.Hs.eg.db)
Browse code

moved tristan's new work to a more appropriate directory

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@86588 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 19/02/2014 18:03:49
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,342 @@
1
+# jaspar/import.R
2
+# 
3
+#------------------------------------------------------------------------------------------------------------------------
4
+library (org.Hs.eg.db)
5
+library (org.Mm.eg.db)
6
+#------------------------------------------------------------------------------------------------------------------------
7
+printf <- function(...) print(noquote(sprintf(...)))
8
+#------------------------------------------------------------------------------------------------------------------------
9
+run = function (dataDir)
10
+{
11
+  dataDir <- file.path(dataDir, "jaspar")
12
+  tbl.rmat = readRawMatrices (dataDir)
13
+  matrices = convertRawMatricesToStandard (tbl.rmat)
14
+  tbl.anno = createAnnotationTable (dataDir)
15
+  tbl.md = createMetadataTable (tbl.anno, matrices)
16
+  matrices = renameMatrices (matrices, tbl.md)
17
+  matrices = normalizeMatrices (matrices)
18
+  serializedFile <- "jaspar.RData"
19
+  save (matrices, tbl.md, file=serializedFile)
20
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
21
+  printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
22
+
23
+} # run
24
+#------------------------------------------------------------------------------------------------------------------------
25
+
26
+readRawMatrices = function (dataDir)
27
+{
28
+  file <- file.path(dataDir, 'MATRIX_DATA.txt')
29
+  #tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric'))
30
+  #colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count')
31
+  tbl.matrices = read.table (file, sep='\n', header=FALSE )
32
+  
33
+  invisible (tbl.matrices)
34
+
35
+} # readRawMatrices
36
+#------------------------------------------------------------------------------------------------------------------------
37
+convertRawMatricesToStandard = function (tbl.rmat)
38
+{
39
+  #matrix.ids = unique (tbl.rmat$id)
40
+  #result =  vector ('list', length (matrix.ids))
41
+
42
+  #i = 1
43
+  #for (matrix.id in matrix.ids) {
44
+  #  tbl.sub = subset (tbl.rmat, id == matrix.id)
45
+  #    # sanity check this rather unusual representation of a position count matrix
46
+  #  base.count = as.data.frame (table (tbl.sub$base))
47
+  #  stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
48
+      # conservative length check.  actually expect sequence lengths of 6 - 20 bases
49
+  #  if  (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
50
+  #    printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
51
+  #    }
52
+  #  stopifnot (all (base.count$Freq == base.count$Freq [1]))
53
+  # nucleotide.counts = tbl.sub$count
54
+  #  row.names = c('A', 'C', 'G', 'T'); 
55
+  #  col.names = 1:(nrow (tbl.sub) / 4)
56
+  #  m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
57
+  #  result [[i]] = m
58
+  #  i = i + 1
59
+  #  } # for matrix.id
60
+
61
+  #names (result) = matrix.ids
62
+
63
+  result<- vector("list",length=593)
64
+  matrices.id <-grep('>',tbl.matrices[[1]],value=TRUE)
65
+  names (result) = matrices.id
66
+  #Get a list of the ID's and then assign them as names 
67
+  m.d <-grep('>',tbl.matrices[[1]],value=TRUE,invert=TRUE)
68
+  data.list=list()
69
+  x = 1
70
+  y = 1
71
+  while (x < length(m.d))
72
+    {   
73
+      matrices.position <- rbind(m.d[x],m.d[x+1],m.d[x+2],m.d[x+3])
74
+      matrices.formatted <- gsub("\t"," ",matrices.position)
75
+      #matrices.formatted2 <- as.numeric(strsplit(matrices.position, "\t"))
76
+      row.names = c('A', 'C', 'G', 'T')
77
+      col.names = 1:(ncol (matrices.formatted))
78
+      #matrices.matrix <- matrix (matrices.formatted, byrow=TRUE, nrow=4,dimnames=list(row.names,))
79
+      result[[y]] <- matrices.formatted
80
+      y=y+1
81
+      x=x+4
82
+    }
83
+  browser()
84
+  return (result)
85
+
86
+} # convertRawMatricesToStandard 
87
+#------------------------------------------------------------------------------------------------------------------------
88
+# read 'mysql' tables provide by jaspar: 
89
+#          MATRIX.txt:  9229	CORE	MA0001	1	AGL3
90
+#  MATRIX_PROTEIN.txt: 9229	P29383
91
+#  MATRIX_SPECIES.txt: 9229	3702
92
+#  MATRIX_ANNOTATION.txt: 
93
+#     9229	class	Other Alpha-Helix
94
+#     9229	comment	-
95
+#     9229	family	MADS
96
+#     9229	medline	7632923
97
+#     9229	tax_group	plants
98
+#     9229	type	SELEX
99
+createAnnotationTable = function (dataDir)
100
+{
101
+  file <- file.path(dataDir, "MATRIX.txt")
102
+  tbl.matrix =  read.table (file, sep='\t', header=F, as.is=TRUE)
103
+  colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
104
+
105
+  file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
106
+  tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE)
107
+  colnames (tbl.protein) =  c ('id', 'proteinID')
108
+
109
+  file <- file.path(dataDir, "MATRIX_SPECIES.txt")
110
+  tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE)
111
+  colnames (tbl.species) = c ('id', 'speciesID')
112
+
113
+  file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
114
+  tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="")
115
+  colnames (tbl.anno) = c ('id', 'attribute', 'value')
116
+
117
+  tbl.family  = subset (tbl.anno, attribute=='family') [, -2];   
118
+  colnames (tbl.family) = c ('id', 'family')
119
+
120
+  tbl.tax     = subset (tbl.anno, attribute=='tax_group') [,-2]; 
121
+  colnames (tbl.tax) = c ('id', 'tax')
122
+
123
+  tbl.class   = subset (tbl.anno, attribute=='class') [,-2];     
124
+  colnames (tbl.class) = c ('id', 'class')
125
+
126
+  tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];    
127
+  colnames (tbl.comment) = c ('id', 'comment')
128
+
129
+  tbl.pubmed  = subset (tbl.anno, attribute=='medline')[,-2];    
130
+  colnames (tbl.pubmed) = c ('id', 'pubmed')
131
+
132
+  tbl.type    = subset (tbl.anno, attribute=='type')[,-2];       
133
+  colnames (tbl.type) = c ('id', 'type')
134
+
135
+
136
+  tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
137
+  tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
138
+  tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
139
+  tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
140
+  tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
141
+  tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
142
+  tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
143
+
144
+  fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
145
+  tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
146
+
147
+  invisible (tbl.md)
148
+
149
+} # createAnnotationTable
150
+#------------------------------------------------------------------------------------------------------------------------
151
+# assemble these columns:
152
+#                      names=character(),                    # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
153
+#                      nativeNames=character(),              # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
154
+#                      geneSymbols=character(),              # ABF2, Rhox11
155
+#                      sequenceCounts=integer(),             # often NA
156
+#                      organisms=character(),                # Scerevisiae, Mmusculus
157
+#                      bindingMolecules=character(),         # YMR072W, 194738
158
+#                      bindingMoleculeIdTypes=character(),   # SGD, entrez gene
159
+#                      bindingDomainTypes=character(),       # NA, Homeo
160
+#                      dataSources=character(),              # ScerTF, UniPROBE
161
+#                      experimentTypes=character(),          # NA, protein-binding microarray
162
+#                      pubmedIDs=character(),                # 19111667, 1858359
163
+#                      tfFamilies=character())               # NA, NA
164
+#
165
+# from these
166
+#
167
+createMetadataTable = function (tbl.anno, matrices)
168
+{
169
+  options (stringsAsFactors=FALSE)
170
+  tbl.md = data.frame ()
171
+  matrix.ids = names (matrices) 
172
+  dataSource = 'JASPAR_CORE'
173
+  
174
+  for (matrix.id in matrix.ids) {
175
+    matrix = matrices [[matrix.id]]
176
+    stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1)
177
+    tbl.sub = subset (tbl.anno, id==matrix.id)
178
+    if (nrow (tbl.sub) > 1) {  
179
+        # the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
180
+        # some others, a sampling:  P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
181
+      dimer = paste (unique (tbl.sub$proteinID), collapse=';')
182
+      tbl.sub [1, 'proteinID'] = dimer
183
+      }
184
+    anno = as.list (tbl.sub [1,])
185
+    taxon.code = anno$speciesID
186
+    geneId.info = assignGeneId (anno$proteinID)
187
+    new.row = list (providerName=anno$binder,
188
+                    providerId=anno$fullID,
189
+                    dataSource=dataSource,
190
+                    geneSymbol=anno$binder,
191
+                    geneId=geneId.info$geneId,
192
+                    geneIdType=geneId.info$type,
193
+                    proteinId=anno$proteinID,
194
+                    proteinIdType=guessProteinIdentifierType (anno$proteinID),
195
+                    organism=convertTaxonCode(anno$speciesID),
196
+                    sequenceCount=as.integer (mean (colSums (matrix))),
197
+                    bindingSequence=NA_character_,
198
+                    bindingDomain=anno$class,
199
+                    tfFamily=anno$family,
200
+                    experimentType=anno$type,
201
+                    pubmedID=anno$pubmed)
202
+    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
203
+    full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID)
204
+    rownames (tbl.md) [nrow (tbl.md)] = full.name
205
+    } # for i
206
+
207
+      # Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA>
208
+    NA.string.indices = grep ('NA', tbl.md$proteinId)
209
+    if (length (NA.string.indices) > 0)
210
+      tbl.md$proteinId [NA.string.indices] = NA
211
+     
212
+   invisible (tbl.md)
213
+
214
+} # createMetadataTable
215
+#------------------------------------------------------------------------------------------------------------------------
216
+renameMatrices = function (matrices, tbl.md)
217
+{
218
+  stopifnot (length (matrices) == nrow (tbl.md))
219
+  names (matrices) = rownames (tbl.md)
220
+  invisible (matrices)
221
+
222
+} # renameMatrices
223
+#------------------------------------------------------------------------------------------------------------------------
224
+# full names:   ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum', 
225
+#                'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
226
+#                'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
227
+#                'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis', 
228
+#                'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'),
229
+convertTaxonCode = function (ncbi.code)
230
+{
231
+  #if (is.na (ncbi.code))  
232
+  #  return (NA_character_)
233
+  ncbi.code = as.character (ncbi.code)
234
+  if (ncbi.code %in% c ('-', NA_character_, 'NA'))
235
+    return ('Vertebrata')
236
+
237
+  tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
238
+                            '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
239
+                            '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
240
+                    name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', 
241
+                            'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
242
+                            'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
243
+                            'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', 
244
+                            'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
245
+                    stringsAsFactors=FALSE)
246
+
247
+  ncbi.code = as.character (ncbi.code)
248
+  index = which (tbl$code == ncbi.code)
249
+  if (length (index) == 1)
250
+    return (tbl$name [index])
251
+  else {
252
+    write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
253
+    return (NA_character_)
254
+    }
255
+
256
+} # convertTaxonCode
257
+#------------------------------------------------------------------------------------------------------------------------
258
+# an empirical and not altogether trustworthy solution to identifying identifier types.
259
+guessProteinIdentifierType = function (moleculeName)
260
+{
261
+  if (nchar (moleculeName) == 0)
262
+    return (NA_character_)
263
+  if (is.na (moleculeName))
264
+    return (NA_character_) 
265
+
266
+  first.char = substr (moleculeName, 1, 1)
267
+
268
+  if (first.char == 'Y')
269
+    return ('SGD')
270
+
271
+  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
272
+    return ('UNIPROT')
273
+
274
+  if (length (grep ('^EAW', moleculeName)) == 1)
275
+    return ('NCBI')
276
+
277
+  if (length (grep ('^EAX', moleculeName)) == 1)
278
+    return ('NCBI')
279
+
280
+  if (length (grep ('^NP_', moleculeName)) == 1)
281
+    return ('REFSEQ')
282
+
283
+  if (length (grep ('^BAD', moleculeName)) == 1)
284
+    return ('EMBL')
285
+
286
+   return (NA_character_)
287
+
288
+} # guessProteinIdentifierType
289
+#------------------------------------------------------------------------------------------------------------------------
290
+normalizeMatrices = function (matrices)
291
+{
292
+  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
293
+  invisible (mtx.normalized)
294
+
295
+} # normalizeMatrices
296
+#------------------------------------------------------------------------------------------------------------------------
297
+assignGeneId = function (proteinId)
298
+{
299
+  if (!exists ('id.uniprot.human')) {
300
+
301
+    tbl = toTable (org.Hs.egUNIPROT)
302
+    id.uniprot.human <<- as.character (tbl$gene_id)
303
+    names (id.uniprot.human) <<- tbl$uniprot_id
304
+
305
+    tbl = toTable (org.Hs.egREFSEQ)
306
+    tbl = tbl [grep ('^NP_', tbl$accession),]
307
+    id.refseq.human <<- as.character (tbl$gene_id)
308
+    names (id.refseq.human) <<- tbl$accession
309
+
310
+    tbl = toTable (org.Mm.egUNIPROT)
311
+    id.uniprot.mouse <<- as.character (tbl$gene_id)
312
+    names (id.uniprot.mouse) <<- tbl$uniprot_id
313
+
314
+    tbl = toTable (org.Mm.egREFSEQ)
315
+    tbl = tbl [grep ('^NP_', tbl$accession),]
316
+    id.refseq.mouse <<- as.character (tbl$gene_id)
317
+    names (id.refseq.mouse) <<- tbl$accession
318
+    }
319
+
320
+  proteinId = strsplit (proteinId, '\\.')[[1]][1]   # remove any trailing '.*'
321
+
322
+  if (proteinId %in% names (id.uniprot.human))
323
+    return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
324
+
325
+  if (proteinId %in% names (id.uniprot.mouse))
326
+    return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
327
+
328
+  if (proteinId %in% names (id.refseq.human))
329
+    return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
330
+
331
+  if (proteinId %in% names (id.refseq.mouse))
332
+    return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
333
+
334
+  found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
335
+
336
+  if (found.leading.Y == 1)
337
+    return (list (geneId=proteinId, type='SGD'))
338
+
339
+  return (list (geneId=NA_character_, type=NA_character_))
340
+
341
+} # assignGeneId
342
+#------------------------------------------------------------------------------------------------------------------------