Browse code

fixed pubmed id for stamlab data

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@86070 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 04/02/2014 19:33:27
Showing 1 changed files
... ...
@@ -185,7 +185,7 @@ createMetadataTable = function (matrices, novels)
185 185
                     bindingDomain=NA,
186 186
                     tfFamily=NA,
187 187
                     experimentType='digital genomic footprinting',
188
-                    pubmedID='22959076')
188
+                    pubmedID="22955618")
189 189
     tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
190 190
     full.name = sprintf ('%s-%s-%s', 'Hsapiens', 'stamlab', matrix.id)
191 191
     rownames (tbl.md) [nrow (tbl.md)] = full.name
Browse code

stamlab geneIdType is now NA. unit test adjusted. motivated by macos failure of 'test.geneIdsAndTypes'

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@86014 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 03/02/2014 19:40:10
Showing 1 changed files
... ...
@@ -195,7 +195,7 @@ createMetadataTable = function (matrices, novels)
195 195
   novels.ordered = novels [tbl.md$providerName]  # make sure we follow the order in the tbl
196 196
   novelPFM [which (novels.ordered)] = 'novelMotif'
197 197
   tbl.md$geneId = novelPFM
198
-  tbl.md$geneIdType = rep ('comment', nrow (tbl.md))
198
+  tbl.md$geneIdType = rep (NA_character_, nrow (tbl.md))
199 199
 
200 200
   invisible (tbl.md)
201 201
 
Browse code

removed all explicit repo paths

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75382 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 20:36:31
Showing 1 changed files
... ...
@@ -5,10 +5,7 @@ library (org.Mm.eg.db)
5 5
 #------------------------------------------------------------------------------------------------------------------------
6 6
 printf <- function(...) print(noquote(sprintf(...)))
7 7
 #------------------------------------------------------------------------------------------------------------------------
8
-kDataDir <- "~/s/data/public/TFBS"
9
-kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb"
10
-#------------------------------------------------------------------------------------------------------------------------
11
-run = function (dataDir=kDataDir)
8
+run = function (dataDir)
12 9
 {
13 10
   dataDir <- file.path(dataDir, "stamlab")
14 11
   rawMatrixList <- readRawMatrices (dataDir)
Browse code

changed kDataDir to be repo root

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75368 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 17:35:26
Showing 1 changed files
... ...
@@ -5,11 +5,12 @@ library (org.Mm.eg.db)
5 5
 #------------------------------------------------------------------------------------------------------------------------
6 6
 printf <- function(...) print(noquote(sprintf(...)))
7 7
 #------------------------------------------------------------------------------------------------------------------------
8
-kDataDir <- "~/s/data/public/TFBS/stam"
9
-kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab"
8
+kDataDir <- "~/s/data/public/TFBS"
9
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb"
10 10
 #------------------------------------------------------------------------------------------------------------------------
11 11
 run = function (dataDir=kDataDir)
12 12
 {
13
+  dataDir <- file.path(dataDir, "stamlab")
13 14
   rawMatrixList <- readRawMatrices (dataDir)
14 15
   novels <- readNovelStatus (dataDir)
15 16
   matrices <- extractAndNormalizeMatrices (rawMatrixList)
Browse code

rhino kDataDir now used

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75351 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 16:34:36
Showing 1 changed files
... ...
@@ -5,8 +5,8 @@ library (org.Mm.eg.db)
5 5
 #------------------------------------------------------------------------------------------------------------------------
6 6
 printf <- function(...) print(noquote(sprintf(...)))
7 7
 #------------------------------------------------------------------------------------------------------------------------
8
-kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab"
9 8
 kDataDir <- "~/s/data/public/TFBS/stam"
9
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab"
10 10
 #------------------------------------------------------------------------------------------------------------------------
11 11
 run = function (dataDir=kDataDir)
12 12
 {
Browse code

first version

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75345 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 13:35:38
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,349 @@
1
+# stamlab/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library (org.Hs.eg.db)
4
+library (org.Mm.eg.db)
5
+#------------------------------------------------------------------------------------------------------------------------
6
+printf <- function(...) print(noquote(sprintf(...)))
7
+#------------------------------------------------------------------------------------------------------------------------
8
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/stamlab"
9
+kDataDir <- "~/s/data/public/TFBS/stam"
10
+#------------------------------------------------------------------------------------------------------------------------
11
+run = function (dataDir=kDataDir)
12
+{
13
+  rawMatrixList <- readRawMatrices (dataDir)
14
+  novels <- readNovelStatus (dataDir)
15
+  matrices <- extractAndNormalizeMatrices (rawMatrixList)
16
+  tbl.md <- createMetadataTable (matrices, novels)
17
+  matrices <- renameMatrices (matrices, tbl.md)
18
+
19
+  serializedFile <- "stamlab.RData"
20
+  save (matrices, tbl.md, file=serializedFile)
21
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
22
+  printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
23
+
24
+} # run
25
+#------------------------------------------------------------------------------------------------------------------------
26
+readRawMatrices = function (dataDir)
27
+{
28
+  filename <- file.path(dataDir, "de.novo.pwm")
29
+  all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
30
+  title.lines = grep ('UW.Motif', all.lines)
31
+  title.line.count <<- length (title.lines)
32
+  max = title.line.count - 1
33
+
34
+  pwms = list ()
35
+  
36
+  for (i in 1:max) {
37
+    start.line = title.lines [i]
38
+    end.line = title.lines [i+1] - 1
39
+    new.pwm = parsePwm (all.lines [start.line:end.line])
40
+    pwms = c (pwms, list (new.pwm))
41
+    } # for i
42
+
43
+  
44
+  invisible (pwms)
45
+
46
+} # readRawMatrices
47
+#------------------------------------------------------------------------------------------------------------------------
48
+readNovelStatus = function (dataDir)
49
+{
50
+  filename <- file.path(dataDir, "novels.txt")
51
+  novels = scan (filename, what=character(0), sep='\n', quiet=TRUE)
52
+  all.names = sprintf ('UW.Motif.%04d', 1:683)
53
+  status = rep (FALSE, length (all.names))
54
+  true.novels = match (novels, all.names)
55
+  status [true.novels] = TRUE
56
+  names (status) = all.names
57
+  return (status)
58
+  
59
+} # readNovelStatus
60
+#------------------------------------------------------------------------------------------------------------------------
61
+extractAndNormalizeMatrices = function (pwm.list)
62
+{
63
+  ms = sapply (pwm.list, function (element) element$matrix)
64
+  nms = normalizeMatrices (ms)
65
+  names (nms) = sapply (pwm.list, function (element) element$title)
66
+  return (nms)
67
+
68
+} # extractAndNormalizeMatrices
69
+#------------------------------------------------------------------------------------------------------------------------
70
+#  matrices = sapply (list.pwms, function (pwm) pwm$matrix)
71
+#  matrix.names = sapply (list.pwms, function (pwm) pwm$title)
72
+#  names (matrices) = matrix.names
73
+convertRawMatricesToStandard = function (tbl.rmat)
74
+{
75
+  matrix.ids = unique (tbl.rmat$id)
76
+  result =  vector ('list', length (matrix.ids))
77
+
78
+  i = 1
79
+  for (matrix.id in matrix.ids) {
80
+    tbl.sub = subset (tbl.rmat, id == matrix.id)
81
+      # sanity check this rather unusual representation of a position count matrix
82
+    base.count = as.data.frame (table (tbl.sub$base))
83
+    stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
84
+      # conservative length check.  actually expect sequence lengths of 6 - 20 bases
85
+    if  (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
86
+      printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
87
+      }
88
+    stopifnot (all (base.count$Freq == base.count$Freq [1]))
89
+    nucleotide.counts = tbl.sub$count
90
+    row.names = c('A', 'C', 'G', 'T'); 
91
+    col.names = 1:(nrow (tbl.sub) / 4)
92
+    m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
93
+    result [[i]] = m
94
+    i = i + 1
95
+    } # for matrix.id
96
+
97
+  names (result) = matrix.ids
98
+  return (result)
99
+
100
+} # convertRawMatricesToStandard 
101
+#------------------------------------------------------------------------------------------------------------------------
102
+createAnnotationTable = function ()
103
+{
104
+  tbl.matrix =  read.table ('MATRIX.txt', sep='\t', header=F, as.is=TRUE)
105
+  colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
106
+
107
+  tbl.protein = read.table ('MATRIX_PROTEIN.txt', sep='\t', header=F, as.is=TRUE)
108
+  colnames (tbl.protein) =  c ('id', 'proteinID')
109
+
110
+  tbl.species = read.table ('MATRIX_SPECIES.txt', sep='\t', header=F, as.is=TRUE)
111
+  colnames (tbl.species) = c ('id', 'speciesID')
112
+
113
+  tbl.anno = read.table ('MATRIX_ANNOTATION.txt', sep='\t', header=F, as.is=TRUE, quote="")
114
+  colnames (tbl.anno) = c ('id', 'attribute', 'value')
115
+
116
+  tbl.family  = subset (tbl.anno, attribute=='family') [, -2];   
117
+  colnames (tbl.family) = c ('id', 'family')
118
+
119
+  tbl.tax     = subset (tbl.anno, attribute=='tax_group') [,-2]; 
120
+  colnames (tbl.tax) = c ('id', 'tax')
121
+
122
+  tbl.class   = subset (tbl.anno, attribute=='class') [,-2];     
123
+  colnames (tbl.class) = c ('id', 'class')
124
+
125
+  tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];    
126
+  colnames (tbl.comment) = c ('id', 'comment')
127
+
128
+  tbl.pubmed  = subset (tbl.anno, attribute=='medline')[,-2];    
129
+  colnames (tbl.pubmed) = c ('id', 'pubmed')
130
+
131
+  tbl.type    = subset (tbl.anno, attribute=='type')[,-2];       
132
+  colnames (tbl.type) = c ('id', 'type')
133
+
134
+
135
+  tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
136
+  tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
137
+  tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
138
+  tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
139
+  tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
140
+  tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
141
+  tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
142
+
143
+  fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
144
+  tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
145
+
146
+  invisible (tbl.md)
147
+
148
+} # createAnnotationTable
149
+#------------------------------------------------------------------------------------------------------------------------
150
+# assemble these columns:
151
+#                      names=character(),                    # species-source-gene: stamlab-Hsapiens-UW.Motif.0001
152
+#                      nativeNames=character(),              # UW.Motif.0001
153
+#                      geneSymbols=character(),              # NA
154
+#                      sequenceCounts=integer(),             # NA
155
+#                      organisms=character(),                # Hsapiens
156
+#                      bindingMolecules=character(),         # NA
157
+#                      bindingMoleculeIdTypes=character(),   # NA
158
+#                      bindingDomainTypes=character(),       # NA
159
+#                      dataSources=character(),              # stamlab
160
+#                      experimentTypes=character(),          # digital genomic footprinting
161
+#                      pubmedIDs=character(),                # 22959076
162
+#                      tfFamilies=character())               # NA
163
+#
164
+# from these
165
+#
166
+createMetadataTable = function (matrices, novels)
167
+{
168
+  options (stringsAsFactors=FALSE)
169
+  tbl.md = data.frame ()
170
+  matrix.ids = names (matrices) 
171
+  
172
+  for (matrix.id in matrix.ids) {
173
+    matrix = matrices [[matrix.id]]
174
+    taxon.code = 'Hsapiens'
175
+    geneId.info = NA
176
+    new.row = list (providerName=matrix.id,
177
+                    providerId=matrix.id,
178
+                    dataSource='stamlab',
179
+                    geneSymbol=NA,
180
+                    geneId=NA,
181
+                    geneIdType=NA,
182
+                    proteinId=NA,
183
+                    proteinIdType=NA,
184
+                    organism='Hsapiens',
185
+                    sequenceCount=NA,
186
+                    bindingSequence=NA_character_,
187
+                    bindingDomain=NA,
188
+                    tfFamily=NA,
189
+                    experimentType='digital genomic footprinting',
190
+                    pubmedID='22959076')
191
+    tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
192
+    full.name = sprintf ('%s-%s-%s', 'Hsapiens', 'stamlab', matrix.id)
193
+    rownames (tbl.md) [nrow (tbl.md)] = full.name
194
+    } # for i
195
+
196
+  novelPFM = rep ('knownMotif', nrow (tbl.md))
197
+  novels.ordered = novels [tbl.md$providerName]  # make sure we follow the order in the tbl
198
+  novelPFM [which (novels.ordered)] = 'novelMotif'
199
+  tbl.md$geneId = novelPFM
200
+  tbl.md$geneIdType = rep ('comment', nrow (tbl.md))
201
+
202
+  invisible (tbl.md)
203
+
204
+} # createMetadataTable
205
+#------------------------------------------------------------------------------------------------------------------------
206
+renameMatrices = function (matrices, tbl.md)
207
+{
208
+  stopifnot (length (matrices) == nrow (tbl.md))
209
+  names (matrices) = rownames (tbl.md)
210
+  invisible (matrices)
211
+
212
+} # renameMatrices
213
+#------------------------------------------------------------------------------------------------------------------------
214
+convertTaxonCode = function (ncbi.code)
215
+{
216
+  #if (is.na (ncbi.code))  
217
+  #  return (NA_character_)
218
+  ncbi.code = as.character (ncbi.code)
219
+  if (ncbi.code %in% c ('-', NA_character_, 'NA'))
220
+    return ('Vertebrata')
221
+
222
+  tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
223
+                            '4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
224
+                            '7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
225
+                    name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum', 
226
+                            'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
227
+                            'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
228
+                            'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis', 
229
+                            'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
230
+                    stringsAsFactors=FALSE)
231
+
232
+  ncbi.code = as.character (ncbi.code)
233
+  index = which (tbl$code == ncbi.code)
234
+  if (length (index) == 1)
235
+    return (tbl$name [index])
236
+  else {
237
+    write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
238
+    return (NA_character_)
239
+    }
240
+
241
+} # convertTaxonCode
242
+#------------------------------------------------------------------------------------------------------------------------
243
+# an empirical and not altogether trustworthy solution to identifying identifier types.
244
+guessProteinIdentifierType = function (moleculeName)
245
+{
246
+  if (nchar (moleculeName) == 0)
247
+    return (NA_character_)
248
+  if (is.na (moleculeName))
249
+    return (NA_character_) 
250
+
251
+  first.char = substr (moleculeName, 1, 1)
252
+
253
+  if (first.char == 'Y')
254
+    return ('SGD')
255
+
256
+  if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
257
+    return ('UNIPROT')
258
+
259
+  if (length (grep ('^EAW', moleculeName)) == 1)
260
+    return ('NCBI')
261
+
262
+  if (length (grep ('^EAX', moleculeName)) == 1)
263
+    return ('NCBI')
264
+
265
+  if (length (grep ('^NP_', moleculeName)) == 1)
266
+    return ('REFSEQ')
267
+
268
+  if (length (grep ('^BAD', moleculeName)) == 1)
269
+    return ('EMBL')
270
+
271
+   return (NA_character_)
272
+
273
+} # guessProteinIdentifierType
274
+#------------------------------------------------------------------------------------------------------------------------
275
+normalizeMatrices = function (matrices)
276
+{
277
+  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
278
+  invisible (mtx.normalized)
279
+
280
+} # normalizeMatrices
281
+#------------------------------------------------------------------------------------------------------------------------
282
+assignGeneId = function (proteinId)
283
+{
284
+  if (!exists ('id.uniprot.human')) {
285
+
286
+    tbl = toTable (org.Hs.egUNIPROT)
287
+    id.uniprot.human <<- as.character (tbl$gene_id)
288
+    names (id.uniprot.human) <<- tbl$uniprot_id
289
+
290
+    tbl = toTable (org.Hs.egREFSEQ)
291
+    tbl = tbl [grep ('^NP_', tbl$accession),]
292
+    id.refseq.human <<- as.character (tbl$gene_id)
293
+    names (id.refseq.human) <<- tbl$accession
294
+
295
+    tbl = toTable (org.Mm.egUNIPROT)
296
+    id.uniprot.mouse <<- as.character (tbl$gene_id)
297
+    names (id.uniprot.mouse) <<- tbl$uniprot_id
298
+
299
+    tbl = toTable (org.Mm.egREFSEQ)
300
+    tbl = tbl [grep ('^NP_', tbl$accession),]
301
+    id.refseq.mouse <<- as.character (tbl$gene_id)
302
+    names (id.refseq.mouse) <<- tbl$accession
303
+    }
304
+
305
+  proteinId = strsplit (proteinId, '\\.')[[1]][1]   # remove any trailing '.*'
306
+
307
+  if (proteinId %in% names (id.uniprot.human))
308
+    return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
309
+
310
+  if (proteinId %in% names (id.uniprot.mouse))
311
+    return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
312
+
313
+  if (proteinId %in% names (id.refseq.human))
314
+    return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
315
+
316
+  if (proteinId %in% names (id.refseq.mouse))
317
+    return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
318
+
319
+  found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
320
+
321
+  if (found.leading.Y == 1)
322
+    return (list (geneId=proteinId, type='SGD'))
323
+
324
+  return (list (geneId=NA_character_, type=NA_character_))
325
+
326
+} # assignGeneId
327
+#------------------------------------------------------------------------------------------------------------------------
328
+parsePwm = function (text)
329
+{
330
+  #printf ('parsing pwm %s', text [1])
331
+  lines = strsplit (text, '\t')
332
+  title = lines [[1]][1]
333
+  consensus.sequence = lines [[1]][2]
334
+  line.count = length (lines)
335
+  #printf ('%s: %s', title, consensus.sequence)
336
+
337
+  result = matrix (nrow=line.count-1, ncol=4, dimnames=list(1:(line.count-1), c ('A','C','G','T')))  
338
+  row = 1
339
+  for (line in lines [2:line.count]) {
340
+    result [row,] = as.numeric (line)
341
+    row = row + 1
342
+    } # for line
343
+
344
+  result = t (result)
345
+    
346
+  return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
347
+
348
+} # parsePwm
349
+#------------------------------------------------------------------------------------------------------------------------