Browse code

new directory for importing binding motifs from the ScerTF project

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75316 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 04/04/2013 23:41:20
Showing 2 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,242 @@
1
+# ScerTF/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library(RUnit)
4
+library(org.Sc.sgd.db)
5
+library(tools)   # for md5sum
6
+#------------------------------------------------------------------------------------------------------------------------
7
+printf <- function(...) print(noquote(sprintf(...)))
8
+#------------------------------------------------------------------------------------------------------------------------
9
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/ScerTF"
10
+kDataDir <- "~/s/data/public/TFBS/ScerTF/recommended/PCM"
11
+#------------------------------------------------------------------------------------------------------------------------
12
+run = function (dataDir=kDataDir)
13
+{
14
+  freshStart ()
15
+  tbl.ref = createExperimentRefTable ()
16
+  all.files = getMatrixFilenames (dataDir)
17
+  matrices = readAndParse (file.path(dataDir, all.files))
18
+  tbl.md = createMetadata (matrices, tbl.ref)
19
+  matrices = renameMatrices (matrices, tbl.md)
20
+  serializedFile <- 'ScerTF.RData'
21
+  save (matrices, tbl.md, file=serializedFile)
22
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
23
+  printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
24
+
25
+
26
+  invisible (list (mtx=matrices, md=tbl.md))
27
+
28
+} # run
29
+#------------------------------------------------------------------------------------------------------------------------
30
+freshStart = function ()
31
+{
32
+  output.files.easy.to.regenerate = grep ('RData$', dir (), v=T)
33
+  if (length (output.files.easy.to.regenerate) > 0)
34
+     unlink (output.files.easy.to.regenerate)
35
+
36
+} # freshStart
37
+#------------------------------------------------------------------------------------------------------------------------
38
+createMatrixNameUniqifier = function (matrix)
39
+{
40
+  temporary.file <<- tempfile ()
41
+  write (as.character (matrix), file=temporary.file)
42
+  md5sum.string <<- as.character (md5sum (temporary.file))
43
+  stopifnot (nchar (md5sum.string) == 32)
44
+  md5.6chars = substr (md5sum.string, 29, 32)
45
+  #unlink (temporary.file)
46
+
47
+} # createMatrixNameUniqifier
48
+#------------------------------------------------------------------------------------------------------------------------
49
+createExperimentRefTable = function ()
50
+{
51
+  options (stringsAsFactors = FALSE)
52
+  tbl.ref = data.frame (author=c('badis', 'foat', 'fordyce', 'harbison', 'macisaac', 'morozov', 'pachkov', 'spivak', 'zhao', 'zhu'))
53
+  tbl.ref = cbind (tbl.ref, year=c(2008,2008, 2010, 2004, 2006, 2007, 2007, 2012, 2011, 2009))
54
+  tbl.ref = cbind (tbl.ref, pmid=c('19111667', '17947326', '20802496', '15343339', '16522208', '17438293', '17130146', 
55
+                                    '22140105', '21654662', '19158363'))
56
+  tbl.ref = cbind (tbl.ref, organism=rep('Scerevisiae', nrow (tbl.ref)))
57
+
58
+  titles = c ('A library of yeast transcription factor motifs reveals a widespread function for Rsc3 in targeting nucleosome exclusion at promoters',
59
+    'TransfactomeDB: a resource for exploring the nucleotide sequence specificity and condition-specific regulatory activity of trans-acting factors.', 
60
+               'De novo identification and biophysical characterization of transcription-factor binding sites with microfluidic affinity analysis.',
61
+               'Transcriptional regulatory code of a eukaryotic genome.',
62
+               'An improved map of conserved regulatory sites for Saccharomyces cerevisiae',
63
+               'Connecting protein structure with predictions of regulatory sites.',
64
+               'SwissRegulon: a database of genome-wide annotations of regulatory sites.', 
65
+               'ScerTF: a comprehensive database of benchmarked position weight matrices for Saccharomyces species.',
66
+               'Quantitative analysis demonstrates most transcription factors require only simple models of specificity.',
67
+                'High-resolution DNA-binding specificity analysis of yeast transcription factors')
68
+
69
+  tbl.ref = cbind (tbl.ref, titles)
70
+ 
71
+  tbl.ref
72
+
73
+} # createExperimentRefTable
74
+#------------------------------------------------------------------------------------------------------------------------
75
+parsePWMfromText = function (lines.of.text)
76
+{
77
+  if (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) != 4) {
78
+    print (lines.of.text)
79
+    return (NA)
80
+    }     
81
+
82
+  for (line in lines.of.text) {
83
+    tokens = strsplit (line, '\\s*[:\\|]') [[1]]
84
+    nucleotide = tokens [1]
85
+    numbers.raw = tokens [2]
86
+    number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]]
87
+    while (nchar (number.tokens [1]) == 0)
88
+      number.tokens = number.tokens [-1]
89
+    numbers = as.numeric (number.tokens)
90
+    if (!exists ('result'))
91
+      result = matrix (nrow=4, ncol=length (numbers), byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:length(numbers)))
92
+    result [nucleotide,] = numbers
93
+    }
94
+
95
+  return (result)
96
+
97
+} # parsePWMfromText
98
+#------------------------------------------------------------------------------------------------------------------------
99
+# we expect exactly one matrix per file
100
+readAndParse = function (filenames)
101
+{
102
+    # this script is in the directory with all of the PWM files.   exclude it
103
+
104
+  files.to.remove = c ('go.R', 'matrices.ScerTF.RData', 'tbl.md.ScerTF.RData')
105
+  for (file.to.remove in files.to.remove) { 
106
+    removers = grep (file.to.remove, filenames)
107
+    if (length (removers) > 0)
108
+      filenames = filenames [-removers]
109
+    } # for file.to.remove
110
+
111
+  matrices = list ()
112
+
113
+  for (filename in filenames) {
114
+    #browser("readAndParse")
115
+    lines.of.text = scan (filename, what=character(0), sep='\n', quiet=TRUE)
116
+    mtx = parsePWMfromText (lines.of.text)
117
+    mtx.normalized = apply (mtx, 2, function (colvector) colvector / sum (colvector))
118
+    matrices [[basename(filename)]] = mtx.normalized
119
+    }
120
+ 
121
+   invisible (matrices)
122
+
123
+} # readAndParse
124
+#-----------------------------------------------------------------------------------------------------------------------
125
+toOrf = function (gene.names, quiet=FALSE)
126
+{
127
+  orfs = mget (gene.names, org.Sc.sgdCOMMON2ORF, ifnotfound=NA)
128
+  failures = which (is.na (orfs))
129
+      # make sure that all of the failures are simply gene.names which are already orfs
130
+
131
+    # some genes -- unpopular ones? :} -- have a classic orf name as gene symbol.  these do not apper in sgdCOMMON2ORF
132
+    # don't protest these, rather, just notify the caller of *other* symbols which failed to map
133
+    # this failure situation can be recognized if there are more failures (NAs) than orf names (Y.*) found in the input
134
+    # gene.names
135
+
136
+  if (length (grep ('^Y', names (failures))) != length (failures)) {
137
+    if (!quiet) {
138
+       printf ('error.  could not find orf for "%s"', list.to.string (gene.names [failures]))
139
+       } # !quiet
140
+    } 
141
+ 
142
+  orfs [as.integer (failures)] = names (failures)
143
+  for (gene.name in names (orfs)) {
144
+     orfs [[gene.name]] = orfs [[gene.name]][1]
145
+     } # for gene.name
146
+
147
+  unlist (unname (orfs))
148
+
149
+} # toOrf
150
+#------------------------------------------------------------------------------------------------------------------------
151
+toUniprot = function (orfs, quiet=FALSE)
152
+{
153
+  uniprots = mget (orfs, org.Sc.sgdUNIPROT, ifnotfound=NA)
154
+  failures = which (is.na (uniprots))
155
+
156
+  for (orf in names (uniprots)) {
157
+     uniprots [[orf]] = uniprots [[orf]][1]
158
+     } # for orf
159
+
160
+  unlist (unname (uniprots))
161
+
162
+} # toUniprot
163
+#------------------------------------------------------------------------------------------------------------------------
164
+# and rename the matrices to match the rownames of the tbl.md created here
165
+createMetadata = function (matrices, tbl.ref)
166
+{
167
+  options ('stringsAsFactors'=FALSE)
168
+  dataSource = 'ScerTF'
169
+  organism = 'Scerevisiae'
170
+
171
+  #tbl.ref = createExperimentRefTable ()
172
+  native.names.raw = names (matrices)
173
+  native.names.reversed = as.character (sapply (native.names.raw, 
174
+                                 function (s) {tokens = strsplit (s, '\\.')[[1]]; return (paste (tokens [2], tokens[1], sep='-'))}))
175
+
176
+  native.names = as.character (sapply (names (matrices), function (name) strsplit (name, '\\.') [[1]][2]))
177
+  gene.symbols = native.names
178
+
179
+  md5sum.suffix.uniqifiers = as.character (sapply (matrices, function (matrix) createMatrixNameUniqifier (matrix)))
180
+  native.names.uniqified = paste (native.names, md5sum.suffix.uniqifiers, sep='-')
181
+  full.names = paste (organism, dataSource, native.names.reversed, sep='-')
182
+
183
+  tbl.md = data.frame (providerName=native.names.raw)
184
+  rownames (tbl.md) = full.names
185
+  names (matrices) = full.names
186
+  tbl.md = cbind (tbl.md, providerId=gene.symbols)
187
+  tbl.md = cbind (tbl.md, dataSource = rep ('ScerTF', nrow (tbl.md)))
188
+  tbl.md = cbind (tbl.md, geneSymbol=gene.symbols)
189
+
190
+  orfs = as.character (toOrf (native.names))
191
+  tbl.md = cbind (tbl.md, geneId=orfs)
192
+  tbl.md = cbind (tbl.md, geneIdType=rep('SGD', nrow (tbl.md)))
193
+
194
+  uniprots <<- toUniprot (orfs)
195
+  tbl.md = cbind (tbl.md, proteinId=uniprots)
196
+  protein.id.types = rep('UNIPROT', nrow (tbl.md))
197
+  NA.protein.ids = which (is.na (uniprots))
198
+  if (length (NA.protein.ids) > 0)
199
+    protein.id.types [NA.protein.ids] = NA
200
+  
201
+  tbl.md = cbind (tbl.md, proteinIdType=protein.id.types)
202
+    
203
+  tbl.md = cbind (tbl.md, organism = rep ('Scerevisiae', nrow (tbl.md)))
204
+
205
+    # all matrices have colSums of 100, or very close, suggesting that these are not real counts but are normalized
206
+  tbl.md = cbind (tbl.md, sequenceCount = rep (NA_integer_, nrow (tbl.md)))
207
+  tbl.md = cbind (tbl.md, bindingSequence=rep(NA_character_, nrow (tbl.md)))
208
+  tbl.md = cbind (tbl.md, bindingDomain=rep(NA_character_, nrow (tbl.md)))
209
+  
210
+
211
+  tbl.md = cbind (tbl.md, tfFamily=rep(NA_character_, nrow (tbl.md)))
212
+  tbl.md = cbind (tbl.md, experimentType=rep(NA_character_, nrow (tbl.md)))
213
+  authors = as.character (sapply (tbl.md$providerName, function (provider.name) strsplit (provider.name, '\\.')[[1]][1]))
214
+  pmid = as.character (sapply (authors, function (athr) subset (tbl.ref, author==athr)$pmid))
215
+  tbl.md = cbind (tbl.md, pubmedID=pmid)
216
+  
217
+  tbl.md
218
+
219
+} # createMetadata
220
+#-----------------------------------------------------------------------------------------------------------------------
221
+getMatrixFilenames = function (dataDir)
222
+{
223
+  all.files = list.files (dataDir)
224
+  files.to.exclude = c ('go.R', 'RCS', 'rdata')
225
+  for (file in files.to.exclude) {
226
+    removers = grep (file, all.files, ignore.case=T)
227
+    if (length (removers) > 0)
228
+      all.files = all.files [-removers]
229
+    }
230
+
231
+  invisible (all.files)
232
+
233
+} # getMatrixFilenames
234
+#-----------------------------------------------------------------------------------------------------------------------
235
+renameMatrices = function (matrices, tbl.md)
236
+{
237
+  stopifnot (length (matrices) == nrow (tbl.md))
238
+  names (matrices) = rownames (tbl.md)
239
+  invisible (matrices)
240
+
241
+} # renameMatrices
242
+#------------------------------------------------------------------------------------------------------------------------
0 243
new file mode 100644
... ...
@@ -0,0 +1,206 @@
1
+# ScerTF/test.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library (RUnit)
4
+library (org.Sc.sgd.db)
5
+#------------------------------------------------------------------------------------------------------------------------
6
+source("import.R")
7
+#------------------------------------------------------------------------------------------------------------------------
8
+run.tests = function (dataDir=kDataDir)
9
+{
10
+  freshStart ()
11
+  x.filenames <<- test.getMatrixFilenames (dataDir)
12
+  txxa <<- test.createMatrixNameUniqifier ()
13
+  txx0 <<- test.toOrf ()
14
+  txx1 <<- test.toUniprot ()
15
+  txx2 <<- test.createExperimentRefTable ()
16
+  txx3 <<- test.parsePWMfromText (dataDir)
17
+  x.matrices <<- test.readAndParse (dataDir)
18
+  x.tbl.md  <<- test.createMetadata (dataDir)
19
+  x.matrices.renamed <<- test.renameMatrices (x.matrices, x.tbl.md)
20
+
21
+} # run.tests
22
+#------------------------------------------------------------------------------------------------------------------------
23
+test.createExperimentRefTable = function ()
24
+{
25
+  print ('--- test.createExperimentRefTable')
26
+  tbl.ref = createExperimentRefTable ()
27
+  checkEquals (dim (tbl.ref), c (10, 5))
28
+  checkEquals (colnames (tbl.ref), c ('author', 'year', 'pmid', 'organism', 'titles'))
29
+
30
+     # make sure neither author nor pmid repeat
31
+  checkEquals (length (unique (tbl.ref$author)), nrow (tbl.ref))
32
+  checkEquals (length (unique (tbl.ref$pmid)), nrow (tbl.ref))
33
+
34
+  invisible (tbl.ref)
35
+
36
+} # test.createExperiementsTable
37
+#------------------------------------------------------------------------------------------------------------------------
38
+test.parsePWMfromText = function (dataDir)
39
+{
40
+  print ('--- test.parsePWMfromText')
41
+
42
+  file <- file.path(dataDir, 'macisaac.ABF1')
43
+  checkTrue(file.exists(file))
44
+  
45
+  lines.of.text = scan (file, what=character(0), sep='\n', quiet=TRUE)
46
+  pwm.abf1 = parsePWMfromText (lines.of.text)
47
+  checkEquals (dim (pwm.abf1), c (4, 15))
48
+  checkEquals (colnames (pwm.abf1), as.character (1:15))
49
+  checkEquals (rownames (pwm.abf1), c ('A', 'C', 'G', 'T'))
50
+  
51
+} # test.parsePWMfromText
52
+#-----------------------------------------------------------------------------------------------------------------------
53
+test.readAndParse = function (dataDir)
54
+{
55
+  print ('--- test.readAndParse')
56
+
57
+  all.files = getMatrixFilenames (dataDir)
58
+  sample.file.1 = grep ('badis.ABF2', all.files)
59
+  sample.file.2 = grep ('badis.CAT8', all.files)
60
+  checkEquals (length (sample.file.1), 1)
61
+  checkEquals (length (sample.file.2), 1)
62
+  
63
+  mtx.test = readAndParse (file.path(dataDir, all.files [c (sample.file.1, sample.file.2)]))
64
+
65
+  checkEquals (length (mtx.test), 2)
66
+  checkEquals (names (mtx.test), c ("badis.ABF2", "badis.CAT8"))
67
+
68
+  checkTrue (all (colSums (mtx.test [[1]]) == 1))
69
+  checkTrue (all (colSums (mtx.test [[2]]) == 1))
70
+
71
+  checkEquals (dim (mtx.test [[1]]), c (4,6))
72
+  checkEquals (dim (mtx.test [[2]]), c (4,6))
73
+
74
+  invisible (mtx.test)
75
+
76
+} # test.readAndParse
77
+#-----------------------------------------------------------------------------------------------------------------------
78
+test.toOrf = function ()
79
+{
80
+  print ('--- test.toOrf')
81
+     # SUT1 is the proper gene symbol for YGL162W, and an alias for YMR125W.  mget returns both.  we want just the first
82
+  checkEquals (toOrf ('SUT1'), 'YGL162W') 
83
+  checkEquals (toOrf (c ( "STB5", "SUT1", "THO2")), c ('YHR178W', 'YGL162W', 'YNL139C'))
84
+  
85
+  checkEquals (toOrf ('bogus', quiet=TRUE), 'bogus')
86
+  checkEquals (toOrf (c ( "STB5", "SUT1", "bogus", "THO2"), quiet=TRUE), c ('YHR178W', 'YGL162W', 'bogus', 'YNL139C'))
87
+
88
+} # test.toOrf 
89
+#------------------------------------------------------------------------------------------------------------------------
90
+test.toUniprot = function ()
91
+{
92
+  print ('--- test.toUniprot')
93
+    # demonstrate the problem
94
+  checkEquals (unlist (unname (mget ('YCR039C', org.Sc.sgdUNIPROT))), c ("P0CY08", "P0CY09"))
95
+
96
+    # but we want only one
97
+  checkEquals (toUniprot (c ('YCR039C')), "P0CY08")
98
+
99
+    # make sure it works embedded in a list of orfs
100
+  checkEquals (toUniprot (c ('YOL108C', 'YCR039C', 'YML065W')), c ("P13902", "P0CY08", "P54784"))
101
+
102
+    # an unrecognized orf should return NA
103
+  checkTrue (is.na (toUniprot ('bogus')))
104
+
105
+} # test.toUniprot
106
+#------------------------------------------------------------------------------------------------------------------------
107
+test.createMetadata = function (dataDir)
108
+{
109
+  print ('--- test.createMetadata')
110
+
111
+  matrices = readAndParse (file.path(dataDir, getMatrixFilenames (dataDir)))
112
+
113
+  tbl.md = createMetadata (matrices, createExperimentRefTable ())
114
+  checkEquals (dim (tbl.md), c (length (matrices), 15))
115
+  expected.columns = c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", "proteinId", 
116
+                        "proteinIdType", "organism", "sequenceCount", "bindingSequence", "bindingDomain", "tfFamily",
117
+                        "experimentType", "pubmedID")
118
+
119
+  checkEquals (colnames (tbl.md), expected.columns)
120
+  checkEquals (unique (tbl.md$organism), 'Scerevisiae')
121
+  ecm = 'Scerevisiae-ScerTF-ECM23-badis'
122
+  checkTrue (ecm %in% rownames (tbl.md))
123
+  x = tbl.md [ecm,]
124
+  checkEquals (x$providerName, "badis.ECM23")
125
+  checkEquals (x$geneSymbol, "ECM23")
126
+  checkEquals (x$geneId,  "YPL021W")
127
+  checkEquals (x$geneIdType, "SGD")
128
+  checkEquals (x$proteinId, "Q02710")
129
+  checkEquals (x$proteinIdType, "UNIPROT")
130
+  checkEquals (x$pubmedID, "19111667")
131
+
132
+  checkEquals (nrow (subset (tbl.md, is.na (proteinId) & !is.na (proteinIdType))), 0)
133
+
134
+  invisible (tbl.md)
135
+
136
+} # test.createMetadata
137
+#-----------------------------------------------------------------------------------------------------------------------
138
+test.getMatrixFilenames = function (dataDir)
139
+{
140
+  print ('--- test.getMatrixFilenames')
141
+
142
+  checkEquals (length (getMatrixFilenames (dataDir)), 196)
143
+
144
+} # test.getMatrixFilenames
145
+#-----------------------------------------------------------------------------------------------------------------------
146
+test.renameMatrices = function (matrices, tbl.md, tbl.anno)
147
+{
148
+  print ('--- test.renameMatrices')
149
+
150
+    # try it with just the first two matrices
151
+  checkEquals (dim (tbl.md), c (196, 15))
152
+  old.matrix.names = names (matrices)
153
+  matrices.renamed = renameMatrices (matrices, tbl.md [1:2,])
154
+  new.matrix.names = names (matrices.renamed)
155
+
156
+  #print (old.matrix.names)
157
+  #print (names (matrices.renamed))
158
+
159
+  gene.names = sapply (strsplit (old.matrix.names, '\\.'), function (tokens) return (tokens [2]))
160
+  author.names = sapply (strsplit (old.matrix.names, '\\.'), function (tokens) return (tokens [1]))
161
+
162
+     # though order of gene and author is reversed, both should be found, in boh old and new matrix names
163
+  for (m in 1:length (matrices)) {
164
+    checkTrue (length (grep (gene.names [m], old.matrix.names [m])) == 1)
165
+    checkTrue (length (grep (author.names [m], old.matrix.names [m])) == 1)
166
+    checkTrue (length (grep (gene.names [m], new.matrix.names [m])) == 1)
167
+    checkTrue (length (grep (author.names [m], new.matrix.names [m])) == 1)
168
+    }
169
+
170
+
171
+    # now check them all
172
+  
173
+  tbl.ref = createExperimentRefTable ()
174
+  all.files = getMatrixFilenames (dataDir)
175
+  matrices = readAndParse (file.path(dataDir, all.files))
176
+  tbl.md = createMetadata (matrices, tbl.ref)
177
+
178
+  old.matrix.names = names (matrices)
179
+  matrices.renamed = renameMatrices (matrices, tbl.md)
180
+  new.matrix.names = names (matrices.renamed)
181
+
182
+  for (m in 1:length (matrices)) {
183
+    checkTrue (length (grep (gene.names [m], old.matrix.names [m])) == 1)
184
+    checkTrue (length (grep (author.names [m], old.matrix.names [m])) == 1)
185
+    checkTrue (length (grep (gene.names [m], new.matrix.names [m])) == 1)
186
+    checkTrue (length (grep (author.names [m], new.matrix.names [m])) == 1)
187
+    }
188
+
189
+  #printf ('validated %d new matrix names', length (matrices))
190
+
191
+  invisible (matrices.renamed)
192
+
193
+} # test.renameMatrices
194
+#------------------------------------------------------------------------------------------------------------------------
195
+test.createMatrixNameUniqifier = function ()
196
+{
197
+  print ('--- test.createMatrixNameUniqifier')
198
+  
199
+  data = c (8,8,8,7,2,2,5,6,5,3,10,5,8,6,9,5,8,8,4,5,8,6,9,2,1,0,5,7,7,2,4,4,3,7,7,9,9,6,1,3)
200
+  test.matrix = matrix (data=data, nrow=4, ncol=10)
201
+  uniqifier = createMatrixNameUniqifier (test.matrix)
202
+  xxx <<- uniqifier
203
+  checkEquals (uniqifier, "b42f")
204
+
205
+} # test.createMatrixNameUniqifier
206
+#------------------------------------------------------------------------------------------------------------------------