Browse code

first version in scripts directory

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75344 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 05/04/2013 13:15:54
Showing 2 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,229 @@
1
+# hPDI/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library (org.Hs.eg.db)
4
+#------------------------------------------------------------------------------------------------------------------------
5
+printf <- function(...) print(noquote(sprintf(...)))
6
+#------------------------------------------------------------------------------------------------------------------------
7
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/hPDI"
8
+kDataDir <- "~/s/data/public/TFBS/hPDI"
9
+#------------------------------------------------------------------------------------------------------------------------
10
+run = function (dataDir=kDataDir)
11
+{
12
+  filenames = getMatrixFilenames (dataDir)
13
+  matrices = readMatrices (filenames)
14
+  tbl.anno = readAnnotation (dataDir)
15
+
16
+     # make sure that all of the matrix names are geneSymbols represented in tbl.anno
17
+
18
+  stopifnot (length (intersect (names (matrices), tbl.anno$geneSymbol)) == length (names (matrices)))
19
+
20
+  tbl.md = createMetadata (matrices, tbl.anno)
21
+
22
+     # also required:  the order of tbl.anno entries exactly follow that of the matrices
23
+
24
+  stopifnot (names (matrices) == tbl.md$providerName)
25
+  matrices = renameMatrices (normalizeMatrices (matrices), tbl.md)
26
+  serializedFile <- "hPDI.RData"
27
+  save (matrices, tbl.md, file=serializedFile)
28
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
29
+  printf("next step:  copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
30
+
31
+  invisible (list (matrices=matrices, md=tbl.md))
32
+
33
+} # run
34
+#------------------------------------------------------------------------------------------------------------------------
35
+getMatrixFilenames = function (dataDir)
36
+{
37
+  files = list.files (file.path(dataDir, "pwm"))
38
+  invisible (file.path(dataDir, "pwm", files))
39
+
40
+} # getMatrixFilenames
41
+#------------------------------------------------------------------------------------------------------------------------
42
+readMatrices = function (filenames)
43
+{
44
+  max = length (filenames)
45
+
46
+  matrices = list ()
47
+
48
+  for (i in 1:max) {
49
+    filename = filenames [i]
50
+    #gene.name = strsplit (filename, '\\.')[[1]][1]
51
+    gene.name = sub(".output", "", basename(filename))
52
+    mtx = as.matrix (read.table (filename, header=FALSE, sep='\t', row.names=1))
53
+    colnames (mtx) = 1:ncol (mtx)
54
+    matrices [[gene.name]] = mtx
55
+    } # readMatrices
56
+
57
+  invisible (matrices)
58
+
59
+} # readMatrices
60
+#------------------------------------------------------------------------------------------------------------------------
61
+extractProteinIDs = function (raw.list)
62
+{
63
+     # first go after all the explicit uniprot ids, map them to refseq NP
64
+  uniprot.rows = grep ('Source:Uniprot', raw.list)
65
+  uniprot.ids.raw = raw.list [uniprot.rows]
66
+  uniprot.names = sapply (strsplit ((uniprot.ids.raw), ';Acc:'), function (tokens) tokens [2])
67
+  uniprot.names = gsub ("\\].*$", '', uniprot.names)
68
+  mapped.refseq.names = uniprotToRefSeq (uniprot.names)
69
+
70
+  protein = rep (NA_character_, length (raw.list))
71
+  protein [uniprot.rows] = as.vector (mapped.refseq.names)
72
+
73
+    # now grab the rows with explicit RefSeq NP idneitifers
74
+  refseq.rows  = grep ('Source:RefSeq', raw.list)
75
+  refseq.ids.raw = raw.list [refseq.rows]
76
+  refseq.names = sapply (strsplit ((refseq.ids.raw), ';Acc:'), function (tokens) tokens [2])
77
+  refseq.names = gsub ("\\].*$", '', refseq.names)
78
+
79
+  protein [refseq.rows] = refseq.names
80
+  still.na.rows = which (is.na (protein))
81
+  #mystery.rows = setdiff (1:nrow (tbl.anno), c (uniprot.rows, refseq.rows))
82
+  invisible (protein)
83
+
84
+} # extractProteinIDs
85
+#------------------------------------------------------------------------------------------------------------------------
86
+readRawAnnotation = function (dataDir)
87
+{
88
+  filename <- file.path(dataDir, "protein_annotation.txt")
89
+  tbl.anno = read.table (filename, sep='\t', header=T, fill=T, stringsAsFactors=FALSE, quote="")
90
+
91
+    # oddly, there are 37 empty columns in this data.frame.  locate the first of these, then delete all
92
+  stopifnot ('X' %in% colnames (tbl.anno))
93
+  first.empty.column = which (colnames (tbl.anno) == 'X')
94
+  tbl.anno = tbl.anno [, 1:(first.empty.column-1)]
95
+  stopifnot (length (colnames (tbl.anno)) == 16)
96
+
97
+  #protein.ids = extractIdentifiers (tbl.anno$ensembl_description)
98
+     # the goal is to create a new 'protein' column for the table, all refseq NP identifers, or NA
99
+  #tbl.anno = cbind (tbl.anno, protein, stringsAsFactors=FALSE)
100
+  invisible (tbl.anno)
101
+
102
+} # readRawAnnotation
103
+#------------------------------------------------------------------------------------------------------------------------
104
+addProteinColumn = function (tbl.anno)
105
+{
106
+  target = "ensembl_description"
107
+  stopifnot (target %in% colnames (tbl.anno))
108
+  ensembl.desc = tbl.anno [, target]
109
+  protein = extractProteinIDs (ensembl.desc)
110
+  invisible (cbind (tbl.anno, protein, stringsAsFactors=FALSE))
111
+
112
+} # addProteinColumn
113
+#------------------------------------------------------------------------------------------------------------------------
114
+# remove all but symbol, locusID, Pfam_domains and protein.  rename the first three
115
+trimAnnoTable = function (tbl.anno)
116
+{
117
+  tbl.fixed = tbl.anno [, c ('symbol', 'locusID', 'Pfam_domains', 'protein')]
118
+  colnames (tbl.fixed) = c ('geneSymbol', 'geneID', 'pfamDomain', 'protein')
119
+
120
+  invisible (tbl.fixed)
121
+
122
+} # trimAnnoTable
123
+#------------------------------------------------------------------------------------------------------------------------
124
+# read raw, then process and trim to get a tidy 4-column table
125
+readAnnotation = function (dataDir)
126
+{
127
+  tbl.anno.raw = readRawAnnotation (dataDir)
128
+  tbl.anno = addProteinColumn (tbl.anno.raw)
129
+  tbl.anno = trimAnnoTable (tbl.anno)
130
+
131
+  invisible (tbl.anno)
132
+
133
+} # readAnnotation
134
+#------------------------------------------------------------------------------------------------------------------------
135
+createMetadata = function (matrices, tbl.anno)
136
+{
137
+  options ('stringsAsFactors'=FALSE)
138
+  dataSource = 'hPDI'
139
+  organism = 'Hsapiens'
140
+  provider.names = names (matrices)
141
+  full.names = paste (organism, dataSource, provider.names, sep='-')
142
+
143
+  tbl.md = data.frame (providerName=provider.names)
144
+  rownames (tbl.md) = full.names
145
+  tbl.md = cbind (tbl.md, providerId=provider.names)
146
+
147
+    # indices are crucial, retreiving values from tbl.anno columns in an order that corresponds to the
148
+    # order of the provider.names obtained from the names of the matrices
149
+
150
+  indices = match (provider.names, tbl.anno$geneSymbol)
151
+  geneId = as.character (tbl.anno$geneID [indices])
152
+
153
+  egSyms = as.character (mget (geneId, org.Hs.egSYMBOL, ifnotfound=NA))
154
+  egSyms [egSyms=='NA'] == NA_character_
155
+
156
+  tbl.md = cbind (tbl.md, dataSource = rep (dataSource, nrow (tbl.md)))
157
+
158
+  tbl.md = cbind (tbl.md, geneSymbol=egSyms)
159
+  tbl.md = cbind (tbl.md, geneId)
160
+  tbl.md = cbind (tbl.md, geneIdType=rep('ENTREZ', nrow (tbl.md)))
161
+
162
+   
163
+    # find NP protein ids for each matrix.  these are provided by hPDI, and made into NPs consistently by me.
164
+ 
165
+
166
+  protein.ids = tbl.anno$protein [indices]
167
+  tbl.md = cbind (tbl.md, proteinId=tbl.anno$protein [indices])
168
+
169
+  proteinIdType = rep (NA_character_, length (protein.ids))
170
+  refseq.protein.ids = grep ('NP_', protein.ids)
171
+  proteinIdType [refseq.protein.ids] = 'REFSEQ'
172
+  tbl.md = cbind (tbl.md, proteinIdType)
173
+
174
+  tbl.md = cbind (tbl.md, organism = rep (organism, nrow (tbl.md)))
175
+
176
+  counts = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
177
+  tbl.md = cbind (tbl.md, sequenceCount=counts)
178
+
179
+  tbl.md = cbind (tbl.md, bindingSequence=rep(NA_character_, nrow (tbl.md)))
180
+
181
+  bindingDomain = tbl.anno$pfamDomain [indices]
182
+  tbl.md = cbind (tbl.md, bindingDomain)
183
+
184
+  tbl.md = cbind (tbl.md, tfFamily=rep(NA_character_, nrow (tbl.md)))
185
+
186
+  experiment.types = rep (NA_character_, nrow (tbl.md))
187
+  tbl.md = cbind (tbl.md, experimentType=experiment.types)
188
+
189
+  pubmedIDs = rep (NA_character_)
190
+
191
+  tbl.md = cbind (tbl.md, pubmedID=pubmedIDs)
192
+
193
+  invisible (tbl.md)
194
+
195
+} # createMetadata
196
+#------------------------------------------------------------------------------------------------------------------------
197
+uniprotToRefSeq = function (ids)
198
+{
199
+  if (!exists ('tbl.prot')) {
200
+    tbl.refseq = toTable (org.Hs.egREFSEQ)
201
+    np.only = grep ('NP_', tbl.refseq$accession)
202
+    stopifnot (length (np.only) > 1000)   # crude sanity check
203
+    tbl.refseq = tbl.refseq [np.only,]
204
+    tbl.uniprot = toTable (org.Hs.egUNIPROT)
205
+    tbl.prot <<- merge (tbl.refseq, tbl.uniprot, all.x=TRUE)
206
+    } # create tbl.protx
207
+
208
+   indices = match (ids, tbl.prot$uniprot_id)
209
+   result = tbl.prot$accession [indices]
210
+   names (result) = ids
211
+   return (result)
212
+
213
+} # uniprotToRefSeq
214
+#------------------------------------------------------------------------------------------------------------------------
215
+normalizeMatrices = function (matrices)
216
+{
217
+  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
218
+  invisible (mtx.normalized)
219
+
220
+} # normalizeMatrices
221
+#------------------------------------------------------------------------------------------------------------------------
222
+renameMatrices = function (matrices, tbl.md)
223
+{
224
+  stopifnot (names (matrices) == tbl.md$providerName)  # same length, same order
225
+  names (matrices) = rownames (tbl.md)
226
+  invisible (matrices)
227
+
228
+} # renameMatrices
229
+#------------------------------------------------------------------------------------------------------------------------
0 230
new file mode 100644
... ...
@@ -0,0 +1,205 @@
1
+# hPDI/test.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library (RUnit)
4
+#------------------------------------------------------------------------------------------------------------------------
5
+source("import.R")
6
+#------------------------------------------------------------------------------------------------------------------------
7
+run.tests = function (dataDir=kDataDir)
8
+{
9
+  #freshStart ()
10
+
11
+  x.filenames <- test.getMatrixFilenames (dataDir)
12
+  x.matrices <- test.readMatrices (x.filenames)
13
+  x.tbl.anno.raw <- test.readRawAnnotation (dataDir)
14
+  x.refseqs  <- test.uniprotToRefSeq (x.tbl.anno.raw)
15
+  test.extractProteinIDs (dataDir)
16
+  x.tbl.annop.raw <- test.addProteinColumn (x.tbl.anno.raw)
17
+  x.tbl.annof.raw <- test.trimAnnoTable (x.tbl.annop.raw)
18
+
19
+    # now, one summary test for the above tests combined into one method
20
+  x.tbl.anno <- test.readAnnotation (dataDir)
21
+
22
+  x.tbl.md <- test.createMetadata (dataDir)
23
+  x.mat5f <- test.normalizeMatrices (x.matrices [1:5])
24
+  x.mat5fr <- test.renameMatrices (dataDir, x.filenames) 
25
+
26
+} # run.tests
27
+#------------------------------------------------------------------------------------------------------------------------
28
+test.getMatrixFilenames = function  (dataDir)
29
+{
30
+  print("--- test.getMatrixFilenames")
31
+  filenames = getMatrixFilenames (dataDir)
32
+  checkTrue (length (filenames) == 437)
33
+  checkTrue(file.exists(filenames[1]))
34
+  checkTrue(file.exists(filenames[437]))
35
+
36
+  invisible (filenames)
37
+
38
+} # test.getMatrixFilenames
39
+#------------------------------------------------------------------------------------------------------------------------
40
+test.readMatrices = function (filenames)
41
+{
42
+  print ('--- test.readMatrices')
43
+  matrices = readMatrices (filenames[1:10])
44
+  checkEquals (length (matrices), 10)
45
+  checkEquals(names(matrices)[1], "ABCF2")
46
+  checkEquals(dim(matrices[["ABCF2"]]), c(4,6))
47
+  checkEquals(rownames(matrices[["ABCF2"]]), c("A", "C", "G", "T"))
48
+  invisible (matrices)
49
+
50
+} # test.readMatrices
51
+#------------------------------------------------------------------------------------------------------------------------
52
+test.extractProteinIDs = function (dataDir)
53
+{
54
+  print ('--- test.extractProteinIDs')
55
+  
56
+  filename <- file.path(dataDir, "protein_annotation.txt")
57
+  raw.list = tail (read.table (filename, sep='\t', header=T, fill=T, stringsAsFactors=FALSE, quote="")$ensembl_description, n=16)
58
+  protein.ids <<- extractProteinIDs (raw.list)
59
+  checkEquals (length (protein.ids), length (raw.list))
60
+  checkEquals (protein.ids [2], 'NP_001012677')
61
+  checkEquals (protein.ids [5], 'NP_001074306')
62
+  checkTrue (is.na (protein.ids [10]))   # Q9GND9 -> NA
63
+  checkEquals (protein.ids [11], 'NP_001139192') # Q8NAP8 -> ZBT8B ->  NP_001139192.1
64
+
65
+} # test.extractProteinIDs
66
+#------------------------------------------------------------------------------------------------------------------------
67
+test.readRawAnnotation = function (dataDir)
68
+{
69
+  print ('--- test.readRawAnnotation')
70
+  tbl.anno = readRawAnnotation (dataDir)
71
+  checkEquals (dim (tbl.anno), c (4191, 16))
72
+    # are all possible proteins mapped to refseq?
73
+  
74
+  invisible (tbl.anno)
75
+
76
+} # test.readRawAnnotation
77
+#------------------------------------------------------------------------------------------------------------------------
78
+test.addProteinColumn = function (tbl.anno)
79
+{
80
+  print ('--- test.addProteinColumn')
81
+  tbl.small = tail (tbl.anno, n=16)
82
+  tbl.sp = addProteinColumn (tbl.small)
83
+  checkEquals (ncol (tbl.small) + 1, ncol (tbl.sp))
84
+  checkEquals (nrow (tbl.small), nrow (tbl.sp))
85
+  checkEquals (colnames (tbl.sp) [length (colnames (tbl.sp))], 'protein')
86
+  proteins = tbl.sp$protein
87
+  checkEquals (length (grep ('NP_', proteins)), 3)
88
+  checkEquals (length (which (is.na (proteins))), 13)
89
+
90
+  invisible (tbl.sp)
91
+
92
+} # test.addProteinColumn
93
+#------------------------------------------------------------------------------------------------------------------------
94
+test.trimAnnoTable = function (tbl.anno)
95
+{
96
+  print ('--- test.trimAnnoTable')
97
+  checkEquals (ncol (tbl.anno), 17)
98
+  tbl.trimmed = trimAnnoTable (tbl.anno)
99
+  checkEquals (ncol (tbl.trimmed), 4)
100
+  checkEquals (colnames (tbl.trimmed), c ('geneSymbol', 'geneID', 'pfamDomain', 'protein'))
101
+  invisible (tbl.trimmed)
102
+
103
+} # test.trimAnnoTable
104
+#------------------------------------------------------------------------------------------------------------------------
105
+test.readAnnotation = function (dataDir)
106
+{
107
+  print("--- test.readAnnotation")
108
+  tbl.anno = readAnnotation (dataDir)
109
+  checkEquals (dim (tbl.anno), c (4191, 4))
110
+  checkEquals (colnames (tbl.anno), c ("geneSymbol", "geneID", "pfamDomain", "protein"))
111
+
112
+  invisible (tbl.anno)
113
+
114
+} # test.readAnnotation
115
+#------------------------------------------------------------------------------------------------------------------------
116
+test.createMetadata = function (dataDir)
117
+{
118
+  print ('--- test.createMetadata')
119
+  tbl.anno = readAnnotation (dataDir)
120
+  filenames = getMatrixFilenames (dataDir)
121
+  matrices.all = readMatrices (filenames)
122
+    # take the top 3, and to be sure of a good test, add two with no protein annotation provided
123
+
124
+  names.of.na.protein.matrices = head (subset (tbl.anno, geneSymbol %in% names (matrices.all) & is.na (protein)), n=2)$geneSymbol
125
+  matrices = c (matrices.all [1:3], matrices.all [names.of.na.protein.matrices])
126
+  tbl.md = createMetadata (matrices, tbl.anno)
127
+
128
+  checkEquals (nrow (tbl.md), 5) 
129
+  checkEquals (colnames (tbl.md), c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", 
130
+                                     "proteinId", "proteinIdType", "organism", "sequenceCount", "bindingSequence",
131
+                                     "bindingDomain", "tfFamily", "experimentType", "pubmedID"))
132
+
133
+  checkTrue (all (!is.na (tbl.md$providerName)))
134
+  checkTrue (all (!is.na (tbl.md$providerId)))
135
+  checkTrue (all (!is.na (tbl.md$dataSource)))
136
+  checkTrue (all (!is.na (tbl.md$organism)))
137
+  checkTrue (all (!is.na (tbl.md$sequenceCount)))
138
+
139
+  checkTrue (all (is.na (tbl.md$bindingSequence)))
140
+
141
+  checkTrue (all (tbl.md$dataSource=='hPDI'))
142
+  checkTrue (is.integer (tbl.md$sequenceCount))
143
+  checkTrue (is.character (tbl.md$geneId))
144
+
145
+  syms = tbl.md$geneSymbol
146
+  ids = tbl.md$geneId
147
+  predicted.ids = as.character (mget (syms, org.Hs.egSYMBOL2EG))
148
+  checkEquals (ids, predicted.ids)
149
+
150
+  invisible (tbl.md)
151
+
152
+} # test.createMetadata
153
+#------------------------------------------------------------------------------------------------------------------------
154
+test.uniprotToRefSeq = function (tbl.anno)
155
+{
156
+  print ('--- test.uniprotToRefSeq')
157
+  samples = tbl.anno$protein [sample (1:nrow (tbl.anno), size=10, replace=FALSE)]
158
+  samples = c ("Q13523", "Q9UER7", "P52738", "Q96Q11", "Q8NCF5", "Q15784", "Q01664", "O14980", "P0C1Z6")
159
+  result = uniprotToRefSeq (samples)
160
+  checkEquals (names (result), samples)
161
+  refseqs = as.character (result)
162
+  checkEquals (length (grep ('^NP_', as.character  (refseqs))), length (samples))
163
+
164
+  samples.2 = c ('foo', "Q9UER7")
165
+  result.2 = uniprotToRefSeq (samples.2)
166
+  checkEquals (names (result.2), samples.2)
167
+  checkEquals (as.vector (result.2), c (NA_character_, "NP_001135441"))
168
+
169
+  invisible (result)
170
+
171
+} # test.uniprotToRefSeq
172
+#------------------------------------------------------------------------------------------------------------------------
173
+test.normalizeMatrices = function (matrices)
174
+{
175
+  print ('--- test.normalizeMatrices')
176
+
177
+  colsums = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
178
+  checkTrue (all (colsums > 1))
179
+
180
+  matrices.norm = normalizeMatrices (matrices)
181
+  checkEquals (names (matrices.norm), names (matrices))
182
+
183
+  colsums = as.integer (sapply (matrices.norm, function (mtx) as.integer (mean (round (colSums (mtx))))))
184
+  checkTrue (all (colsums == 1))
185
+  invisible (matrices.norm)
186
+
187
+} # test.normalizeMatrices
188
+#------------------------------------------------------------------------------------------------------------------------
189
+test.renameMatrices = function (dataDir, filenames)
190
+{
191
+  print ('--- test.renameMatrices')
192
+
193
+  matrices = readMatrices (filenames)
194
+  tbl.anno = readAnnotation (dataDir)
195
+  tbl.md = createMetadata (matrices, tbl.anno)
196
+   
197
+  old.names = names (matrices)
198
+  mtxr = renameMatrices (matrices, tbl.md)
199
+  new.names = names (mtxr)
200
+
201
+  checkEquals (new.names, paste ('Hsapiens-hPDI-', old.names, sep=''))
202
+  invisible (mtxr)
203
+
204
+} # test.renameMatrices
205
+#------------------------------------------------------------------------------------------------------------------------