git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75344 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,229 @@ |
1 |
+# hPDI/import.R |
|
2 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
3 |
+library (org.Hs.eg.db) |
|
4 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
5 |
+printf <- function(...) print(noquote(sprintf(...))) |
|
6 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
7 |
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/hPDI" |
|
8 |
+kDataDir <- "~/s/data/public/TFBS/hPDI" |
|
9 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
10 |
+run = function (dataDir=kDataDir) |
|
11 |
+{ |
|
12 |
+ filenames = getMatrixFilenames (dataDir) |
|
13 |
+ matrices = readMatrices (filenames) |
|
14 |
+ tbl.anno = readAnnotation (dataDir) |
|
15 |
+ |
|
16 |
+ # make sure that all of the matrix names are geneSymbols represented in tbl.anno |
|
17 |
+ |
|
18 |
+ stopifnot (length (intersect (names (matrices), tbl.anno$geneSymbol)) == length (names (matrices))) |
|
19 |
+ |
|
20 |
+ tbl.md = createMetadata (matrices, tbl.anno) |
|
21 |
+ |
|
22 |
+ # also required: the order of tbl.anno entries exactly follow that of the matrices |
|
23 |
+ |
|
24 |
+ stopifnot (names (matrices) == tbl.md$providerName) |
|
25 |
+ matrices = renameMatrices (normalizeMatrices (matrices), tbl.md) |
|
26 |
+ serializedFile <- "hPDI.RData" |
|
27 |
+ save (matrices, tbl.md, file=serializedFile) |
|
28 |
+ printf("saved %d matrices to %s", length(matrices), serializedFile) |
|
29 |
+ printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
30 |
+ |
|
31 |
+ invisible (list (matrices=matrices, md=tbl.md)) |
|
32 |
+ |
|
33 |
+} # run |
|
34 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
35 |
+getMatrixFilenames = function (dataDir) |
|
36 |
+{ |
|
37 |
+ files = list.files (file.path(dataDir, "pwm")) |
|
38 |
+ invisible (file.path(dataDir, "pwm", files)) |
|
39 |
+ |
|
40 |
+} # getMatrixFilenames |
|
41 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
42 |
+readMatrices = function (filenames) |
|
43 |
+{ |
|
44 |
+ max = length (filenames) |
|
45 |
+ |
|
46 |
+ matrices = list () |
|
47 |
+ |
|
48 |
+ for (i in 1:max) { |
|
49 |
+ filename = filenames [i] |
|
50 |
+ #gene.name = strsplit (filename, '\\.')[[1]][1] |
|
51 |
+ gene.name = sub(".output", "", basename(filename)) |
|
52 |
+ mtx = as.matrix (read.table (filename, header=FALSE, sep='\t', row.names=1)) |
|
53 |
+ colnames (mtx) = 1:ncol (mtx) |
|
54 |
+ matrices [[gene.name]] = mtx |
|
55 |
+ } # readMatrices |
|
56 |
+ |
|
57 |
+ invisible (matrices) |
|
58 |
+ |
|
59 |
+} # readMatrices |
|
60 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
61 |
+extractProteinIDs = function (raw.list) |
|
62 |
+{ |
|
63 |
+ # first go after all the explicit uniprot ids, map them to refseq NP |
|
64 |
+ uniprot.rows = grep ('Source:Uniprot', raw.list) |
|
65 |
+ uniprot.ids.raw = raw.list [uniprot.rows] |
|
66 |
+ uniprot.names = sapply (strsplit ((uniprot.ids.raw), ';Acc:'), function (tokens) tokens [2]) |
|
67 |
+ uniprot.names = gsub ("\\].*$", '', uniprot.names) |
|
68 |
+ mapped.refseq.names = uniprotToRefSeq (uniprot.names) |
|
69 |
+ |
|
70 |
+ protein = rep (NA_character_, length (raw.list)) |
|
71 |
+ protein [uniprot.rows] = as.vector (mapped.refseq.names) |
|
72 |
+ |
|
73 |
+ # now grab the rows with explicit RefSeq NP idneitifers |
|
74 |
+ refseq.rows = grep ('Source:RefSeq', raw.list) |
|
75 |
+ refseq.ids.raw = raw.list [refseq.rows] |
|
76 |
+ refseq.names = sapply (strsplit ((refseq.ids.raw), ';Acc:'), function (tokens) tokens [2]) |
|
77 |
+ refseq.names = gsub ("\\].*$", '', refseq.names) |
|
78 |
+ |
|
79 |
+ protein [refseq.rows] = refseq.names |
|
80 |
+ still.na.rows = which (is.na (protein)) |
|
81 |
+ #mystery.rows = setdiff (1:nrow (tbl.anno), c (uniprot.rows, refseq.rows)) |
|
82 |
+ invisible (protein) |
|
83 |
+ |
|
84 |
+} # extractProteinIDs |
|
85 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
86 |
+readRawAnnotation = function (dataDir) |
|
87 |
+{ |
|
88 |
+ filename <- file.path(dataDir, "protein_annotation.txt") |
|
89 |
+ tbl.anno = read.table (filename, sep='\t', header=T, fill=T, stringsAsFactors=FALSE, quote="") |
|
90 |
+ |
|
91 |
+ # oddly, there are 37 empty columns in this data.frame. locate the first of these, then delete all |
|
92 |
+ stopifnot ('X' %in% colnames (tbl.anno)) |
|
93 |
+ first.empty.column = which (colnames (tbl.anno) == 'X') |
|
94 |
+ tbl.anno = tbl.anno [, 1:(first.empty.column-1)] |
|
95 |
+ stopifnot (length (colnames (tbl.anno)) == 16) |
|
96 |
+ |
|
97 |
+ #protein.ids = extractIdentifiers (tbl.anno$ensembl_description) |
|
98 |
+ # the goal is to create a new 'protein' column for the table, all refseq NP identifers, or NA |
|
99 |
+ #tbl.anno = cbind (tbl.anno, protein, stringsAsFactors=FALSE) |
|
100 |
+ invisible (tbl.anno) |
|
101 |
+ |
|
102 |
+} # readRawAnnotation |
|
103 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
104 |
+addProteinColumn = function (tbl.anno) |
|
105 |
+{ |
|
106 |
+ target = "ensembl_description" |
|
107 |
+ stopifnot (target %in% colnames (tbl.anno)) |
|
108 |
+ ensembl.desc = tbl.anno [, target] |
|
109 |
+ protein = extractProteinIDs (ensembl.desc) |
|
110 |
+ invisible (cbind (tbl.anno, protein, stringsAsFactors=FALSE)) |
|
111 |
+ |
|
112 |
+} # addProteinColumn |
|
113 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
114 |
+# remove all but symbol, locusID, Pfam_domains and protein. rename the first three |
|
115 |
+trimAnnoTable = function (tbl.anno) |
|
116 |
+{ |
|
117 |
+ tbl.fixed = tbl.anno [, c ('symbol', 'locusID', 'Pfam_domains', 'protein')] |
|
118 |
+ colnames (tbl.fixed) = c ('geneSymbol', 'geneID', 'pfamDomain', 'protein') |
|
119 |
+ |
|
120 |
+ invisible (tbl.fixed) |
|
121 |
+ |
|
122 |
+} # trimAnnoTable |
|
123 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
124 |
+# read raw, then process and trim to get a tidy 4-column table |
|
125 |
+readAnnotation = function (dataDir) |
|
126 |
+{ |
|
127 |
+ tbl.anno.raw = readRawAnnotation (dataDir) |
|
128 |
+ tbl.anno = addProteinColumn (tbl.anno.raw) |
|
129 |
+ tbl.anno = trimAnnoTable (tbl.anno) |
|
130 |
+ |
|
131 |
+ invisible (tbl.anno) |
|
132 |
+ |
|
133 |
+} # readAnnotation |
|
134 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
135 |
+createMetadata = function (matrices, tbl.anno) |
|
136 |
+{ |
|
137 |
+ options ('stringsAsFactors'=FALSE) |
|
138 |
+ dataSource = 'hPDI' |
|
139 |
+ organism = 'Hsapiens' |
|
140 |
+ provider.names = names (matrices) |
|
141 |
+ full.names = paste (organism, dataSource, provider.names, sep='-') |
|
142 |
+ |
|
143 |
+ tbl.md = data.frame (providerName=provider.names) |
|
144 |
+ rownames (tbl.md) = full.names |
|
145 |
+ tbl.md = cbind (tbl.md, providerId=provider.names) |
|
146 |
+ |
|
147 |
+ # indices are crucial, retreiving values from tbl.anno columns in an order that corresponds to the |
|
148 |
+ # order of the provider.names obtained from the names of the matrices |
|
149 |
+ |
|
150 |
+ indices = match (provider.names, tbl.anno$geneSymbol) |
|
151 |
+ geneId = as.character (tbl.anno$geneID [indices]) |
|
152 |
+ |
|
153 |
+ egSyms = as.character (mget (geneId, org.Hs.egSYMBOL, ifnotfound=NA)) |
|
154 |
+ egSyms [egSyms=='NA'] == NA_character_ |
|
155 |
+ |
|
156 |
+ tbl.md = cbind (tbl.md, dataSource = rep (dataSource, nrow (tbl.md))) |
|
157 |
+ |
|
158 |
+ tbl.md = cbind (tbl.md, geneSymbol=egSyms) |
|
159 |
+ tbl.md = cbind (tbl.md, geneId) |
|
160 |
+ tbl.md = cbind (tbl.md, geneIdType=rep('ENTREZ', nrow (tbl.md))) |
|
161 |
+ |
|
162 |
+ |
|
163 |
+ # find NP protein ids for each matrix. these are provided by hPDI, and made into NPs consistently by me. |
|
164 |
+ |
|
165 |
+ |
|
166 |
+ protein.ids = tbl.anno$protein [indices] |
|
167 |
+ tbl.md = cbind (tbl.md, proteinId=tbl.anno$protein [indices]) |
|
168 |
+ |
|
169 |
+ proteinIdType = rep (NA_character_, length (protein.ids)) |
|
170 |
+ refseq.protein.ids = grep ('NP_', protein.ids) |
|
171 |
+ proteinIdType [refseq.protein.ids] = 'REFSEQ' |
|
172 |
+ tbl.md = cbind (tbl.md, proteinIdType) |
|
173 |
+ |
|
174 |
+ tbl.md = cbind (tbl.md, organism = rep (organism, nrow (tbl.md))) |
|
175 |
+ |
|
176 |
+ counts = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx)))))) |
|
177 |
+ tbl.md = cbind (tbl.md, sequenceCount=counts) |
|
178 |
+ |
|
179 |
+ tbl.md = cbind (tbl.md, bindingSequence=rep(NA_character_, nrow (tbl.md))) |
|
180 |
+ |
|
181 |
+ bindingDomain = tbl.anno$pfamDomain [indices] |
|
182 |
+ tbl.md = cbind (tbl.md, bindingDomain) |
|
183 |
+ |
|
184 |
+ tbl.md = cbind (tbl.md, tfFamily=rep(NA_character_, nrow (tbl.md))) |
|
185 |
+ |
|
186 |
+ experiment.types = rep (NA_character_, nrow (tbl.md)) |
|
187 |
+ tbl.md = cbind (tbl.md, experimentType=experiment.types) |
|
188 |
+ |
|
189 |
+ pubmedIDs = rep (NA_character_) |
|
190 |
+ |
|
191 |
+ tbl.md = cbind (tbl.md, pubmedID=pubmedIDs) |
|
192 |
+ |
|
193 |
+ invisible (tbl.md) |
|
194 |
+ |
|
195 |
+} # createMetadata |
|
196 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
197 |
+uniprotToRefSeq = function (ids) |
|
198 |
+{ |
|
199 |
+ if (!exists ('tbl.prot')) { |
|
200 |
+ tbl.refseq = toTable (org.Hs.egREFSEQ) |
|
201 |
+ np.only = grep ('NP_', tbl.refseq$accession) |
|
202 |
+ stopifnot (length (np.only) > 1000) # crude sanity check |
|
203 |
+ tbl.refseq = tbl.refseq [np.only,] |
|
204 |
+ tbl.uniprot = toTable (org.Hs.egUNIPROT) |
|
205 |
+ tbl.prot <<- merge (tbl.refseq, tbl.uniprot, all.x=TRUE) |
|
206 |
+ } # create tbl.protx |
|
207 |
+ |
|
208 |
+ indices = match (ids, tbl.prot$uniprot_id) |
|
209 |
+ result = tbl.prot$accession [indices] |
|
210 |
+ names (result) = ids |
|
211 |
+ return (result) |
|
212 |
+ |
|
213 |
+} # uniprotToRefSeq |
|
214 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
215 |
+normalizeMatrices = function (matrices) |
|
216 |
+{ |
|
217 |
+ mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) |
|
218 |
+ invisible (mtx.normalized) |
|
219 |
+ |
|
220 |
+} # normalizeMatrices |
|
221 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
222 |
+renameMatrices = function (matrices, tbl.md) |
|
223 |
+{ |
|
224 |
+ stopifnot (names (matrices) == tbl.md$providerName) # same length, same order |
|
225 |
+ names (matrices) = rownames (tbl.md) |
|
226 |
+ invisible (matrices) |
|
227 |
+ |
|
228 |
+} # renameMatrices |
|
229 |
+#------------------------------------------------------------------------------------------------------------------------ |
0 | 230 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,205 @@ |
1 |
+# hPDI/test.R |
|
2 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
3 |
+library (RUnit) |
|
4 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
5 |
+source("import.R") |
|
6 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
7 |
+run.tests = function (dataDir=kDataDir) |
|
8 |
+{ |
|
9 |
+ #freshStart () |
|
10 |
+ |
|
11 |
+ x.filenames <- test.getMatrixFilenames (dataDir) |
|
12 |
+ x.matrices <- test.readMatrices (x.filenames) |
|
13 |
+ x.tbl.anno.raw <- test.readRawAnnotation (dataDir) |
|
14 |
+ x.refseqs <- test.uniprotToRefSeq (x.tbl.anno.raw) |
|
15 |
+ test.extractProteinIDs (dataDir) |
|
16 |
+ x.tbl.annop.raw <- test.addProteinColumn (x.tbl.anno.raw) |
|
17 |
+ x.tbl.annof.raw <- test.trimAnnoTable (x.tbl.annop.raw) |
|
18 |
+ |
|
19 |
+ # now, one summary test for the above tests combined into one method |
|
20 |
+ x.tbl.anno <- test.readAnnotation (dataDir) |
|
21 |
+ |
|
22 |
+ x.tbl.md <- test.createMetadata (dataDir) |
|
23 |
+ x.mat5f <- test.normalizeMatrices (x.matrices [1:5]) |
|
24 |
+ x.mat5fr <- test.renameMatrices (dataDir, x.filenames) |
|
25 |
+ |
|
26 |
+} # run.tests |
|
27 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
28 |
+test.getMatrixFilenames = function (dataDir) |
|
29 |
+{ |
|
30 |
+ print("--- test.getMatrixFilenames") |
|
31 |
+ filenames = getMatrixFilenames (dataDir) |
|
32 |
+ checkTrue (length (filenames) == 437) |
|
33 |
+ checkTrue(file.exists(filenames[1])) |
|
34 |
+ checkTrue(file.exists(filenames[437])) |
|
35 |
+ |
|
36 |
+ invisible (filenames) |
|
37 |
+ |
|
38 |
+} # test.getMatrixFilenames |
|
39 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
40 |
+test.readMatrices = function (filenames) |
|
41 |
+{ |
|
42 |
+ print ('--- test.readMatrices') |
|
43 |
+ matrices = readMatrices (filenames[1:10]) |
|
44 |
+ checkEquals (length (matrices), 10) |
|
45 |
+ checkEquals(names(matrices)[1], "ABCF2") |
|
46 |
+ checkEquals(dim(matrices[["ABCF2"]]), c(4,6)) |
|
47 |
+ checkEquals(rownames(matrices[["ABCF2"]]), c("A", "C", "G", "T")) |
|
48 |
+ invisible (matrices) |
|
49 |
+ |
|
50 |
+} # test.readMatrices |
|
51 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
52 |
+test.extractProteinIDs = function (dataDir) |
|
53 |
+{ |
|
54 |
+ print ('--- test.extractProteinIDs') |
|
55 |
+ |
|
56 |
+ filename <- file.path(dataDir, "protein_annotation.txt") |
|
57 |
+ raw.list = tail (read.table (filename, sep='\t', header=T, fill=T, stringsAsFactors=FALSE, quote="")$ensembl_description, n=16) |
|
58 |
+ protein.ids <<- extractProteinIDs (raw.list) |
|
59 |
+ checkEquals (length (protein.ids), length (raw.list)) |
|
60 |
+ checkEquals (protein.ids [2], 'NP_001012677') |
|
61 |
+ checkEquals (protein.ids [5], 'NP_001074306') |
|
62 |
+ checkTrue (is.na (protein.ids [10])) # Q9GND9 -> NA |
|
63 |
+ checkEquals (protein.ids [11], 'NP_001139192') # Q8NAP8 -> ZBT8B -> NP_001139192.1 |
|
64 |
+ |
|
65 |
+} # test.extractProteinIDs |
|
66 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
67 |
+test.readRawAnnotation = function (dataDir) |
|
68 |
+{ |
|
69 |
+ print ('--- test.readRawAnnotation') |
|
70 |
+ tbl.anno = readRawAnnotation (dataDir) |
|
71 |
+ checkEquals (dim (tbl.anno), c (4191, 16)) |
|
72 |
+ # are all possible proteins mapped to refseq? |
|
73 |
+ |
|
74 |
+ invisible (tbl.anno) |
|
75 |
+ |
|
76 |
+} # test.readRawAnnotation |
|
77 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
78 |
+test.addProteinColumn = function (tbl.anno) |
|
79 |
+{ |
|
80 |
+ print ('--- test.addProteinColumn') |
|
81 |
+ tbl.small = tail (tbl.anno, n=16) |
|
82 |
+ tbl.sp = addProteinColumn (tbl.small) |
|
83 |
+ checkEquals (ncol (tbl.small) + 1, ncol (tbl.sp)) |
|
84 |
+ checkEquals (nrow (tbl.small), nrow (tbl.sp)) |
|
85 |
+ checkEquals (colnames (tbl.sp) [length (colnames (tbl.sp))], 'protein') |
|
86 |
+ proteins = tbl.sp$protein |
|
87 |
+ checkEquals (length (grep ('NP_', proteins)), 3) |
|
88 |
+ checkEquals (length (which (is.na (proteins))), 13) |
|
89 |
+ |
|
90 |
+ invisible (tbl.sp) |
|
91 |
+ |
|
92 |
+} # test.addProteinColumn |
|
93 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
94 |
+test.trimAnnoTable = function (tbl.anno) |
|
95 |
+{ |
|
96 |
+ print ('--- test.trimAnnoTable') |
|
97 |
+ checkEquals (ncol (tbl.anno), 17) |
|
98 |
+ tbl.trimmed = trimAnnoTable (tbl.anno) |
|
99 |
+ checkEquals (ncol (tbl.trimmed), 4) |
|
100 |
+ checkEquals (colnames (tbl.trimmed), c ('geneSymbol', 'geneID', 'pfamDomain', 'protein')) |
|
101 |
+ invisible (tbl.trimmed) |
|
102 |
+ |
|
103 |
+} # test.trimAnnoTable |
|
104 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
105 |
+test.readAnnotation = function (dataDir) |
|
106 |
+{ |
|
107 |
+ print("--- test.readAnnotation") |
|
108 |
+ tbl.anno = readAnnotation (dataDir) |
|
109 |
+ checkEquals (dim (tbl.anno), c (4191, 4)) |
|
110 |
+ checkEquals (colnames (tbl.anno), c ("geneSymbol", "geneID", "pfamDomain", "protein")) |
|
111 |
+ |
|
112 |
+ invisible (tbl.anno) |
|
113 |
+ |
|
114 |
+} # test.readAnnotation |
|
115 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
116 |
+test.createMetadata = function (dataDir) |
|
117 |
+{ |
|
118 |
+ print ('--- test.createMetadata') |
|
119 |
+ tbl.anno = readAnnotation (dataDir) |
|
120 |
+ filenames = getMatrixFilenames (dataDir) |
|
121 |
+ matrices.all = readMatrices (filenames) |
|
122 |
+ # take the top 3, and to be sure of a good test, add two with no protein annotation provided |
|
123 |
+ |
|
124 |
+ names.of.na.protein.matrices = head (subset (tbl.anno, geneSymbol %in% names (matrices.all) & is.na (protein)), n=2)$geneSymbol |
|
125 |
+ matrices = c (matrices.all [1:3], matrices.all [names.of.na.protein.matrices]) |
|
126 |
+ tbl.md = createMetadata (matrices, tbl.anno) |
|
127 |
+ |
|
128 |
+ checkEquals (nrow (tbl.md), 5) |
|
129 |
+ checkEquals (colnames (tbl.md), c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", |
|
130 |
+ "proteinId", "proteinIdType", "organism", "sequenceCount", "bindingSequence", |
|
131 |
+ "bindingDomain", "tfFamily", "experimentType", "pubmedID")) |
|
132 |
+ |
|
133 |
+ checkTrue (all (!is.na (tbl.md$providerName))) |
|
134 |
+ checkTrue (all (!is.na (tbl.md$providerId))) |
|
135 |
+ checkTrue (all (!is.na (tbl.md$dataSource))) |
|
136 |
+ checkTrue (all (!is.na (tbl.md$organism))) |
|
137 |
+ checkTrue (all (!is.na (tbl.md$sequenceCount))) |
|
138 |
+ |
|
139 |
+ checkTrue (all (is.na (tbl.md$bindingSequence))) |
|
140 |
+ |
|
141 |
+ checkTrue (all (tbl.md$dataSource=='hPDI')) |
|
142 |
+ checkTrue (is.integer (tbl.md$sequenceCount)) |
|
143 |
+ checkTrue (is.character (tbl.md$geneId)) |
|
144 |
+ |
|
145 |
+ syms = tbl.md$geneSymbol |
|
146 |
+ ids = tbl.md$geneId |
|
147 |
+ predicted.ids = as.character (mget (syms, org.Hs.egSYMBOL2EG)) |
|
148 |
+ checkEquals (ids, predicted.ids) |
|
149 |
+ |
|
150 |
+ invisible (tbl.md) |
|
151 |
+ |
|
152 |
+} # test.createMetadata |
|
153 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
154 |
+test.uniprotToRefSeq = function (tbl.anno) |
|
155 |
+{ |
|
156 |
+ print ('--- test.uniprotToRefSeq') |
|
157 |
+ samples = tbl.anno$protein [sample (1:nrow (tbl.anno), size=10, replace=FALSE)] |
|
158 |
+ samples = c ("Q13523", "Q9UER7", "P52738", "Q96Q11", "Q8NCF5", "Q15784", "Q01664", "O14980", "P0C1Z6") |
|
159 |
+ result = uniprotToRefSeq (samples) |
|
160 |
+ checkEquals (names (result), samples) |
|
161 |
+ refseqs = as.character (result) |
|
162 |
+ checkEquals (length (grep ('^NP_', as.character (refseqs))), length (samples)) |
|
163 |
+ |
|
164 |
+ samples.2 = c ('foo', "Q9UER7") |
|
165 |
+ result.2 = uniprotToRefSeq (samples.2) |
|
166 |
+ checkEquals (names (result.2), samples.2) |
|
167 |
+ checkEquals (as.vector (result.2), c (NA_character_, "NP_001135441")) |
|
168 |
+ |
|
169 |
+ invisible (result) |
|
170 |
+ |
|
171 |
+} # test.uniprotToRefSeq |
|
172 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
173 |
+test.normalizeMatrices = function (matrices) |
|
174 |
+{ |
|
175 |
+ print ('--- test.normalizeMatrices') |
|
176 |
+ |
|
177 |
+ colsums = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx)))))) |
|
178 |
+ checkTrue (all (colsums > 1)) |
|
179 |
+ |
|
180 |
+ matrices.norm = normalizeMatrices (matrices) |
|
181 |
+ checkEquals (names (matrices.norm), names (matrices)) |
|
182 |
+ |
|
183 |
+ colsums = as.integer (sapply (matrices.norm, function (mtx) as.integer (mean (round (colSums (mtx)))))) |
|
184 |
+ checkTrue (all (colsums == 1)) |
|
185 |
+ invisible (matrices.norm) |
|
186 |
+ |
|
187 |
+} # test.normalizeMatrices |
|
188 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
189 |
+test.renameMatrices = function (dataDir, filenames) |
|
190 |
+{ |
|
191 |
+ print ('--- test.renameMatrices') |
|
192 |
+ |
|
193 |
+ matrices = readMatrices (filenames) |
|
194 |
+ tbl.anno = readAnnotation (dataDir) |
|
195 |
+ tbl.md = createMetadata (matrices, tbl.anno) |
|
196 |
+ |
|
197 |
+ old.names = names (matrices) |
|
198 |
+ mtxr = renameMatrices (matrices, tbl.md) |
|
199 |
+ new.names = names (mtxr) |
|
200 |
+ |
|
201 |
+ checkEquals (new.names, paste ('Hsapiens-hPDI-', old.names, sep='')) |
|
202 |
+ invisible (mtxr) |
|
203 |
+ |
|
204 |
+} # test.renameMatrices |
|
205 |
+#------------------------------------------------------------------------------------------------------------------------ |