git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75316 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,242 @@ |
1 |
+# ScerTF/import.R |
|
2 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
3 |
+library(RUnit) |
|
4 |
+library(org.Sc.sgd.db) |
|
5 |
+library(tools) # for md5sum |
|
6 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
7 |
+printf <- function(...) print(noquote(sprintf(...))) |
|
8 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
9 |
+kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb/ScerTF" |
|
10 |
+kDataDir <- "~/s/data/public/TFBS/ScerTF/recommended/PCM" |
|
11 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
12 |
+run = function (dataDir=kDataDir) |
|
13 |
+{ |
|
14 |
+ freshStart () |
|
15 |
+ tbl.ref = createExperimentRefTable () |
|
16 |
+ all.files = getMatrixFilenames (dataDir) |
|
17 |
+ matrices = readAndParse (file.path(dataDir, all.files)) |
|
18 |
+ tbl.md = createMetadata (matrices, tbl.ref) |
|
19 |
+ matrices = renameMatrices (matrices, tbl.md) |
|
20 |
+ serializedFile <- 'ScerTF.RData' |
|
21 |
+ save (matrices, tbl.md, file=serializedFile) |
|
22 |
+ printf("saved %d matrices to %s", length(matrices), serializedFile) |
|
23 |
+ printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
24 |
+ |
|
25 |
+ |
|
26 |
+ invisible (list (mtx=matrices, md=tbl.md)) |
|
27 |
+ |
|
28 |
+} # run |
|
29 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
30 |
+freshStart = function () |
|
31 |
+{ |
|
32 |
+ output.files.easy.to.regenerate = grep ('RData$', dir (), v=T) |
|
33 |
+ if (length (output.files.easy.to.regenerate) > 0) |
|
34 |
+ unlink (output.files.easy.to.regenerate) |
|
35 |
+ |
|
36 |
+} # freshStart |
|
37 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
38 |
+createMatrixNameUniqifier = function (matrix) |
|
39 |
+{ |
|
40 |
+ temporary.file <<- tempfile () |
|
41 |
+ write (as.character (matrix), file=temporary.file) |
|
42 |
+ md5sum.string <<- as.character (md5sum (temporary.file)) |
|
43 |
+ stopifnot (nchar (md5sum.string) == 32) |
|
44 |
+ md5.6chars = substr (md5sum.string, 29, 32) |
|
45 |
+ #unlink (temporary.file) |
|
46 |
+ |
|
47 |
+} # createMatrixNameUniqifier |
|
48 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
49 |
+createExperimentRefTable = function () |
|
50 |
+{ |
|
51 |
+ options (stringsAsFactors = FALSE) |
|
52 |
+ tbl.ref = data.frame (author=c('badis', 'foat', 'fordyce', 'harbison', 'macisaac', 'morozov', 'pachkov', 'spivak', 'zhao', 'zhu')) |
|
53 |
+ tbl.ref = cbind (tbl.ref, year=c(2008,2008, 2010, 2004, 2006, 2007, 2007, 2012, 2011, 2009)) |
|
54 |
+ tbl.ref = cbind (tbl.ref, pmid=c('19111667', '17947326', '20802496', '15343339', '16522208', '17438293', '17130146', |
|
55 |
+ '22140105', '21654662', '19158363')) |
|
56 |
+ tbl.ref = cbind (tbl.ref, organism=rep('Scerevisiae', nrow (tbl.ref))) |
|
57 |
+ |
|
58 |
+ titles = c ('A library of yeast transcription factor motifs reveals a widespread function for Rsc3 in targeting nucleosome exclusion at promoters', |
|
59 |
+ 'TransfactomeDB: a resource for exploring the nucleotide sequence specificity and condition-specific regulatory activity of trans-acting factors.', |
|
60 |
+ 'De novo identification and biophysical characterization of transcription-factor binding sites with microfluidic affinity analysis.', |
|
61 |
+ 'Transcriptional regulatory code of a eukaryotic genome.', |
|
62 |
+ 'An improved map of conserved regulatory sites for Saccharomyces cerevisiae', |
|
63 |
+ 'Connecting protein structure with predictions of regulatory sites.', |
|
64 |
+ 'SwissRegulon: a database of genome-wide annotations of regulatory sites.', |
|
65 |
+ 'ScerTF: a comprehensive database of benchmarked position weight matrices for Saccharomyces species.', |
|
66 |
+ 'Quantitative analysis demonstrates most transcription factors require only simple models of specificity.', |
|
67 |
+ 'High-resolution DNA-binding specificity analysis of yeast transcription factors') |
|
68 |
+ |
|
69 |
+ tbl.ref = cbind (tbl.ref, titles) |
|
70 |
+ |
|
71 |
+ tbl.ref |
|
72 |
+ |
|
73 |
+} # createExperimentRefTable |
|
74 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
75 |
+parsePWMfromText = function (lines.of.text) |
|
76 |
+{ |
|
77 |
+ if (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) != 4) { |
|
78 |
+ print (lines.of.text) |
|
79 |
+ return (NA) |
|
80 |
+ } |
|
81 |
+ |
|
82 |
+ for (line in lines.of.text) { |
|
83 |
+ tokens = strsplit (line, '\\s*[:\\|]') [[1]] |
|
84 |
+ nucleotide = tokens [1] |
|
85 |
+ numbers.raw = tokens [2] |
|
86 |
+ number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]] |
|
87 |
+ while (nchar (number.tokens [1]) == 0) |
|
88 |
+ number.tokens = number.tokens [-1] |
|
89 |
+ numbers = as.numeric (number.tokens) |
|
90 |
+ if (!exists ('result')) |
|
91 |
+ result = matrix (nrow=4, ncol=length (numbers), byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:length(numbers))) |
|
92 |
+ result [nucleotide,] = numbers |
|
93 |
+ } |
|
94 |
+ |
|
95 |
+ return (result) |
|
96 |
+ |
|
97 |
+} # parsePWMfromText |
|
98 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
99 |
+# we expect exactly one matrix per file |
|
100 |
+readAndParse = function (filenames) |
|
101 |
+{ |
|
102 |
+ # this script is in the directory with all of the PWM files. exclude it |
|
103 |
+ |
|
104 |
+ files.to.remove = c ('go.R', 'matrices.ScerTF.RData', 'tbl.md.ScerTF.RData') |
|
105 |
+ for (file.to.remove in files.to.remove) { |
|
106 |
+ removers = grep (file.to.remove, filenames) |
|
107 |
+ if (length (removers) > 0) |
|
108 |
+ filenames = filenames [-removers] |
|
109 |
+ } # for file.to.remove |
|
110 |
+ |
|
111 |
+ matrices = list () |
|
112 |
+ |
|
113 |
+ for (filename in filenames) { |
|
114 |
+ #browser("readAndParse") |
|
115 |
+ lines.of.text = scan (filename, what=character(0), sep='\n', quiet=TRUE) |
|
116 |
+ mtx = parsePWMfromText (lines.of.text) |
|
117 |
+ mtx.normalized = apply (mtx, 2, function (colvector) colvector / sum (colvector)) |
|
118 |
+ matrices [[basename(filename)]] = mtx.normalized |
|
119 |
+ } |
|
120 |
+ |
|
121 |
+ invisible (matrices) |
|
122 |
+ |
|
123 |
+} # readAndParse |
|
124 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
125 |
+toOrf = function (gene.names, quiet=FALSE) |
|
126 |
+{ |
|
127 |
+ orfs = mget (gene.names, org.Sc.sgdCOMMON2ORF, ifnotfound=NA) |
|
128 |
+ failures = which (is.na (orfs)) |
|
129 |
+ # make sure that all of the failures are simply gene.names which are already orfs |
|
130 |
+ |
|
131 |
+ # some genes -- unpopular ones? :} -- have a classic orf name as gene symbol. these do not apper in sgdCOMMON2ORF |
|
132 |
+ # don't protest these, rather, just notify the caller of *other* symbols which failed to map |
|
133 |
+ # this failure situation can be recognized if there are more failures (NAs) than orf names (Y.*) found in the input |
|
134 |
+ # gene.names |
|
135 |
+ |
|
136 |
+ if (length (grep ('^Y', names (failures))) != length (failures)) { |
|
137 |
+ if (!quiet) { |
|
138 |
+ printf ('error. could not find orf for "%s"', list.to.string (gene.names [failures])) |
|
139 |
+ } # !quiet |
|
140 |
+ } |
|
141 |
+ |
|
142 |
+ orfs [as.integer (failures)] = names (failures) |
|
143 |
+ for (gene.name in names (orfs)) { |
|
144 |
+ orfs [[gene.name]] = orfs [[gene.name]][1] |
|
145 |
+ } # for gene.name |
|
146 |
+ |
|
147 |
+ unlist (unname (orfs)) |
|
148 |
+ |
|
149 |
+} # toOrf |
|
150 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
151 |
+toUniprot = function (orfs, quiet=FALSE) |
|
152 |
+{ |
|
153 |
+ uniprots = mget (orfs, org.Sc.sgdUNIPROT, ifnotfound=NA) |
|
154 |
+ failures = which (is.na (uniprots)) |
|
155 |
+ |
|
156 |
+ for (orf in names (uniprots)) { |
|
157 |
+ uniprots [[orf]] = uniprots [[orf]][1] |
|
158 |
+ } # for orf |
|
159 |
+ |
|
160 |
+ unlist (unname (uniprots)) |
|
161 |
+ |
|
162 |
+} # toUniprot |
|
163 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
164 |
+# and rename the matrices to match the rownames of the tbl.md created here |
|
165 |
+createMetadata = function (matrices, tbl.ref) |
|
166 |
+{ |
|
167 |
+ options ('stringsAsFactors'=FALSE) |
|
168 |
+ dataSource = 'ScerTF' |
|
169 |
+ organism = 'Scerevisiae' |
|
170 |
+ |
|
171 |
+ #tbl.ref = createExperimentRefTable () |
|
172 |
+ native.names.raw = names (matrices) |
|
173 |
+ native.names.reversed = as.character (sapply (native.names.raw, |
|
174 |
+ function (s) {tokens = strsplit (s, '\\.')[[1]]; return (paste (tokens [2], tokens[1], sep='-'))})) |
|
175 |
+ |
|
176 |
+ native.names = as.character (sapply (names (matrices), function (name) strsplit (name, '\\.') [[1]][2])) |
|
177 |
+ gene.symbols = native.names |
|
178 |
+ |
|
179 |
+ md5sum.suffix.uniqifiers = as.character (sapply (matrices, function (matrix) createMatrixNameUniqifier (matrix))) |
|
180 |
+ native.names.uniqified = paste (native.names, md5sum.suffix.uniqifiers, sep='-') |
|
181 |
+ full.names = paste (organism, dataSource, native.names.reversed, sep='-') |
|
182 |
+ |
|
183 |
+ tbl.md = data.frame (providerName=native.names.raw) |
|
184 |
+ rownames (tbl.md) = full.names |
|
185 |
+ names (matrices) = full.names |
|
186 |
+ tbl.md = cbind (tbl.md, providerId=gene.symbols) |
|
187 |
+ tbl.md = cbind (tbl.md, dataSource = rep ('ScerTF', nrow (tbl.md))) |
|
188 |
+ tbl.md = cbind (tbl.md, geneSymbol=gene.symbols) |
|
189 |
+ |
|
190 |
+ orfs = as.character (toOrf (native.names)) |
|
191 |
+ tbl.md = cbind (tbl.md, geneId=orfs) |
|
192 |
+ tbl.md = cbind (tbl.md, geneIdType=rep('SGD', nrow (tbl.md))) |
|
193 |
+ |
|
194 |
+ uniprots <<- toUniprot (orfs) |
|
195 |
+ tbl.md = cbind (tbl.md, proteinId=uniprots) |
|
196 |
+ protein.id.types = rep('UNIPROT', nrow (tbl.md)) |
|
197 |
+ NA.protein.ids = which (is.na (uniprots)) |
|
198 |
+ if (length (NA.protein.ids) > 0) |
|
199 |
+ protein.id.types [NA.protein.ids] = NA |
|
200 |
+ |
|
201 |
+ tbl.md = cbind (tbl.md, proteinIdType=protein.id.types) |
|
202 |
+ |
|
203 |
+ tbl.md = cbind (tbl.md, organism = rep ('Scerevisiae', nrow (tbl.md))) |
|
204 |
+ |
|
205 |
+ # all matrices have colSums of 100, or very close, suggesting that these are not real counts but are normalized |
|
206 |
+ tbl.md = cbind (tbl.md, sequenceCount = rep (NA_integer_, nrow (tbl.md))) |
|
207 |
+ tbl.md = cbind (tbl.md, bindingSequence=rep(NA_character_, nrow (tbl.md))) |
|
208 |
+ tbl.md = cbind (tbl.md, bindingDomain=rep(NA_character_, nrow (tbl.md))) |
|
209 |
+ |
|
210 |
+ |
|
211 |
+ tbl.md = cbind (tbl.md, tfFamily=rep(NA_character_, nrow (tbl.md))) |
|
212 |
+ tbl.md = cbind (tbl.md, experimentType=rep(NA_character_, nrow (tbl.md))) |
|
213 |
+ authors = as.character (sapply (tbl.md$providerName, function (provider.name) strsplit (provider.name, '\\.')[[1]][1])) |
|
214 |
+ pmid = as.character (sapply (authors, function (athr) subset (tbl.ref, author==athr)$pmid)) |
|
215 |
+ tbl.md = cbind (tbl.md, pubmedID=pmid) |
|
216 |
+ |
|
217 |
+ tbl.md |
|
218 |
+ |
|
219 |
+} # createMetadata |
|
220 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
221 |
+getMatrixFilenames = function (dataDir) |
|
222 |
+{ |
|
223 |
+ all.files = list.files (dataDir) |
|
224 |
+ files.to.exclude = c ('go.R', 'RCS', 'rdata') |
|
225 |
+ for (file in files.to.exclude) { |
|
226 |
+ removers = grep (file, all.files, ignore.case=T) |
|
227 |
+ if (length (removers) > 0) |
|
228 |
+ all.files = all.files [-removers] |
|
229 |
+ } |
|
230 |
+ |
|
231 |
+ invisible (all.files) |
|
232 |
+ |
|
233 |
+} # getMatrixFilenames |
|
234 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
235 |
+renameMatrices = function (matrices, tbl.md) |
|
236 |
+{ |
|
237 |
+ stopifnot (length (matrices) == nrow (tbl.md)) |
|
238 |
+ names (matrices) = rownames (tbl.md) |
|
239 |
+ invisible (matrices) |
|
240 |
+ |
|
241 |
+} # renameMatrices |
|
242 |
+#------------------------------------------------------------------------------------------------------------------------ |
0 | 243 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,206 @@ |
1 |
+# ScerTF/test.R |
|
2 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
3 |
+library (RUnit) |
|
4 |
+library (org.Sc.sgd.db) |
|
5 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
6 |
+source("import.R") |
|
7 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
8 |
+run.tests = function (dataDir=kDataDir) |
|
9 |
+{ |
|
10 |
+ freshStart () |
|
11 |
+ x.filenames <<- test.getMatrixFilenames (dataDir) |
|
12 |
+ txxa <<- test.createMatrixNameUniqifier () |
|
13 |
+ txx0 <<- test.toOrf () |
|
14 |
+ txx1 <<- test.toUniprot () |
|
15 |
+ txx2 <<- test.createExperimentRefTable () |
|
16 |
+ txx3 <<- test.parsePWMfromText (dataDir) |
|
17 |
+ x.matrices <<- test.readAndParse (dataDir) |
|
18 |
+ x.tbl.md <<- test.createMetadata (dataDir) |
|
19 |
+ x.matrices.renamed <<- test.renameMatrices (x.matrices, x.tbl.md) |
|
20 |
+ |
|
21 |
+} # run.tests |
|
22 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
23 |
+test.createExperimentRefTable = function () |
|
24 |
+{ |
|
25 |
+ print ('--- test.createExperimentRefTable') |
|
26 |
+ tbl.ref = createExperimentRefTable () |
|
27 |
+ checkEquals (dim (tbl.ref), c (10, 5)) |
|
28 |
+ checkEquals (colnames (tbl.ref), c ('author', 'year', 'pmid', 'organism', 'titles')) |
|
29 |
+ |
|
30 |
+ # make sure neither author nor pmid repeat |
|
31 |
+ checkEquals (length (unique (tbl.ref$author)), nrow (tbl.ref)) |
|
32 |
+ checkEquals (length (unique (tbl.ref$pmid)), nrow (tbl.ref)) |
|
33 |
+ |
|
34 |
+ invisible (tbl.ref) |
|
35 |
+ |
|
36 |
+} # test.createExperiementsTable |
|
37 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
38 |
+test.parsePWMfromText = function (dataDir) |
|
39 |
+{ |
|
40 |
+ print ('--- test.parsePWMfromText') |
|
41 |
+ |
|
42 |
+ file <- file.path(dataDir, 'macisaac.ABF1') |
|
43 |
+ checkTrue(file.exists(file)) |
|
44 |
+ |
|
45 |
+ lines.of.text = scan (file, what=character(0), sep='\n', quiet=TRUE) |
|
46 |
+ pwm.abf1 = parsePWMfromText (lines.of.text) |
|
47 |
+ checkEquals (dim (pwm.abf1), c (4, 15)) |
|
48 |
+ checkEquals (colnames (pwm.abf1), as.character (1:15)) |
|
49 |
+ checkEquals (rownames (pwm.abf1), c ('A', 'C', 'G', 'T')) |
|
50 |
+ |
|
51 |
+} # test.parsePWMfromText |
|
52 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
53 |
+test.readAndParse = function (dataDir) |
|
54 |
+{ |
|
55 |
+ print ('--- test.readAndParse') |
|
56 |
+ |
|
57 |
+ all.files = getMatrixFilenames (dataDir) |
|
58 |
+ sample.file.1 = grep ('badis.ABF2', all.files) |
|
59 |
+ sample.file.2 = grep ('badis.CAT8', all.files) |
|
60 |
+ checkEquals (length (sample.file.1), 1) |
|
61 |
+ checkEquals (length (sample.file.2), 1) |
|
62 |
+ |
|
63 |
+ mtx.test = readAndParse (file.path(dataDir, all.files [c (sample.file.1, sample.file.2)])) |
|
64 |
+ |
|
65 |
+ checkEquals (length (mtx.test), 2) |
|
66 |
+ checkEquals (names (mtx.test), c ("badis.ABF2", "badis.CAT8")) |
|
67 |
+ |
|
68 |
+ checkTrue (all (colSums (mtx.test [[1]]) == 1)) |
|
69 |
+ checkTrue (all (colSums (mtx.test [[2]]) == 1)) |
|
70 |
+ |
|
71 |
+ checkEquals (dim (mtx.test [[1]]), c (4,6)) |
|
72 |
+ checkEquals (dim (mtx.test [[2]]), c (4,6)) |
|
73 |
+ |
|
74 |
+ invisible (mtx.test) |
|
75 |
+ |
|
76 |
+} # test.readAndParse |
|
77 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
78 |
+test.toOrf = function () |
|
79 |
+{ |
|
80 |
+ print ('--- test.toOrf') |
|
81 |
+ # SUT1 is the proper gene symbol for YGL162W, and an alias for YMR125W. mget returns both. we want just the first |
|
82 |
+ checkEquals (toOrf ('SUT1'), 'YGL162W') |
|
83 |
+ checkEquals (toOrf (c ( "STB5", "SUT1", "THO2")), c ('YHR178W', 'YGL162W', 'YNL139C')) |
|
84 |
+ |
|
85 |
+ checkEquals (toOrf ('bogus', quiet=TRUE), 'bogus') |
|
86 |
+ checkEquals (toOrf (c ( "STB5", "SUT1", "bogus", "THO2"), quiet=TRUE), c ('YHR178W', 'YGL162W', 'bogus', 'YNL139C')) |
|
87 |
+ |
|
88 |
+} # test.toOrf |
|
89 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
90 |
+test.toUniprot = function () |
|
91 |
+{ |
|
92 |
+ print ('--- test.toUniprot') |
|
93 |
+ # demonstrate the problem |
|
94 |
+ checkEquals (unlist (unname (mget ('YCR039C', org.Sc.sgdUNIPROT))), c ("P0CY08", "P0CY09")) |
|
95 |
+ |
|
96 |
+ # but we want only one |
|
97 |
+ checkEquals (toUniprot (c ('YCR039C')), "P0CY08") |
|
98 |
+ |
|
99 |
+ # make sure it works embedded in a list of orfs |
|
100 |
+ checkEquals (toUniprot (c ('YOL108C', 'YCR039C', 'YML065W')), c ("P13902", "P0CY08", "P54784")) |
|
101 |
+ |
|
102 |
+ # an unrecognized orf should return NA |
|
103 |
+ checkTrue (is.na (toUniprot ('bogus'))) |
|
104 |
+ |
|
105 |
+} # test.toUniprot |
|
106 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
107 |
+test.createMetadata = function (dataDir) |
|
108 |
+{ |
|
109 |
+ print ('--- test.createMetadata') |
|
110 |
+ |
|
111 |
+ matrices = readAndParse (file.path(dataDir, getMatrixFilenames (dataDir))) |
|
112 |
+ |
|
113 |
+ tbl.md = createMetadata (matrices, createExperimentRefTable ()) |
|
114 |
+ checkEquals (dim (tbl.md), c (length (matrices), 15)) |
|
115 |
+ expected.columns = c ("providerName", "providerId", "dataSource", "geneSymbol", "geneId", "geneIdType", "proteinId", |
|
116 |
+ "proteinIdType", "organism", "sequenceCount", "bindingSequence", "bindingDomain", "tfFamily", |
|
117 |
+ "experimentType", "pubmedID") |
|
118 |
+ |
|
119 |
+ checkEquals (colnames (tbl.md), expected.columns) |
|
120 |
+ checkEquals (unique (tbl.md$organism), 'Scerevisiae') |
|
121 |
+ ecm = 'Scerevisiae-ScerTF-ECM23-badis' |
|
122 |
+ checkTrue (ecm %in% rownames (tbl.md)) |
|
123 |
+ x = tbl.md [ecm,] |
|
124 |
+ checkEquals (x$providerName, "badis.ECM23") |
|
125 |
+ checkEquals (x$geneSymbol, "ECM23") |
|
126 |
+ checkEquals (x$geneId, "YPL021W") |
|
127 |
+ checkEquals (x$geneIdType, "SGD") |
|
128 |
+ checkEquals (x$proteinId, "Q02710") |
|
129 |
+ checkEquals (x$proteinIdType, "UNIPROT") |
|
130 |
+ checkEquals (x$pubmedID, "19111667") |
|
131 |
+ |
|
132 |
+ checkEquals (nrow (subset (tbl.md, is.na (proteinId) & !is.na (proteinIdType))), 0) |
|
133 |
+ |
|
134 |
+ invisible (tbl.md) |
|
135 |
+ |
|
136 |
+} # test.createMetadata |
|
137 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
138 |
+test.getMatrixFilenames = function (dataDir) |
|
139 |
+{ |
|
140 |
+ print ('--- test.getMatrixFilenames') |
|
141 |
+ |
|
142 |
+ checkEquals (length (getMatrixFilenames (dataDir)), 196) |
|
143 |
+ |
|
144 |
+} # test.getMatrixFilenames |
|
145 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
146 |
+test.renameMatrices = function (matrices, tbl.md, tbl.anno) |
|
147 |
+{ |
|
148 |
+ print ('--- test.renameMatrices') |
|
149 |
+ |
|
150 |
+ # try it with just the first two matrices |
|
151 |
+ checkEquals (dim (tbl.md), c (196, 15)) |
|
152 |
+ old.matrix.names = names (matrices) |
|
153 |
+ matrices.renamed = renameMatrices (matrices, tbl.md [1:2,]) |
|
154 |
+ new.matrix.names = names (matrices.renamed) |
|
155 |
+ |
|
156 |
+ #print (old.matrix.names) |
|
157 |
+ #print (names (matrices.renamed)) |
|
158 |
+ |
|
159 |
+ gene.names = sapply (strsplit (old.matrix.names, '\\.'), function (tokens) return (tokens [2])) |
|
160 |
+ author.names = sapply (strsplit (old.matrix.names, '\\.'), function (tokens) return (tokens [1])) |
|
161 |
+ |
|
162 |
+ # though order of gene and author is reversed, both should be found, in boh old and new matrix names |
|
163 |
+ for (m in 1:length (matrices)) { |
|
164 |
+ checkTrue (length (grep (gene.names [m], old.matrix.names [m])) == 1) |
|
165 |
+ checkTrue (length (grep (author.names [m], old.matrix.names [m])) == 1) |
|
166 |
+ checkTrue (length (grep (gene.names [m], new.matrix.names [m])) == 1) |
|
167 |
+ checkTrue (length (grep (author.names [m], new.matrix.names [m])) == 1) |
|
168 |
+ } |
|
169 |
+ |
|
170 |
+ |
|
171 |
+ # now check them all |
|
172 |
+ |
|
173 |
+ tbl.ref = createExperimentRefTable () |
|
174 |
+ all.files = getMatrixFilenames (dataDir) |
|
175 |
+ matrices = readAndParse (file.path(dataDir, all.files)) |
|
176 |
+ tbl.md = createMetadata (matrices, tbl.ref) |
|
177 |
+ |
|
178 |
+ old.matrix.names = names (matrices) |
|
179 |
+ matrices.renamed = renameMatrices (matrices, tbl.md) |
|
180 |
+ new.matrix.names = names (matrices.renamed) |
|
181 |
+ |
|
182 |
+ for (m in 1:length (matrices)) { |
|
183 |
+ checkTrue (length (grep (gene.names [m], old.matrix.names [m])) == 1) |
|
184 |
+ checkTrue (length (grep (author.names [m], old.matrix.names [m])) == 1) |
|
185 |
+ checkTrue (length (grep (gene.names [m], new.matrix.names [m])) == 1) |
|
186 |
+ checkTrue (length (grep (author.names [m], new.matrix.names [m])) == 1) |
|
187 |
+ } |
|
188 |
+ |
|
189 |
+ #printf ('validated %d new matrix names', length (matrices)) |
|
190 |
+ |
|
191 |
+ invisible (matrices.renamed) |
|
192 |
+ |
|
193 |
+} # test.renameMatrices |
|
194 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
195 |
+test.createMatrixNameUniqifier = function () |
|
196 |
+{ |
|
197 |
+ print ('--- test.createMatrixNameUniqifier') |
|
198 |
+ |
|
199 |
+ data = c (8,8,8,7,2,2,5,6,5,3,10,5,8,6,9,5,8,8,4,5,8,6,9,2,1,0,5,7,7,2,4,4,3,7,7,9,9,6,1,3) |
|
200 |
+ test.matrix = matrix (data=data, nrow=4, ncol=10) |
|
201 |
+ uniqifier = createMatrixNameUniqifier (test.matrix) |
|
202 |
+ xxx <<- uniqifier |
|
203 |
+ checkEquals (uniqifier, "b42f") |
|
204 |
+ |
|
205 |
+} # test.createMatrixNameUniqifier |
|
206 |
+#------------------------------------------------------------------------------------------------------------------------ |