git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75293 bc3139a8-67e5-0310-9ffc-ced21a209358
3 | 3 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,287 @@ |
1 |
+# flyFactorSurvey/import.R |
|
2 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
3 |
+library(org.Dm.eg.db) |
|
4 |
+library(biomaRt) |
|
5 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
6 |
+bindingDomainXrefSourceFile <- function() {"TFfile2b.tsv"} |
|
7 |
+printf <- function(...) print(noquote(sprintf(...))) |
|
8 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
9 |
+run = function (flyFactorSurveyRootDir) |
|
10 |
+{ |
|
11 |
+ filenames = getMatrixFilenames (flyFactorSurveyRootDir) |
|
12 |
+ |
|
13 |
+ # the rootDir has only matrix files, with one exception: |
|
14 |
+ # a file which describes the protein binding domains |
|
15 |
+ # remove that from the matrix file list |
|
16 |
+ |
|
17 |
+ removers <- grep (bindingDomainXrefSourceFile(), filenames) |
|
18 |
+ if(length(removers) > 0) |
|
19 |
+ filenames <- filenames[-removers] |
|
20 |
+ |
|
21 |
+ full.filenames = file.path(flyFactorSurveyRootDir, filenames) |
|
22 |
+ |
|
23 |
+ list.BD = createXrefBindingDomain (flyFactorSurveyRootDir) |
|
24 |
+ xref = createXref () # maps flybase ids to uniprot |
|
25 |
+ tbl.ref = createExperimentRefTable () |
|
26 |
+ matrices = readAndParse (full.filenames) |
|
27 |
+ tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD) |
|
28 |
+ # now normalize, not in readAndParse, because we need the counts to go into tbl.md$sequenceCount |
|
29 |
+ matrices = normalizeMatrices (matrices) |
|
30 |
+ matrices = renameMatrices (matrices, tbl.md) |
|
31 |
+ serializedFile <- "flyFactorSurvey.RData" |
|
32 |
+ save (matrices, tbl.md, file=serializedFile) |
|
33 |
+ printf("saved %d matrices to %s", length(matrices), serializedFile) |
|
34 |
+ printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile) |
|
35 |
+ |
|
36 |
+} # run |
|
37 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
38 |
+# rec'd extra xref info from michael brodsky on (18 jul 2012) |
|
39 |
+# protein binding domains are of immediate interest. |
|
40 |
+createXrefBindingDomain = function (flyFactorSurveyRootDir) |
|
41 |
+{ |
|
42 |
+ f1 = file.path(flyFactorSurveyRootDir, bindingDomainXrefSourceFile()) |
|
43 |
+ tbl.raw = read.table (f1, sep='\t', header=TRUE, as.is=TRUE, comment.char='') |
|
44 |
+ list.BD = tbl.raw$DNAbindingDomain_Primary |
|
45 |
+ names (list.BD) = tbl.raw$FlybaseID |
|
46 |
+ invisible (list.BD) |
|
47 |
+ |
|
48 |
+} # createXrefBindingDomain |
|
49 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
50 |
+# a named list, with uniprot id as content, flybase gene names (FBgn) as names |
|
51 |
+createXref = function () |
|
52 |
+{ |
|
53 |
+ tbl.egUNIPROT = toTable (org.Dm.egUNIPROT) |
|
54 |
+ tbl.egFLYBASE = toTable (org.Dm.egFLYBASE) |
|
55 |
+ tbl.xref = merge (tbl.egFLYBASE, tbl.egUNIPROT) |
|
56 |
+ xref = tbl.xref [, 3] |
|
57 |
+ names (xref) = tbl.xref [, 2] |
|
58 |
+ invisible (xref) |
|
59 |
+ |
|
60 |
+} # createXref |
|
61 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
62 |
+getMatrixFilenames = function (flyFactorSurveyRootDir) |
|
63 |
+{ |
|
64 |
+ all.files = list.files (flyFactorSurveyRootDir) |
|
65 |
+ files.to.exclude = c ('go.R', 'RCS', 'rdata') |
|
66 |
+ for (file in files.to.exclude) { |
|
67 |
+ removers = grep (file, all.files, ignore.case=T) |
|
68 |
+ if (length (removers) > 0) |
|
69 |
+ all.files = all.files [-removers] |
|
70 |
+ } # for |
|
71 |
+ |
|
72 |
+ invisible (sort(all.files)) |
|
73 |
+ |
|
74 |
+} # getMatrixFilenames |
|
75 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
76 |
+createExperimentRefTable = function () |
|
77 |
+{ |
|
78 |
+ experiment.names = c ('SANGER', 'SOLEXA', 'NAR', 'Cell', 'FlyReg', 'NBT') |
|
79 |
+ pubmedID = c (NA_character_, NA_character_, '1858360', '18332042', NA_character_, NA_character_) |
|
80 |
+ experimentType = c ('bacterial 1-hybrid, SANGER sequencing', |
|
81 |
+ 'bacterial 1-hybrid, SOLEXA sequencing', |
|
82 |
+ 'bacterial 1-hybrid, SANGER sequencing', |
|
83 |
+ 'bacterial 1-hybrid, SANGER sequencing', |
|
84 |
+ 'bacterial 1-hybrid, SANGER sequencing', |
|
85 |
+ 'bacterial 1-hybrid, SANGER sequencing') |
|
86 |
+ |
|
87 |
+ tbl.ref = data.frame (pmid=pubmedID, experimentType=experimentType, stringsAsFactors=FALSE) |
|
88 |
+ rownames (tbl.ref) = experiment.names |
|
89 |
+ invisible (tbl.ref) |
|
90 |
+ |
|
91 |
+} # createExperimentRefTable |
|
92 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
93 |
+parsePWMfromText = function (lines.of.text) |
|
94 |
+{ |
|
95 |
+ stopifnot (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) == 4) |
|
96 |
+ |
|
97 |
+ |
|
98 |
+ for (line in lines.of.text) { |
|
99 |
+ tokens = strsplit (line, '\\s*[:\\|]') [[1]] |
|
100 |
+ nucleotide = tokens [1] |
|
101 |
+ numbers.raw = tokens [2] |
|
102 |
+ number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]] |
|
103 |
+ while (nchar (number.tokens [1]) == 0) |
|
104 |
+ number.tokens = number.tokens [-1] |
|
105 |
+ numbers = as.numeric (number.tokens) |
|
106 |
+ if (!exists ('pwm.result')) |
|
107 |
+ pwm.result = matrix (nrow=4, ncol=length (numbers), byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:length(numbers))) |
|
108 |
+ pwm.result [nucleotide,] = numbers |
|
109 |
+ } |
|
110 |
+ |
|
111 |
+ pwm.result |
|
112 |
+ |
|
113 |
+} # parsePWMfromText |
|
114 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
115 |
+# we expect exactly one matrix per file |
|
116 |
+readAndParse = function (filenames) |
|
117 |
+{ |
|
118 |
+ matrices = list () |
|
119 |
+ |
|
120 |
+ for (filename in filenames) { |
|
121 |
+ lines.of.text = scan (filename, what=character(0), sep='\n', comment='#', quiet=TRUE) |
|
122 |
+ mtx = parsePWMfromText (lines.of.text) |
|
123 |
+ #mtx.normalized = apply (mtx, 2, function (colvector) colvector / sum (colvector)) |
|
124 |
+ matrices [[basename(filename)]] = mtx |
|
125 |
+ } |
|
126 |
+ |
|
127 |
+ invisible (matrices) |
|
128 |
+ |
|
129 |
+} # readAndParse |
|
130 |
+#----------------------------------------------------------------------------------------------------------------------- |
|
131 |
+createMetadata = function (matrices, tbl.ref, xref, list.BD) |
|
132 |
+{ |
|
133 |
+ options ('stringsAsFactors'=FALSE) |
|
134 |
+ dataSource = 'FlyFactorSurvey' |
|
135 |
+ organism = 'Dmelanogaster' |
|
136 |
+ stopifnot (length (grep ('\\.pfm$', names (matrices))) == length (matrices)) |
|
137 |
+ native.names.raw = gsub ('\\.pfm$', '', names (matrices)) |
|
138 |
+ #geneSymbols = sapply (strsplit (native.names.raw, '_'), function (tokens) return (tokens [1])) |
|
139 |
+ flybase.ids = sapply (strsplit (native.names.raw, '_'), function (tokens) return (tokens [length (tokens)])) |
|
140 |
+ tbl.ids <- fbgnToIDs(flybase.ids) |
|
141 |
+ # we may have duplicate FBgns in the otherwise unique matrix names, |
|
142 |
+ # creating a tbl.ids with fewer rows than we have matrices. |
|
143 |
+ # expand tbl.ids to align with the matrix nmes |
|
144 |
+ tbl.ids <- tbl.ids[flybase.ids,] |
|
145 |
+ stopifnot(nrow(tbl.ids) == length(matrices)) |
|
146 |
+ # next up: call fbgnToIDs(flybase.ids), assign geneSymbols |
|
147 |
+ # and other variables from the returned data.frame |
|
148 |
+ stopifnot (length (grep ('^FBgn', flybase.ids)) == length (flybase.ids)) |
|
149 |
+ stopifnot (all (nchar (flybase.ids) == 11)) |
|
150 |
+ native.names.cooked = gsub ('-', '.', native.names.raw) # so that the dash reliably separates the three parts of full.names |
|
151 |
+ full.names = paste (organism, dataSource, native.names.cooked, sep='-') |
|
152 |
+ |
|
153 |
+ tbl.md = data.frame (providerName=native.names.raw) |
|
154 |
+ rownames (tbl.md) = full.names |
|
155 |
+ tbl.md = cbind (tbl.md, providerId=flybase.ids) |
|
156 |
+ tbl.md = cbind (tbl.md, dataSource = rep (dataSource, nrow (tbl.md))) |
|
157 |
+ |
|
158 |
+ tbl.md = cbind (tbl.md, geneSymbol=tbl.ids$symbol) |
|
159 |
+ tbl.md = cbind (tbl.md, geneId=tbl.ids$gene_id) |
|
160 |
+ #geneIdType <- rep("ENTREZ", nrow(tbl.md)) |
|
161 |
+ # not all FBgn ids can be mapped (by fbgnToIds) to entrez genes; for those, |
|
162 |
+ # the gene_id field is the FBgn id |
|
163 |
+ #geneIdIsFBgn <- grep("^FBgn", tbl.ids$gene_id) |
|
164 |
+ #if(length(geneIdIsFBgn) > 0) |
|
165 |
+ # geneIdType [geneIdIsFBgn] <- "FLYBASE" |
|
166 |
+ tbl.md = cbind (tbl.md, geneIdType=tbl.ids$geneIdType) |
|
167 |
+ |
|
168 |
+ uniprot.ids = as.character (xref [flybase.ids]) |
|
169 |
+ tbl.md = cbind (tbl.md, proteinId=uniprot.ids) |
|
170 |
+ protein.id.types = rep ('UNIPROT', nrow (tbl.md)) |
|
171 |
+ NA.protein.indices = which (is.na (tbl.md$proteinId)) |
|
172 |
+ if (length (NA.protein.indices) > 0) |
|
173 |
+ protein.id.types [NA.protein.indices] = NA |
|
174 |
+ tbl.md = cbind (tbl.md, proteinIdType=protein.id.types) |
|
175 |
+ #tbl.md = cbind (tbl.md, proteinIdType=rep ('UNIPROT', nrow (tbl.md))) |
|
176 |
+ |
|
177 |
+ tbl.md = cbind (tbl.md, organism = rep (organism, nrow (tbl.md))) |
|
178 |
+ |
|
179 |
+ counts = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx)))))) |
|
180 |
+ tbl.md = cbind (tbl.md, sequenceCount=counts) |
|
181 |
+ |
|
182 |
+ tbl.md = cbind (tbl.md, bindingSequence=rep (NA_character_, nrow (tbl.md))) |
|
183 |
+ bindingDomains = rep (NA, nrow (tbl.md)) |
|
184 |
+ indices = match (tbl.md$providerId, names (list.BD)) |
|
185 |
+ bindingDomain = as.character (list.BD)[indices] |
|
186 |
+ bindingDomain [bindingDomain == 'NA'] = NA |
|
187 |
+ bindingDomain [bindingDomain == ''] = NA |
|
188 |
+ tbl.md = cbind (tbl.md, bindingDomain) |
|
189 |
+ |
|
190 |
+ experiment.types = rep (NA_character_, nrow (tbl.md)) |
|
191 |
+ from.Cell = grep ('_Cell_', native.names.raw) |
|
192 |
+ from.NAR = grep ('_NAR_', native.names.raw) |
|
193 |
+ from.SOLEXA = grep ('_SOLEXA_', native.names.raw) |
|
194 |
+ from.SANGER = grep ('_SANGER_', native.names.raw) |
|
195 |
+ from.FlyReg = grep ('_FlyReg_', native.names.raw) |
|
196 |
+ |
|
197 |
+ experiment.types [from.SOLEXA] = 'bacterial 1-hybrid, SOLEXA sequencing' |
|
198 |
+ experiment.types [from.SANGER] = 'bacterial 1-hybrid, SANGER sequencing' |
|
199 |
+ experiment.types [from.Cell] = 'bacterial 1-hybrid, SANGER sequencing' |
|
200 |
+ experiment.types [from.NAR] = 'bacterial 1-hybrid, SANGER sequencing' |
|
201 |
+ experiment.types [from.FlyReg] = 'DNASE I footprinting' |
|
202 |
+ |
|
203 |
+ tbl.md = cbind (tbl.md, tfFamily=rep(NA_character_, nrow (tbl.md))) |
|
204 |
+ |
|
205 |
+ tbl.md = cbind (tbl.md, experimentType=experiment.types) |
|
206 |
+ |
|
207 |
+ pubmedIDs = rep (NA_character_, nrow (tbl.md)) |
|
208 |
+ pubmedIDs [from.Cell] = '18332042' |
|
209 |
+ pubmedIDs [from.NAR] = '1858360' |
|
210 |
+ |
|
211 |
+ tbl.md = cbind (tbl.md, pubmedID=pubmedIDs) |
|
212 |
+ |
|
213 |
+ |
|
214 |
+ invisible (tbl.md) |
|
215 |
+ |
|
216 |
+} # createMetadata |
|
217 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
218 |
+normalizeMatrices = function (matrices) |
|
219 |
+{ |
|
220 |
+ mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) |
|
221 |
+ invisible (mtx.normalized) |
|
222 |
+ |
|
223 |
+} # normalizeMatrices |
|
224 |
+#------------------------------------------------------------------------------------------------------------------------ |
|
225 |
+renameMatrices = function (matrices, tbl.md) |
|
226 |
+{ |
|
227 |
+ stopifnot (length (matrices) == nrow (tbl.md)) |
|
228 |
+ names (matrices) = rownames (tbl.md) |
|
229 |
+ invisible (matrices) |
|
230 |
+ |
|
231 |
+} # renameMatrices |
|
232 |
+#------------------------------------------------------------------------------- |
|
233 |
+fbgnToIDs <- function(fbgns, useInputForMissingValues=TRUE) |
|
234 |
+{ |
|
235 |
+ fbgns <- unique(fbgns) |
|
236 |
+ if(!exists("tbl.names")) { |
|
237 |
+ tbl.sym <- toTable(org.Dm.egSYMBOL) |
|
238 |
+ tbl.fbgn <- toTable(org.Dm.egFLYBASE) |
|
239 |
+ tbl.names <<- merge(tbl.fbgn, tbl.sym)[, c("flybase_id", "symbol", "gene_id")] |
|
240 |
+ flyMart <<- useMart(biomart="ensembl", dataset="dmelanogaster_gene_ensembl") |
|
241 |
+ } |
|
242 |
+ tbl.bm <- getBM(attributes=c("flybase_gene_id", "entrezgene"), filters=c("flybase_gene_id"), |
|
243 |
+ values=fbgns, mart=flyMart) |
|
244 |
+ fbgns.notfound <- setdiff(fbgns, tbl.bm$flybase_gene_id) |
|
245 |
+ if(length(fbgns.notfound) > 0){ |
|
246 |
+ new.rows <- data.frame(flybase_gene_id=fbgns.notfound, |
|
247 |
+ entrezgene=NA, |
|
248 |
+ stringsAsFactors=FALSE) |
|
249 |
+ tbl.bm <- rbind(tbl.bm, new.rows) |
|
250 |
+ } |
|
251 |
+ |
|
252 |
+ tbl.bm <- unique(tbl.bm) |
|
253 |
+ |
|
254 |
+ # some fbgns could be mapped to multiple entrez genes. remove all but the |
|
255 |
+ # one which occurs first |
|
256 |
+ |
|
257 |
+ duplicates <- which(duplicated(tbl.bm$flybase_gene_id)) |
|
258 |
+ if(length(duplicates) > 0) |
|
259 |
+ tbl.bm <- tbl.bm[-duplicates,] |
|
260 |
+ |
|
261 |
+ stopifnot(sort(fbgns) == sort(tbl.bm$flybase_gene_id)) |
|
262 |
+ |
|
263 |
+ # order the biomart results so that it matches the order of the incoming fgbns |
|
264 |
+ tbl.bm <- tbl.bm[match(fbgns, tbl.bm$flybase_gene_id),] |
|
265 |
+ hits <- match(tbl.bm$entrezgene, tbl.names$gene_id) |
|
266 |
+ result <- tbl.names[hits,] |
|
267 |
+ rownames(result) <- fbgns |
|
268 |
+ |
|
269 |
+ # not all FBgn ids have an associated entrez geneID. |
|
270 |
+ # use these orphan FBgn ids as their own "geneID" |
|
271 |
+ |
|
272 |
+ failed.lookups <- which(is.na(result$flybase_id)) |
|
273 |
+ failed.count <- length(failed.lookups) |
|
274 |
+ if(useInputForMissingValues & failed.count > 0) { |
|
275 |
+ failed.names <- rownames(result)[failed.lookups] |
|
276 |
+ names.matrix <- matrix(rep(failed.names, failed.count), nrow=failed.count) |
|
277 |
+ result[failed.lookups,] <- names.matrix |
|
278 |
+ } |
|
279 |
+ |
|
280 |
+ geneIdType <- rep("ENTREZ", nrow(result)) |
|
281 |
+ geneIdType [failed.lookups] <- "FLYBASE" |
|
282 |
+ result <- cbind(result, geneIdType, stringsAsFactors=FALSE) |
|
283 |
+ result |
|
284 |
+ |
|
285 |
+ |
|
286 |
+} # fbgnToIDs |
|
287 |
+#------------------------------------------------------------------------------------------------------------------------ |
0 | 288 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,300 @@ |
1 |
+# flyFactorSurvey/test.R |
|
2 |
+#------------------------------------------------------------------------------- |
|
3 |
+library (RUnit) |
|
4 |
+#------------------------------------------------------------------------------- |
|
5 |
+source("import.R") |
|
6 |
+#------------------------------------------------------------------------------- |
|
7 |
+run.tests = function (flyFactorSurveyRootDir="~/s/data/public/TFBS/flyFactorSurvey/unpacked") |
|
8 |
+{ |
|
9 |
+ freshStart () |
|
10 |
+ test.fbgnToIDs() |
|
11 |
+ x.list.BD <- test.createXrefBindingDomain (flyFactorSurveyRootDir) |
|
12 |
+ x.xref <- test.createXref () |
|
13 |
+ x.filenames <- test.getMatrixFilenames (flyFactorSurveyRootDir) |
|
14 |
+ x.tbl.ref <- test.createExperimentRefTable () |
|
15 |
+ x.pwm <- test.parsePWMfromText (flyFactorSurveyRootDir) |
|
16 |
+ x.mat3 <- test.readAndParse (flyFactorSurveyRootDir) |
|
17 |
+ x.tbl.md <- test.createMetadata (x.mat3, x.tbl.ref, x.xref, x.list.BD) |
|
18 |
+ x.mat3f <- test.normalizeMatrices (x.mat3) |
|
19 |
+ x.mat3fr <- test.renameMatrices (x.mat3f, x.tbl.md [1:3,]) |
|
20 |
+ |
|
21 |
+} # run.tests |
|
22 |
+#------------------------------------------------------------------------------- |
|
23 |
+freshStart = function () |
|
24 |
+{ |
|
25 |
+ output.files.easy.to.regenerate = grep ('RData$', dir (), value=TRUE) |
|
26 |
+ if (length (output.files.easy.to.regenerate) > 0) |
|
27 |
+ unlink (output.files.easy.to.regenerate) |
|
28 |
+ |
|
29 |
+} # freshStart |
|
30 |
+#------------------------------------------------------------------------------- |
|
31 |
+test.createXrefBindingDomain = function (flyFactorSurveyRootDir) |
|
32 |
+{ |
|
33 |
+ print ('--- test.createXrefBindingDomain') |
|
34 |
+ list.BD = createXrefBindingDomain (flyFactorSurveyRootDir) |
|
35 |
+ checkEquals (length (list.BD), 777) |
|
36 |
+ checkTrue (all (grepl ('FBgn', names (list.BD)))) |
|
37 |
+ # most abundant domain is zf-C2H2 |
|
38 |
+ checkEquals(names (sort (table (as.character (list.BD)), decreasing=TRUE)[1]), |
|
39 |
+ "zf-C2H2") |
|
40 |
+ invisible(list.BD) |
|
41 |
+ |
|
42 |
+} # test.createXrefBindingDomain |
|
43 |
+#------------------------------------------------------------------------------- |
|
44 |
+test.createXref = function () |
|
45 |
+{ |
|
46 |
+ print ('--- test.createXref') |
|
47 |
+ xref = createXref () |
|
48 |
+ checkEquals (xref [['FBgn0261819']], 'F0J881') |
|
49 |
+ #checkEquals (xref [['FBgn0259750']], 'E1JHF4') # gone with org.Dm.eg.db_2.9.0 |
|
50 |
+ checkEquals (xref [['FBgn0000014']], 'E1JIM9') |
|
51 |
+ invisible (xref) |
|
52 |
+ |
|
53 |
+} # test.createXref |
|
54 |
+#------------------------------------------------------------------------------- |
|
55 |
+test.getMatrixFilenames = function (flyFactorSurveyRootDir) |
|
56 |
+{ |
|
57 |
+ print ('--- test.getMatrixFilenames') |
|
58 |
+ |
|
59 |
+ filenames = getMatrixFilenames (flyFactorSurveyRootDir) |
|
60 |
+ # as of (03 apr 2013): 614 matrix files, one binding domains extra |
|
61 |
+ # file, TFfile2b.tsv |
|
62 |
+ |
|
63 |
+ checkEquals (length (filenames), 615) |
|
64 |
+ checkEquals(filenames[1:3], |
|
65 |
+ c("Abd-A_FlyReg_FBgn0000014.pfm", "Abd-B_FlyReg_FBgn0000015.pfm", |
|
66 |
+ "AbdA_Cell_FBgn0000014.pfm")) |
|
67 |
+ invisible (filenames) |
|
68 |
+ |
|
69 |
+} # test.getMatrixFilenames |
|
70 |
+#------------------------------------------------------------------------------- |
|
71 |
+test.createExperimentRefTable = function () |
|
72 |
+{ |
|
73 |
+ print ('--- test.createExperimentRefTable') |
|
74 |
+ tbl.ref = createExperimentRefTable () |
|
75 |
+ checkEquals (nrow (tbl.ref), 6) |
|
76 |
+ |
|
77 |
+ invisible (tbl.ref) |
|
78 |
+ |
|
79 |
+} # test.createExperimentRefTable |
|
80 |
+#------------------------------------------------------------------------------- |
|
81 |
+test.parsePWMfromText = function (flyFactorSurveyRootDir) |
|
82 |
+{ |
|
83 |
+ print ('--- test.parsePWMfromText') |
|
84 |
+ path <- file.path(flyFactorSurveyRootDir, 'Tup_SOLEXA_10_FBgn0003896.pfm') |
|
85 |
+ |
|
86 |
+ lines.of.text = scan (path, sep='\n', |
|
87 |
+ what=character(0), comment='#', quiet=TRUE) |
|
88 |
+ pwm.tup = parsePWMfromText (lines.of.text) |
|
89 |
+ checkEquals (dim (pwm.tup), c (4, 9)) |
|
90 |
+ checkEquals (colnames (pwm.tup), as.character (1:9)) |
|
91 |
+ checkEquals (rownames (pwm.tup), c ('A', 'C', 'G', 'T')) |
|
92 |
+ |
|
93 |
+ invisible (pwm.tup) |
|
94 |
+ |
|
95 |
+} # test.parsePWMfromText |
|
96 |
+#------------------------------------------------------------------------------- |
|
97 |
+test.readAndParse = function (flyFactorSurveyRootDir) |
|
98 |
+{ |
|
99 |
+ print ('--- test.readAndParse') |
|
100 |
+ |
|
101 |
+ all.files = file.path(flyFactorSurveyRootDir, |
|
102 |
+ getMatrixFilenames (flyFactorSurveyRootDir)) |
|
103 |
+ |
|
104 |
+ sample.files = all.files [1:3] # grep ('FBgn0003896', all.files) |
|
105 |
+ checkEquals (length (sample.files), 3) |
|
106 |
+ checkTrue(all(file.exists(sample.files))) |
|
107 |
+ |
|
108 |
+ # by sorting these filenames, we know the order of the three returned |
|
109 |
+ # matrices, and results can easily be checked |
|
110 |
+ |
|
111 |
+ mtx.test = readAndParse (sample.files) |
|
112 |
+ |
|
113 |
+ checkEquals (length (mtx.test), 3) |
|
114 |
+ expected.names = c("Abd-A_FlyReg_FBgn0000014.pfm", |
|
115 |
+ "Abd-B_FlyReg_FBgn0000015.pfm", |
|
116 |
+ "AbdA_Cell_FBgn0000014.pfm") |
|
117 |
+ actual.names = sort(names(mtx.test)) |
|
118 |
+ checkEquals(expected.names, actual.names) |
|
119 |
+ |
|
120 |
+ checkEquals (dim (mtx.test [[1]]), c (4, 8)) |
|
121 |
+ checkEquals (dim (mtx.test [[2]]), c (4, 8)) |
|
122 |
+ checkEquals (dim (mtx.test [[3]]), c (4, 7)) |
|
123 |
+ |
|
124 |
+ invisible (mtx.test) |
|
125 |
+ |
|
126 |
+} # test.readAndParse |
|
127 |
+#------------------------------------------------------------------------------- |
|
128 |
+test.createMetadata = function (matrices, tbl.ref, xref, list.BD) |
|
129 |
+{ |
|
130 |
+ print ('--- test.createMetadata') |
|
131 |
+ |
|
132 |
+ tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD) |
|
133 |
+ checkEquals (nrow (tbl.md), length (matrices)) |
|
134 |
+ checkEquals (ncol (tbl.md), 15) |
|
135 |
+ expected <- c("providerName", "providerId", "dataSource", "geneSymbol", |
|
136 |
+ "geneId", "geneIdType", "proteinId", "proteinIdType", |
|
137 |
+ "organism", "sequenceCount", "bindingSequence", |
|
138 |
+ "bindingDomain", "tfFamily", "experimentType", "pubmedID") |
|
139 |
+ |
|
140 |
+ checkEquals (colnames (tbl.md), expected) |
|
141 |
+ tester = "Dmelanogaster-FlyFactorSurvey-Abd.A_FlyReg_FBgn0000014" |
|
142 |
+ checkTrue (tester %in% rownames (tbl.md)) |
|
143 |
+ x = as.list (tbl.md [tester,]) |
|
144 |
+ checkEquals (x$providerName, "Abd-A_FlyReg_FBgn0000014") |
|
145 |
+ checkEquals (x$providerId, "FBgn0000014") |
|
146 |
+ checkEquals (x$geneSymbol, "abd-A") |
|
147 |
+ checkEquals (x$geneId, "42037") |
|
148 |
+ checkEquals (x$geneIdType, "ENTREZ") |
|
149 |
+ checkEquals (x$proteinId, "E1JIM9") |
|
150 |
+ checkEquals (x$proteinIdType, "UNIPROT") |
|
151 |
+ checkEquals (x$organism, "Dmelanogaster") |
|
152 |
+ checkEquals (x$sequenceCount, 37) |
|
153 |
+ checkTrue (is.na (x$bindingSequence)) |
|
154 |
+ checkEquals (x$bindingDomain, 'Homeobox') |
|
155 |
+ checkTrue (is.na (x$tfFamily)) |
|
156 |
+ checkEquals (x$experimentType, "DNASE I footprinting") |
|
157 |
+ checkTrue (is.na (x$pubmedID)) |
|
158 |
+ |
|
159 |
+ invisible (tbl.md) |
|
160 |
+ |
|
161 |
+} # test.createMetadata |
|
162 |
+#------------------------------------------------------------------------------- |
|
163 |
+test.renameMatrices = function (matrices, tbl.md) |
|
164 |
+{ |
|
165 |
+ print ('--- test.renameMatrices') |
|
166 |
+ stopifnot (length (matrices) == 3) |
|
167 |
+ stopifnot (nrow (tbl.md) == 3) |
|
168 |
+ |
|
169 |
+ old.names = names (matrices) |
|
170 |
+ mtxr = renameMatrices (matrices, tbl.md) |
|
171 |
+ |
|
172 |
+ # make sure the dash in the incoming, eg, |
|
173 |
+ # "Abd-A_FlyReg_FBgn0000014.pfm" |
|
174 |
+ # is converted to a '.', reserving dashes for |
|
175 |
+ # the separation of dataSource-species-nativeGeneName |
|
176 |
+ |
|
177 |
+ |
|
178 |
+ checkEquals (names (mtxr) [1], |
|
179 |
+ "Dmelanogaster-FlyFactorSurvey-Abd.A_FlyReg_FBgn0000014") |
|
180 |
+ |
|
181 |
+ checkEquals (names (mtxr) [2], |
|
182 |
+ "Dmelanogaster-FlyFactorSurvey-Abd.B_FlyReg_FBgn0000015") |
|
183 |
+ |
|
184 |
+ checkEquals (names (mtxr) [3], |
|
185 |
+ "Dmelanogaster-FlyFactorSurvey-AbdA_Cell_FBgn0000014") |
|
186 |
+ |
|
187 |
+ invisible (mtxr) |
|
188 |
+ |
|
189 |
+} # renameMatrices |
|
190 |
+#------------------------------------------------------------------------------- |
|
191 |
+test.fbgnToIDs <- function() |
|
192 |
+{ |
|
193 |
+ print("--- test.fbgnToIDs") |
|
194 |
+ # first, make sure that unmappable FBgn ids are handled properly |
|
195 |
+ bogus <- fbgnToIDs("bogus") |
|
196 |
+ checkEquals(dim(bogus), c(1,4)) |
|
197 |
+ checkEquals(rownames(bogus), "bogus") |
|
198 |
+ checkEquals(as.character(as.list(bogus[1,],)), c(rep("bogus", 3), "FLYBASE")) |
|
199 |
+ |
|
200 |
+ # disable the translation of NA to rowname |
|
201 |
+ bogus <- fbgnToIDs("bogus", useInputForMissingValues=FALSE) |
|
202 |
+ checkEquals(dim(bogus), c(1,4)) |
|
203 |
+ checkEquals(rownames(bogus), "bogus") |
|
204 |
+ checkTrue(all(is.na(bogus[1,1:3]))) |
|
205 |
+ |
|
206 |
+ fbgns <- c ("FBgn0003267", "FBgn0000611", "FBgn0004394", "FBgn0027364", |
|
207 |
+ "FBgn0000413") |
|
208 |
+ tbl.ids <- fbgnToIDs(fbgns) |
|
209 |
+ checkEquals(tbl.ids$flybase_id, fbgns) |
|
210 |
+ checkEquals(tbl.ids$symbol, c("ro", "exd", "pdm2", "Six4", "da")) |
|
211 |
+ checkEquals(tbl.ids$geneIdType, rep("ENTREZ", 5)) |
|
212 |
+ |
|
213 |
+ # 0259750 is obsolete, replace by 0264442 |
|
214 |
+ tbl.ids <- fbgnToIDs("FBgn0259750") |
|
215 |
+ checkEquals(as.list(tbl.ids), |
|
216 |
+ list(flybase_id="FBgn0264442", symbol="ab", gene_id="34560", |
|
217 |
+ geneIdType="ENTREZ")) |
|
218 |
+ |
|
219 |
+ # http://flybase.org/static_pages/downloads/IDConv.html |
|
220 |
+ # FBgn0003986 FBgn0261930 FBgn0261930 vnd |
|
221 |
+ tbl.ids <- fbgnToIDs("FBgn0003986") |
|
222 |
+ checkEquals(as.character(as.list(tbl.ids[1,],)), |
|
223 |
+ c(rep("FBgn0003986", 3), "FLYBASE")) |
|
224 |
+ |
|
225 |
+ load("../../../extdata/fbgns.RData") |
|
226 |
+ checkEquals(length(fbgns), 326) |
|
227 |
+ tbl.ids <- fbgnToIDs(fbgns) |
|
228 |
+ checkEquals(dim(tbl.ids), c(length(fbgns), 4)) |
|
229 |
+ |
|
230 |
+ # no NA's should survive |
|
231 |
+ checkTrue(!any(is.na(tbl.ids))) |
|
232 |
+ |
|
233 |
+ # what percentage of the 326 ids fail to map? do NOT convert NAs |
|
234 |
+ tbl.ids <- fbgnToIDs(fbgns, useInputForMissingValues=FALSE) |
|
235 |
+ failure.count <- length(which(is.na(tbl.ids$flybase_id))) |
|
236 |
+ checkEquals(failure.count, 18) |
|
237 |
+ |
|
238 |
+} # test.fbgnToIDsp |
|
239 |
+#------------------------------------------------------------------------------- |
|
240 |
+# robert stojnic reported that MotifDb 1.1.6 (and before) had some incorrect |
|
241 |
+# gene symbols, as seen here: |
|
242 |
+# library(MotifDb) |
|
243 |
+# mdb <- MotifDb |
|
244 |
+# x <- query(mdb, "ab_SANGER_10_FBgn0259750") |
|
245 |
+# t(as.matrix(mcols(x))) |
|
246 |
+# [,1] |
|
247 |
+# providerName "ab_SANGER_10_FBgn0259750" |
|
248 |
+# providerId "FBgn0259750" |
|
249 |
+# dataSource "FlyFactorSurvey" |
|
250 |
+# geneSymbol "Ab" |
|
251 |
+# geneId "FBgn0259750" |
|
252 |
+# geneIdType "FLYBASE" |
|
253 |
+# proteinId "E1JHF4" |
|
254 |
+# proteinIdType "UNIPROT" |
|
255 |
+# organism "Dmelanogaster" |
|
256 |
+# sequenceCount "20" |
|
257 |
+# bindingSequence NA |
|
258 |
+# bindingDomain "zf-C2H2" |
|
259 |
+# tfFamily NA |
|
260 |
+# experimentType "bacterial 1-hybrid, SANGER sequencing" |
|
261 |
+# pubmedID NA |
|
262 |
+# |
|
263 |
+# FBgn0259750 is obsolete. flybase provides a converter: |
|
264 |
+# http://flybase.org/static_pages/downloads/IDConv.html shows this, and provide the current FBgn: |
|
265 |
+# |
|
266 |
+# Submitted ID Current ID Converted ID Related record |
|
267 |
+# FBgn0259750 FBgn0264442 FBgn0264442 ab |
|
268 |
+# |
|
269 |
+# make sure we get this right |
|
270 |
+# the matrix files are: |
|
271 |
+# |
|
272 |
+# flyFactorSurveyRootDir/ab_SANGER_10_FBgn0259750.pfm |
|
273 |
+# flyFactorSurveyRootDir/ab_SOLEXA_5_FBgn0259750.pfm |
|
274 |
+# |
|
275 |
+test.geneNaming <- function(flyFactorSurveyRootDir) |
|
276 |
+{ |
|
277 |
+ |
|
278 |
+ flyFactorSurveyRootDir <- "/Users/pshannon/s/data/public/TFBS/flyFactorSurvey/unpacked" |
|
279 |
+ filenames = getMatrixFilenames (flyFactorSurveyRootDir) |
|
280 |
+ |
|
281 |
+ removers <- grep (bindingDomainXrefSourceFile(), filenames) |
|
282 |
+ if(length(removers) > 0) |
|
283 |
+ filenames <- filenames[-removers] |
|
284 |
+ |
|
285 |
+ keepers <- grep("FBgn0259750", filenames) |
|
286 |
+ checkTrue(length(keepers) == 2) |
|
287 |
+ filenames <- filenames[keepers] |
|
288 |
+ full.filenames = file.path(flyFactorSurveyRootDir, filenames) |
|
289 |
+ list.BD = createXrefBindingDomain (flyFactorSurveyRootDir) |
|
290 |
+ xref = createXref () # maps flybase ids to uniprot |
|
291 |
+ tbl.ref = createExperimentRefTable () |
|
292 |
+ matrices = readAndParse (full.filenames) |
|
293 |
+ tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD) |
|
294 |
+ # now normalize, not in readAndParse, because we need the counts to go into tbl.md$sequenceCount |
|
295 |
+ checkEquals(tbl.md$geneSymbol, c("ab", "ab")) |
|
296 |
+ matrices = normalizeMatrices (matrices) |
|
297 |
+ matrices = renameMatrices (matrices, tbl.md) |
|
298 |
+ |
|
299 |
+} # test.geneNaming |
|
300 |
+#------------------------------------------------------------------------------- |