Browse code

added import/flyFactorSurvey directory, with import and test scripts

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/MotifDb@75293 bc3139a8-67e5-0310-9ffc-ced21a209358

p.shannon authored on 04/04/2013 18:32:19
Showing 3 changed files

1 1
new file mode 100644
2 2
Binary files /dev/null and b/inst/scripts/import/flyFactorSurvey/flyFactorSurvey.RData differ
3 3
new file mode 100644
... ...
@@ -0,0 +1,287 @@
1
+# flyFactorSurvey/import.R
2
+#------------------------------------------------------------------------------------------------------------------------
3
+library(org.Dm.eg.db)
4
+library(biomaRt)
5
+#------------------------------------------------------------------------------------------------------------------------
6
+bindingDomainXrefSourceFile <- function() {"TFfile2b.tsv"}
7
+printf <- function(...) print(noquote(sprintf(...)))
8
+#------------------------------------------------------------------------------------------------------------------------
9
+run = function (flyFactorSurveyRootDir)
10
+{
11
+  filenames = getMatrixFilenames (flyFactorSurveyRootDir)
12
+
13
+     # the rootDir has only matrix files, with one exception:
14
+     # a file which describes the protein binding domains
15
+     # remove that from the matrix file list
16
+  
17
+  removers <- grep (bindingDomainXrefSourceFile(), filenames)
18
+  if(length(removers) > 0)
19
+      filenames <- filenames[-removers]
20
+  
21
+  full.filenames = file.path(flyFactorSurveyRootDir, filenames)
22
+
23
+  list.BD = createXrefBindingDomain (flyFactorSurveyRootDir)
24
+  xref = createXref ()  # maps flybase ids to uniprot
25
+  tbl.ref = createExperimentRefTable ()
26
+  matrices = readAndParse (full.filenames)
27
+  tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD)
28
+     # now normalize, not in readAndParse, because we need the counts to go into tbl.md$sequenceCount
29
+  matrices = normalizeMatrices (matrices)
30
+  matrices = renameMatrices (matrices, tbl.md)
31
+  serializedFile <- "flyFactorSurvey.RData"
32
+  save (matrices, tbl.md, file=serializedFile)
33
+  printf("saved %d matrices to %s", length(matrices), serializedFile)
34
+  printf("copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
35
+
36
+} # run
37
+#------------------------------------------------------------------------------------------------------------------------
38
+# rec'd extra xref info from michael brodsky on (18 jul 2012)
39
+# protein binding domains are of immediate interest.
40
+createXrefBindingDomain = function (flyFactorSurveyRootDir)
41
+{
42
+  f1 = file.path(flyFactorSurveyRootDir, bindingDomainXrefSourceFile())
43
+  tbl.raw = read.table (f1, sep='\t', header=TRUE, as.is=TRUE, comment.char='')
44
+  list.BD = tbl.raw$DNAbindingDomain_Primary
45
+  names (list.BD) = tbl.raw$FlybaseID
46
+  invisible (list.BD)
47
+
48
+} # createXrefBindingDomain
49
+#------------------------------------------------------------------------------------------------------------------------
50
+# a named list, with uniprot id as content, flybase gene names (FBgn) as names
51
+createXref = function ()
52
+{
53
+  tbl.egUNIPROT = toTable (org.Dm.egUNIPROT)
54
+  tbl.egFLYBASE = toTable (org.Dm.egFLYBASE)
55
+  tbl.xref = merge (tbl.egFLYBASE, tbl.egUNIPROT)
56
+  xref = tbl.xref [, 3]
57
+  names (xref) = tbl.xref [, 2]
58
+  invisible (xref)
59
+
60
+} # createXref
61
+#------------------------------------------------------------------------------------------------------------------------
62
+getMatrixFilenames = function (flyFactorSurveyRootDir)
63
+{
64
+  all.files = list.files (flyFactorSurveyRootDir)
65
+  files.to.exclude = c ('go.R', 'RCS', 'rdata')
66
+  for (file in files.to.exclude) {
67
+    removers = grep (file, all.files, ignore.case=T)
68
+    if (length (removers) > 0)
69
+      all.files = all.files [-removers]
70
+    } # for
71
+
72
+  invisible (sort(all.files))
73
+
74
+} # getMatrixFilenames
75
+#-----------------------------------------------------------------------------------------------------------------------
76
+createExperimentRefTable = function ()
77
+{
78
+  experiment.names = c ('SANGER', 'SOLEXA', 'NAR', 'Cell', 'FlyReg', 'NBT')
79
+  pubmedID = c (NA_character_, NA_character_, '1858360', '18332042', NA_character_, NA_character_)
80
+  experimentType = c ('bacterial 1-hybrid, SANGER sequencing',
81
+                      'bacterial 1-hybrid, SOLEXA sequencing',
82
+                      'bacterial 1-hybrid, SANGER sequencing',
83
+                      'bacterial 1-hybrid, SANGER sequencing',
84
+                      'bacterial 1-hybrid, SANGER sequencing',
85
+                      'bacterial 1-hybrid, SANGER sequencing')
86
+
87
+  tbl.ref = data.frame (pmid=pubmedID, experimentType=experimentType, stringsAsFactors=FALSE)
88
+  rownames (tbl.ref) = experiment.names
89
+  invisible (tbl.ref)
90
+
91
+} # createExperimentRefTable
92
+#------------------------------------------------------------------------------------------------------------------------
93
+parsePWMfromText = function (lines.of.text)
94
+{
95
+  stopifnot (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) == 4)
96
+
97
+
98
+  for (line in lines.of.text) {
99
+    tokens = strsplit (line, '\\s*[:\\|]') [[1]]
100
+    nucleotide = tokens [1]
101
+    numbers.raw = tokens [2]
102
+    number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]]
103
+    while (nchar (number.tokens [1]) == 0)
104
+      number.tokens = number.tokens [-1]
105
+    numbers = as.numeric (number.tokens)
106
+    if (!exists ('pwm.result'))
107
+      pwm.result = matrix (nrow=4, ncol=length (numbers), byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:length(numbers)))
108
+    pwm.result [nucleotide,] = numbers
109
+    }
110
+
111
+  pwm.result
112
+
113
+} # parsePWMfromText
114
+#------------------------------------------------------------------------------------------------------------------------
115
+# we expect exactly one matrix per file
116
+readAndParse = function (filenames)
117
+{
118
+  matrices = list ()
119
+
120
+  for (filename in filenames) {
121
+    lines.of.text = scan (filename, what=character(0), sep='\n', comment='#', quiet=TRUE)
122
+    mtx = parsePWMfromText (lines.of.text)
123
+    #mtx.normalized = apply (mtx, 2, function (colvector) colvector / sum (colvector))
124
+    matrices [[basename(filename)]] = mtx
125
+    }
126
+ 
127
+   invisible (matrices)
128
+
129
+} # readAndParse
130
+#-----------------------------------------------------------------------------------------------------------------------
131
+createMetadata = function (matrices, tbl.ref, xref, list.BD)
132
+{
133
+  options ('stringsAsFactors'=FALSE)
134
+  dataSource = 'FlyFactorSurvey'
135
+  organism = 'Dmelanogaster'
136
+  stopifnot (length (grep ('\\.pfm$', names (matrices))) == length (matrices))
137
+  native.names.raw = gsub ('\\.pfm$', '', names (matrices))
138
+  #geneSymbols = sapply (strsplit (native.names.raw, '_'), function (tokens) return (tokens [1]))
139
+  flybase.ids = sapply (strsplit (native.names.raw, '_'), function (tokens) return (tokens [length (tokens)]))
140
+  tbl.ids <- fbgnToIDs(flybase.ids)
141
+     # we may have duplicate FBgns in the otherwise unique matrix names,
142
+     # creating a tbl.ids with fewer rows than we have matrices.
143
+     # expand tbl.ids to align with the matrix nmes
144
+  tbl.ids <- tbl.ids[flybase.ids,]
145
+  stopifnot(nrow(tbl.ids) == length(matrices))
146
+     #  next up: call fbgnToIDs(flybase.ids), assign geneSymbols
147
+     #  and other variables from the returned data.frame
148
+  stopifnot (length (grep ('^FBgn', flybase.ids)) ==  length (flybase.ids))
149
+  stopifnot (all (nchar (flybase.ids) == 11))
150
+  native.names.cooked = gsub ('-', '.', native.names.raw)   # so that the dash reliably separates the three parts of full.names
151
+  full.names = paste (organism, dataSource, native.names.cooked, sep='-')
152
+
153
+  tbl.md = data.frame (providerName=native.names.raw)
154
+  rownames (tbl.md) = full.names
155
+  tbl.md = cbind (tbl.md, providerId=flybase.ids)
156
+  tbl.md = cbind (tbl.md, dataSource = rep (dataSource, nrow (tbl.md)))
157
+
158
+  tbl.md = cbind (tbl.md, geneSymbol=tbl.ids$symbol)
159
+  tbl.md = cbind (tbl.md, geneId=tbl.ids$gene_id)
160
+  #geneIdType <- rep("ENTREZ", nrow(tbl.md))
161
+     # not all FBgn ids can be mapped (by fbgnToIds) to entrez genes; for those,
162
+     # the gene_id field is the FBgn id
163
+  #geneIdIsFBgn <- grep("^FBgn", tbl.ids$gene_id)
164
+  #if(length(geneIdIsFBgn) > 0)
165
+  #    geneIdType [geneIdIsFBgn] <- "FLYBASE"
166
+  tbl.md = cbind (tbl.md, geneIdType=tbl.ids$geneIdType)
167
+
168
+  uniprot.ids = as.character (xref [flybase.ids])
169
+  tbl.md = cbind (tbl.md, proteinId=uniprot.ids)
170
+  protein.id.types = rep ('UNIPROT', nrow (tbl.md))
171
+  NA.protein.indices = which (is.na (tbl.md$proteinId))
172
+  if (length (NA.protein.indices) > 0)
173
+    protein.id.types [NA.protein.indices] = NA
174
+  tbl.md = cbind (tbl.md, proteinIdType=protein.id.types)
175
+  #tbl.md = cbind (tbl.md, proteinIdType=rep ('UNIPROT', nrow (tbl.md)))
176
+
177
+  tbl.md = cbind (tbl.md, organism = rep (organism, nrow (tbl.md)))
178
+
179
+  counts = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
180
+  tbl.md = cbind (tbl.md, sequenceCount=counts)
181
+
182
+  tbl.md = cbind (tbl.md, bindingSequence=rep (NA_character_, nrow (tbl.md)))
183
+  bindingDomains = rep (NA, nrow (tbl.md))
184
+  indices = match (tbl.md$providerId, names (list.BD))
185
+  bindingDomain = as.character (list.BD)[indices]
186
+  bindingDomain [bindingDomain == 'NA'] = NA
187
+  bindingDomain [bindingDomain == ''] = NA
188
+  tbl.md = cbind (tbl.md, bindingDomain)
189
+
190
+  experiment.types = rep (NA_character_, nrow (tbl.md))
191
+  from.Cell = grep ('_Cell_', native.names.raw)
192
+  from.NAR  = grep ('_NAR_', native.names.raw)
193
+  from.SOLEXA = grep ('_SOLEXA_', native.names.raw)
194
+  from.SANGER = grep ('_SANGER_', native.names.raw)
195
+  from.FlyReg = grep ('_FlyReg_', native.names.raw) 
196
+
197
+  experiment.types [from.SOLEXA] = 'bacterial 1-hybrid, SOLEXA sequencing'
198
+  experiment.types [from.SANGER] = 'bacterial 1-hybrid, SANGER sequencing'
199
+  experiment.types [from.Cell] = 'bacterial 1-hybrid, SANGER sequencing'
200
+  experiment.types [from.NAR] = 'bacterial 1-hybrid, SANGER sequencing'
201
+  experiment.types [from.FlyReg] = 'DNASE I footprinting'
202
+
203
+  tbl.md = cbind (tbl.md, tfFamily=rep(NA_character_, nrow (tbl.md)))
204
+
205
+  tbl.md = cbind (tbl.md, experimentType=experiment.types)
206
+
207
+  pubmedIDs = rep (NA_character_, nrow (tbl.md))
208
+  pubmedIDs [from.Cell] = '18332042'
209
+  pubmedIDs [from.NAR] = '1858360'
210
+
211
+  tbl.md = cbind (tbl.md, pubmedID=pubmedIDs)
212
+
213
+  
214
+  invisible (tbl.md)
215
+
216
+} # createMetadata
217
+#------------------------------------------------------------------------------------------------------------------------
218
+normalizeMatrices = function (matrices)
219
+{
220
+  mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
221
+  invisible (mtx.normalized)
222
+
223
+} # normalizeMatrices
224
+#------------------------------------------------------------------------------------------------------------------------
225
+renameMatrices = function (matrices, tbl.md)
226
+{
227
+  stopifnot (length (matrices) == nrow (tbl.md))
228
+  names (matrices) = rownames (tbl.md)
229
+  invisible (matrices)
230
+
231
+} # renameMatrices
232
+#-------------------------------------------------------------------------------
233
+fbgnToIDs <- function(fbgns, useInputForMissingValues=TRUE)
234
+{
235
+   fbgns <- unique(fbgns)
236
+   if(!exists("tbl.names")) {
237
+       tbl.sym <- toTable(org.Dm.egSYMBOL)
238
+       tbl.fbgn <- toTable(org.Dm.egFLYBASE)
239
+       tbl.names <<- merge(tbl.fbgn, tbl.sym)[, c("flybase_id", "symbol", "gene_id")]
240
+       flyMart <<- useMart(biomart="ensembl", dataset="dmelanogaster_gene_ensembl")
241
+       }
242
+   tbl.bm <- getBM(attributes=c("flybase_gene_id", "entrezgene"), filters=c("flybase_gene_id"),
243
+                   values=fbgns, mart=flyMart)
244
+   fbgns.notfound <- setdiff(fbgns, tbl.bm$flybase_gene_id)
245
+   if(length(fbgns.notfound) > 0){
246
+       new.rows <- data.frame(flybase_gene_id=fbgns.notfound,
247
+                              entrezgene=NA,
248
+                              stringsAsFactors=FALSE)
249
+       tbl.bm <- rbind(tbl.bm, new.rows)
250
+       }
251
+
252
+   tbl.bm <- unique(tbl.bm)
253
+
254
+      # some fbgns could be mapped to multiple entrez genes.  remove all but the
255
+      # one which occurs first
256
+   
257
+   duplicates <- which(duplicated(tbl.bm$flybase_gene_id))
258
+   if(length(duplicates) > 0)
259
+       tbl.bm <- tbl.bm[-duplicates,]
260
+   
261
+   stopifnot(sort(fbgns) == sort(tbl.bm$flybase_gene_id))
262
+
263
+       # order the biomart results so that it matches the order of the incoming fgbns
264
+   tbl.bm <- tbl.bm[match(fbgns, tbl.bm$flybase_gene_id),]
265
+   hits <- match(tbl.bm$entrezgene, tbl.names$gene_id)
266
+   result <- tbl.names[hits,]
267
+   rownames(result) <- fbgns
268
+
269
+       # not all FBgn ids have an associated entrez geneID.
270
+       # use these orphan FBgn ids as their own "geneID"
271
+   
272
+   failed.lookups <- which(is.na(result$flybase_id))
273
+   failed.count <- length(failed.lookups)
274
+   if(useInputForMissingValues & failed.count > 0) {
275
+      failed.names <- rownames(result)[failed.lookups]
276
+      names.matrix <- matrix(rep(failed.names, failed.count), nrow=failed.count)
277
+      result[failed.lookups,] <- names.matrix
278
+      }
279
+
280
+   geneIdType <- rep("ENTREZ", nrow(result))
281
+   geneIdType [failed.lookups] <- "FLYBASE"
282
+   result <- cbind(result, geneIdType, stringsAsFactors=FALSE)
283
+   result
284
+
285
+
286
+} # fbgnToIDs
287
+#------------------------------------------------------------------------------------------------------------------------
0 288
new file mode 100644
... ...
@@ -0,0 +1,300 @@
1
+# flyFactorSurvey/test.R
2
+#-------------------------------------------------------------------------------
3
+library (RUnit)
4
+#-------------------------------------------------------------------------------
5
+source("import.R")
6
+#-------------------------------------------------------------------------------
7
+run.tests = function (flyFactorSurveyRootDir="~/s/data/public/TFBS/flyFactorSurvey/unpacked")
8
+{
9
+    freshStart ()
10
+    test.fbgnToIDs()
11
+    x.list.BD <- test.createXrefBindingDomain (flyFactorSurveyRootDir)
12
+    x.xref <- test.createXref ()
13
+    x.filenames <- test.getMatrixFilenames (flyFactorSurveyRootDir)
14
+    x.tbl.ref <- test.createExperimentRefTable ()
15
+    x.pwm <- test.parsePWMfromText (flyFactorSurveyRootDir)
16
+    x.mat3 <- test.readAndParse (flyFactorSurveyRootDir)
17
+    x.tbl.md <- test.createMetadata (x.mat3, x.tbl.ref, x.xref, x.list.BD)
18
+    x.mat3f <- test.normalizeMatrices (x.mat3)
19
+    x.mat3fr <- test.renameMatrices (x.mat3f, x.tbl.md [1:3,])
20
+
21
+} # run.tests
22
+#-------------------------------------------------------------------------------
23
+freshStart = function ()
24
+{
25
+    output.files.easy.to.regenerate = grep ('RData$', dir (), value=TRUE)
26
+    if (length (output.files.easy.to.regenerate) > 0)
27
+        unlink (output.files.easy.to.regenerate)
28
+
29
+} # freshStart
30
+#-------------------------------------------------------------------------------
31
+test.createXrefBindingDomain = function (flyFactorSurveyRootDir)
32
+{
33
+    print ('--- test.createXrefBindingDomain')
34
+    list.BD = createXrefBindingDomain (flyFactorSurveyRootDir)
35
+    checkEquals (length (list.BD), 777)
36
+    checkTrue (all (grepl ('FBgn', names (list.BD))))
37
+        # most abundant domain is zf-C2H2
38
+    checkEquals(names (sort (table (as.character (list.BD)), decreasing=TRUE)[1]),
39
+                "zf-C2H2")
40
+    invisible(list.BD)
41
+
42
+} # test.createXrefBindingDomain
43
+#-------------------------------------------------------------------------------
44
+test.createXref = function ()
45
+{
46
+    print ('--- test.createXref')
47
+    xref = createXref ()
48
+    checkEquals (xref [['FBgn0261819']], 'F0J881')
49
+    #checkEquals (xref [['FBgn0259750']], 'E1JHF4') # gone with org.Dm.eg.db_2.9.0
50
+    checkEquals (xref [['FBgn0000014']], 'E1JIM9')
51
+    invisible (xref)
52
+
53
+} # test.createXref
54
+#-------------------------------------------------------------------------------
55
+test.getMatrixFilenames = function (flyFactorSurveyRootDir)
56
+{
57
+    print ('--- test.getMatrixFilenames')
58
+
59
+    filenames = getMatrixFilenames (flyFactorSurveyRootDir)
60
+       # as of (03 apr 2013): 614 matrix files, one binding domains extra
61
+       # file, TFfile2b.tsv
62
+    
63
+    checkEquals (length (filenames), 615)
64
+    checkEquals(filenames[1:3],
65
+                c("Abd-A_FlyReg_FBgn0000014.pfm", "Abd-B_FlyReg_FBgn0000015.pfm",
66
+                  "AbdA_Cell_FBgn0000014.pfm"))
67
+    invisible (filenames)
68
+
69
+} # test.getMatrixFilenames
70
+#-------------------------------------------------------------------------------
71
+test.createExperimentRefTable = function ()
72
+{
73
+    print ('--- test.createExperimentRefTable')
74
+    tbl.ref = createExperimentRefTable ()
75
+    checkEquals (nrow (tbl.ref), 6)
76
+
77
+    invisible (tbl.ref)
78
+
79
+} # test.createExperimentRefTable
80
+#-------------------------------------------------------------------------------
81
+test.parsePWMfromText = function (flyFactorSurveyRootDir)
82
+{
83
+    print ('--- test.parsePWMfromText')
84
+    path <- file.path(flyFactorSurveyRootDir, 'Tup_SOLEXA_10_FBgn0003896.pfm')
85
+
86
+    lines.of.text =  scan (path, sep='\n',
87
+                           what=character(0), comment='#', quiet=TRUE)
88
+    pwm.tup =  parsePWMfromText (lines.of.text)
89
+    checkEquals (dim (pwm.tup), c (4, 9))
90
+    checkEquals (colnames (pwm.tup), as.character (1:9))
91
+    checkEquals (rownames (pwm.tup), c ('A', 'C', 'G', 'T'))
92
+
93
+  invisible (pwm.tup)
94
+  
95
+} # test.parsePWMfromText
96
+#-------------------------------------------------------------------------------
97
+test.readAndParse = function (flyFactorSurveyRootDir)
98
+{
99
+    print ('--- test.readAndParse')
100
+
101
+    all.files = file.path(flyFactorSurveyRootDir,
102
+                          getMatrixFilenames (flyFactorSurveyRootDir))
103
+    
104
+    sample.files = all.files [1:3]   # grep ('FBgn0003896', all.files)
105
+    checkEquals (length (sample.files), 3)
106
+    checkTrue(all(file.exists(sample.files)))
107
+
108
+      # by sorting these filenames, we know the order of the three returned
109
+      # matrices, and results can easily be checked
110
+    
111
+    mtx.test = readAndParse (sample.files)
112
+
113
+    checkEquals (length (mtx.test), 3)
114
+    expected.names = c("Abd-A_FlyReg_FBgn0000014.pfm",
115
+                       "Abd-B_FlyReg_FBgn0000015.pfm",
116
+                       "AbdA_Cell_FBgn0000014.pfm")
117
+    actual.names = sort(names(mtx.test))
118
+    checkEquals(expected.names, actual.names)
119
+
120
+    checkEquals (dim (mtx.test [[1]]), c (4, 8))
121
+    checkEquals (dim (mtx.test [[2]]), c (4, 8))
122
+    checkEquals (dim (mtx.test [[3]]), c (4, 7))
123
+
124
+    invisible (mtx.test)
125
+
126
+} # test.readAndParse
127
+#-------------------------------------------------------------------------------
128
+test.createMetadata = function (matrices, tbl.ref, xref, list.BD)
129
+{
130
+    print ('--- test.createMetadata')
131
+
132
+    tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD)
133
+    checkEquals (nrow (tbl.md), length (matrices))
134
+    checkEquals (ncol (tbl.md), 15)
135
+    expected <- c("providerName", "providerId", "dataSource", "geneSymbol",
136
+                  "geneId", "geneIdType", "proteinId", "proteinIdType",
137
+                  "organism", "sequenceCount", "bindingSequence",
138
+                  "bindingDomain", "tfFamily", "experimentType", "pubmedID")
139
+
140
+    checkEquals (colnames (tbl.md), expected)
141
+    tester =  "Dmelanogaster-FlyFactorSurvey-Abd.A_FlyReg_FBgn0000014"
142
+    checkTrue (tester %in% rownames (tbl.md))
143
+    x = as.list (tbl.md [tester,])
144
+    checkEquals (x$providerName, "Abd-A_FlyReg_FBgn0000014")
145
+    checkEquals (x$providerId, "FBgn0000014")
146
+    checkEquals (x$geneSymbol, "abd-A")
147
+    checkEquals (x$geneId, "42037")
148
+    checkEquals (x$geneIdType, "ENTREZ")
149
+    checkEquals (x$proteinId, "E1JIM9")
150
+    checkEquals (x$proteinIdType, "UNIPROT")
151
+    checkEquals (x$organism, "Dmelanogaster")
152
+    checkEquals (x$sequenceCount, 37)
153
+    checkTrue   (is.na (x$bindingSequence))
154
+    checkEquals (x$bindingDomain, 'Homeobox')
155
+    checkTrue   (is.na (x$tfFamily))
156
+    checkEquals (x$experimentType, "DNASE I footprinting")
157
+    checkTrue   (is.na (x$pubmedID))
158
+  
159
+    invisible (tbl.md)
160
+
161
+} # test.createMetadata
162
+#-------------------------------------------------------------------------------
163
+test.renameMatrices = function (matrices, tbl.md)
164
+{
165
+    print ('--- test.renameMatrices')
166
+    stopifnot (length (matrices) == 3)
167
+    stopifnot (nrow (tbl.md) == 3)
168
+
169
+    old.names = names (matrices)
170
+    mtxr = renameMatrices (matrices, tbl.md)
171
+
172
+      # make sure the dash in the incoming, eg,
173
+      #     "Abd-A_FlyReg_FBgn0000014.pfm"
174
+      # is converted to a '.', reserving dashes for 
175
+      # the separation of dataSource-species-nativeGeneName
176
+
177
+
178
+    checkEquals (names (mtxr) [1],
179
+                 "Dmelanogaster-FlyFactorSurvey-Abd.A_FlyReg_FBgn0000014")
180
+
181
+    checkEquals (names (mtxr) [2],
182
+                 "Dmelanogaster-FlyFactorSurvey-Abd.B_FlyReg_FBgn0000015")
183
+
184
+    checkEquals (names (mtxr) [3],
185
+                 "Dmelanogaster-FlyFactorSurvey-AbdA_Cell_FBgn0000014")
186
+
187
+    invisible (mtxr)
188
+
189
+} # renameMatrices
190
+#-------------------------------------------------------------------------------
191
+test.fbgnToIDs <- function()
192
+{
193
+    print("--- test.fbgnToIDs")
194
+       # first, make sure that unmappable FBgn ids are handled properly
195
+    bogus <- fbgnToIDs("bogus")
196
+    checkEquals(dim(bogus), c(1,4))
197
+    checkEquals(rownames(bogus), "bogus")
198
+    checkEquals(as.character(as.list(bogus[1,],)), c(rep("bogus", 3), "FLYBASE"))
199
+
200
+       # disable the translation of NA to rowname
201
+    bogus <- fbgnToIDs("bogus", useInputForMissingValues=FALSE)
202
+    checkEquals(dim(bogus), c(1,4))
203
+    checkEquals(rownames(bogus), "bogus")
204
+    checkTrue(all(is.na(bogus[1,1:3])))
205
+
206
+    fbgns <- c ("FBgn0003267", "FBgn0000611", "FBgn0004394", "FBgn0027364",
207
+                "FBgn0000413")
208
+    tbl.ids <- fbgnToIDs(fbgns)
209
+    checkEquals(tbl.ids$flybase_id, fbgns)
210
+    checkEquals(tbl.ids$symbol, c("ro", "exd", "pdm2", "Six4", "da"))
211
+    checkEquals(tbl.ids$geneIdType, rep("ENTREZ", 5))
212
+
213
+         # 0259750 is obsolete, replace by 0264442
214
+    tbl.ids <- fbgnToIDs("FBgn0259750")
215
+    checkEquals(as.list(tbl.ids),
216
+                list(flybase_id="FBgn0264442", symbol="ab", gene_id="34560",
217
+                     geneIdType="ENTREZ"))
218
+
219
+       # http://flybase.org/static_pages/downloads/IDConv.html
220
+       # FBgn0003986 	 FBgn0261930 	FBgn0261930	 vnd 
221
+    tbl.ids <- fbgnToIDs("FBgn0003986")
222
+    checkEquals(as.character(as.list(tbl.ids[1,],)),
223
+                c(rep("FBgn0003986", 3), "FLYBASE"))
224
+    
225
+    load("../../../extdata/fbgns.RData")
226
+    checkEquals(length(fbgns), 326)
227
+    tbl.ids <- fbgnToIDs(fbgns)
228
+    checkEquals(dim(tbl.ids), c(length(fbgns), 4))
229
+
230
+       # no NA's should survive
231
+    checkTrue(!any(is.na(tbl.ids)))
232
+
233
+       # what percentage of the 326 ids fail to map?  do NOT convert NAs
234
+    tbl.ids <- fbgnToIDs(fbgns, useInputForMissingValues=FALSE)
235
+    failure.count <- length(which(is.na(tbl.ids$flybase_id)))
236
+    checkEquals(failure.count, 18)
237
+
238
+} # test.fbgnToIDsp
239
+#-------------------------------------------------------------------------------
240
+# robert stojnic reported that MotifDb 1.1.6 (and before) had some incorrect
241
+# gene symbols, as seen here:
242
+#   library(MotifDb)
243
+#   mdb <- MotifDb
244
+#   x <- query(mdb, "ab_SANGER_10_FBgn0259750")
245
+#   t(as.matrix(mcols(x)))
246
+#                 [,1]                                   
247
+#   providerName    "ab_SANGER_10_FBgn0259750"             
248
+#   providerId      "FBgn0259750"
249
+#   dataSource      "FlyFactorSurvey"                      
250
+#   geneSymbol      "Ab"                                   
251
+#   geneId          "FBgn0259750"                          
252
+#   geneIdType      "FLYBASE"                              
253
+#   proteinId       "E1JHF4"                               
254
+#   proteinIdType   "UNIPROT"                              
255
+#   organism        "Dmelanogaster"                        
256
+#   sequenceCount   "20"                                   
257
+#   bindingSequence NA                                     
258
+#   bindingDomain   "zf-C2H2"                              
259
+#   tfFamily        NA                                     
260
+#   experimentType  "bacterial 1-hybrid, SANGER sequencing"
261
+#   pubmedID        NA
262
+#
263
+# FBgn0259750 is obsolete.  flybase provides a converter:
264
+#    http://flybase.org/static_pages/downloads/IDConv.html shows this, and provide the current FBgn:
265
+#
266
+#           Submitted ID          Current ID    Converted ID    Related record 
267
+#            FBgn0259750          FBgn0264442    FBgn0264442                ab 
268
+#
269
+# make sure we get this right
270
+# the matrix files are:
271
+#
272
+#     flyFactorSurveyRootDir/ab_SANGER_10_FBgn0259750.pfm
273
+#     flyFactorSurveyRootDir/ab_SOLEXA_5_FBgn0259750.pfm
274
+#
275
+test.geneNaming <- function(flyFactorSurveyRootDir)
276
+{
277
+
278
+   flyFactorSurveyRootDir <- "/Users/pshannon/s/data/public/TFBS/flyFactorSurvey/unpacked"
279
+   filenames = getMatrixFilenames (flyFactorSurveyRootDir)
280
+
281
+   removers <- grep (bindingDomainXrefSourceFile(), filenames)
282
+   if(length(removers) > 0)
283
+       filenames <- filenames[-removers]
284
+  
285
+   keepers <- grep("FBgn0259750", filenames)
286
+   checkTrue(length(keepers) == 2)
287
+   filenames <- filenames[keepers]
288
+   full.filenames = file.path(flyFactorSurveyRootDir, filenames)
289
+   list.BD = createXrefBindingDomain (flyFactorSurveyRootDir)
290
+   xref = createXref ()  # maps flybase ids to uniprot
291
+   tbl.ref = createExperimentRefTable ()
292
+   matrices = readAndParse (full.filenames)
293
+   tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD)
294
+     # now normalize, not in readAndParse, because we need the counts to go into tbl.md$sequenceCount
295
+   checkEquals(tbl.md$geneSymbol, c("ab", "ab"))
296
+   matrices = normalizeMatrices (matrices)
297
+   matrices = renameMatrices (matrices, tbl.md)
298
+
299
+} # test.geneNaming
300
+#-------------------------------------------------------------------------------