# flyFactorSurvey/import.R #------------------------------------------------------------------------------------------------------------------------ library(org.Dm.eg.db) library(biomaRt) #------------------------------------------------------------------------------------------------------------------------ bindingDomainXrefSourceFile <- function() {"TFfile2b.tsv"} printf <- function(...) print(noquote(sprintf(...))) #kDataDir <- "~/s/data/public/TFBS" kDataDir <- "/shared/silo_researcher/Morgan_M/BioC/MotifDb" #------------------------------------------------------------------------------------------------------------------------ run = function (dataDir=kDataDir) { dataDir <- file.path(dataDir, "flyFactorSurvey") filenames = getMatrixFilenames (dataDir) # the rootDir has only matrix files, with one exception: # a file which describes the protein binding domains # remove that from the matrix file list removers <- grep (bindingDomainXrefSourceFile(), filenames) if(length(removers) > 0) filenames <- filenames[-removers] full.filenames = file.path(dataDir, filenames) list.BD = createXrefBindingDomain (dataDir) xref = createXref () # maps flybase ids to uniprot tbl.ref = createExperimentRefTable () matrices = readAndParse (full.filenames) tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD) # now normalize, not in readAndParse, because we need the counts to go into tbl.md$sequenceCount matrices = normalizeMatrices (matrices) matrices = renameMatrices (matrices, tbl.md) serializedFile <- "flyFactorSurvey.RData" save (matrices, tbl.md, file=serializedFile) printf("saved %d matrices to %s", length(matrices), serializedFile) printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, and rebuild package", serializedFile) } # run #------------------------------------------------------------------------------------------------------------------------ # rec'd extra xref info from michael brodsky on (18 jul 2012) # protein binding domains are of immediate interest. createXrefBindingDomain = function (dataDir) { f1 = file.path(dataDir, bindingDomainXrefSourceFile()) tbl.raw = read.table (f1, sep='\t', header=TRUE, as.is=TRUE, comment.char='') list.BD = tbl.raw$DNAbindingDomain_Primary names (list.BD) = tbl.raw$FlybaseID invisible (list.BD) } # createXrefBindingDomain #------------------------------------------------------------------------------------------------------------------------ # a named list, with uniprot id as content, flybase gene names (FBgn) as names createXref = function () { tbl.egUNIPROT = toTable (org.Dm.egUNIPROT) tbl.egFLYBASE = toTable (org.Dm.egFLYBASE) tbl.xref = merge (tbl.egFLYBASE, tbl.egUNIPROT) xref = tbl.xref [, 3] names (xref) = tbl.xref [, 2] invisible (xref) } # createXrefp #------------------------------------------------------------------------------------------------------------------------ getMatrixFilenames = function (dataDir) { all.files = list.files (dataDir) files.to.exclude = c ('go.R', 'RCS', 'rdata') for (file in files.to.exclude) { removers = grep (file, all.files, ignore.case=T) if (length (removers) > 0) all.files = all.files [-removers] } # for invisible (sort(all.files)) } # getMatrixFilenames #----------------------------------------------------------------------------------------------------------------------- createExperimentRefTable = function () { experiment.names = c ('SANGER', 'SOLEXA', 'NAR', 'Cell', 'FlyReg', 'NBT') pubmedID = c (NA_character_, NA_character_, '1858360', '18332042', NA_character_, NA_character_) experimentType = c ('bacterial 1-hybrid, SANGER sequencing', 'bacterial 1-hybrid, SOLEXA sequencing', 'bacterial 1-hybrid, SANGER sequencing', 'bacterial 1-hybrid, SANGER sequencing', 'bacterial 1-hybrid, SANGER sequencing', 'bacterial 1-hybrid, SANGER sequencing') tbl.ref = data.frame (pmid=pubmedID, experimentType=experimentType, stringsAsFactors=FALSE) rownames (tbl.ref) = experiment.names invisible (tbl.ref) } # createExperimentRefTable #------------------------------------------------------------------------------------------------------------------------ parsePWMfromText = function (lines.of.text) { stopifnot (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) == 4) for (line in lines.of.text) { tokens = strsplit (line, '\\s*[:\\|]') [[1]] nucleotide = tokens [1] numbers.raw = tokens [2] number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]] while (nchar (number.tokens [1]) == 0) number.tokens = number.tokens [-1] numbers = as.numeric (number.tokens) if (!exists ('pwm.result')) pwm.result = matrix (nrow=4, ncol=length (numbers), byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:length(numbers))) pwm.result [nucleotide,] = numbers } pwm.result } # parsePWMfromText #------------------------------------------------------------------------------------------------------------------------ # we expect exactly one matrix per file readAndParse = function (filenames) { matrices = list () for (filename in filenames) { lines.of.text = scan (filename, what=character(0), sep='\n', comment='#', quiet=TRUE) mtx = parsePWMfromText (lines.of.text) matrices [[basename(filename)]] = mtx } invisible (matrices) } # readAndParse #----------------------------------------------------------------------------------------------------------------------- createMetadata = function (matrices, tbl.ref, xref, list.BD) { options ('stringsAsFactors'=FALSE) dataSource = 'FlyFactorSurvey' organism = 'Dmelanogaster' stopifnot (length (grep ('\\.pfm$', names (matrices))) == length (matrices)) native.names.raw = gsub ('\\.pfm$', '', names (matrices)) #geneSymbols = sapply (strsplit (native.names.raw, '_'), function (tokens) return (tokens [1])) flybase.ids = sapply (strsplit (native.names.raw, '_'), function (tokens) return (tokens [length (tokens)])) tbl.ids <- fbgnToIDs(flybase.ids) # we may have duplicate FBgns in the otherwise unique matrix names, # creating a tbl.ids with fewer rows than we have matrices. # expand tbl.ids to align with the matrix nmes tbl.ids <- tbl.ids[flybase.ids,] stopifnot(nrow(tbl.ids) == length(matrices)) # next up: call fbgnToIDs(flybase.ids), assign geneSymbols # and other variables from the returned data.frame stopifnot (length (grep ('^FBgn', flybase.ids)) == length (flybase.ids)) stopifnot (all (nchar (flybase.ids) == 11)) native.names.cooked = gsub ('-', '.', native.names.raw) # so that the dash reliably separates the three parts of full.names full.names = paste (organism, dataSource, native.names.cooked, sep='-') tbl.md = data.frame (providerName=native.names.raw) rownames (tbl.md) = full.names tbl.md = cbind (tbl.md, providerId=flybase.ids) tbl.md = cbind (tbl.md, dataSource = rep (dataSource, nrow (tbl.md))) tbl.md = cbind (tbl.md, geneSymbol=tbl.ids$symbol) tbl.md = cbind (tbl.md, geneId=tbl.ids$gene_id) #geneIdType <- rep("ENTREZ", nrow(tbl.md)) # not all FBgn ids can be mapped (by fbgnToIds) to entrez genes; for those, # the gene_id field is the FBgn id #geneIdIsFBgn <- grep("^FBgn", tbl.ids$gene_id) #if(length(geneIdIsFBgn) > 0) # geneIdType [geneIdIsFBgn] <- "FLYBASE" tbl.md = cbind (tbl.md, geneIdType=tbl.ids$geneIdType) uniprot.ids = as.character (xref [flybase.ids]) tbl.md = cbind (tbl.md, proteinId=uniprot.ids) protein.id.types = rep ('UNIPROT', nrow (tbl.md)) NA.protein.indices = which (is.na (tbl.md$proteinId)) if (length (NA.protein.indices) > 0) protein.id.types [NA.protein.indices] = NA tbl.md = cbind (tbl.md, proteinIdType=protein.id.types) #tbl.md = cbind (tbl.md, proteinIdType=rep ('UNIPROT', nrow (tbl.md))) tbl.md = cbind (tbl.md, organism = rep (organism, nrow (tbl.md))) counts = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx)))))) tbl.md = cbind (tbl.md, sequenceCount=counts) tbl.md = cbind (tbl.md, bindingSequence=rep (NA_character_, nrow (tbl.md))) bindingDomains = rep (NA, nrow (tbl.md)) indices = match (tbl.md$providerId, names (list.BD)) bindingDomain = as.character (list.BD)[indices] bindingDomain [bindingDomain == 'NA'] = NA bindingDomain [bindingDomain == ''] = NA tbl.md = cbind (tbl.md, bindingDomain) experiment.types = rep (NA_character_, nrow (tbl.md)) from.Cell = grep ('_Cell_', native.names.raw) from.NAR = grep ('_NAR_', native.names.raw) from.SOLEXA = grep ('_SOLEXA_', native.names.raw) from.SANGER = grep ('_SANGER_', native.names.raw) from.FlyReg = grep ('_FlyReg_', native.names.raw) experiment.types [from.SOLEXA] = 'bacterial 1-hybrid, SOLEXA sequencing' experiment.types [from.SANGER] = 'bacterial 1-hybrid, SANGER sequencing' experiment.types [from.Cell] = 'bacterial 1-hybrid, SANGER sequencing' experiment.types [from.NAR] = 'bacterial 1-hybrid, SANGER sequencing' experiment.types [from.FlyReg] = 'DNASE I footprinting' tbl.md = cbind (tbl.md, tfFamily=rep(NA_character_, nrow (tbl.md))) tbl.md = cbind (tbl.md, experimentType=experiment.types) pubmedIDs = rep (NA_character_, nrow (tbl.md)) pubmedIDs [from.Cell] = '18332042' pubmedIDs [from.NAR] = '1858360' tbl.md = cbind (tbl.md, pubmedID=pubmedIDs) invisible (tbl.md) } # createMetadata #------------------------------------------------------------------------------------------------------------------------ normalizeMatrices = function (matrices) { mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) invisible (mtx.normalized) } # normalizeMatrices #------------------------------------------------------------------------------------------------------------------------ renameMatrices = function (matrices, tbl.md) { stopifnot (length (matrices) == nrow (tbl.md)) names (matrices) = rownames (tbl.md) invisible (matrices) } # renameMatrices #------------------------------------------------------------------------------- fbgnToIDs <- function(fbgns, useInputForMissingValues=TRUE) { fbgns <- unique(fbgns) if(!exists("tbl.names")) { tbl.sym <- toTable(org.Dm.egSYMBOL) tbl.fbgn <- toTable(org.Dm.egFLYBASE) tbl.names <<- merge(tbl.fbgn, tbl.sym)[, c("flybase_id", "symbol", "gene_id")] flyMart <<- useMart(biomart="ensembl", dataset="dmelanogaster_gene_ensembl") } tbl.bm <- getBM(attributes=c("flybase_gene_id", "entrezgene"), filters=c("flybase_gene_id"), values=fbgns, mart=flyMart) fbgns.notfound <- setdiff(fbgns, tbl.bm$flybase_gene_id) if(length(fbgns.notfound) > 0){ new.rows <- data.frame(flybase_gene_id=fbgns.notfound, entrezgene=NA, stringsAsFactors=FALSE) tbl.bm <- rbind(tbl.bm, new.rows) } tbl.bm <- unique(tbl.bm) # some fbgns could be mapped to multiple entrez genes. remove all but the # one which occurs first duplicates <- which(duplicated(tbl.bm$flybase_gene_id)) if(length(duplicates) > 0) tbl.bm <- tbl.bm[-duplicates,] stopifnot(sort(fbgns) == sort(tbl.bm$flybase_gene_id)) # order the biomart results so that it matches the order of the incoming fgbns tbl.bm <- tbl.bm[match(fbgns, tbl.bm$flybase_gene_id),] hits <- match(tbl.bm$entrezgene, tbl.names$gene_id) result <- tbl.names[hits,] rownames(result) <- fbgns # not all FBgn ids have an associated entrez geneID. # use these orphan FBgn ids as their own "geneID" failed.lookups <- which(is.na(result$flybase_id)) failed.count <- length(failed.lookups) if(useInputForMissingValues & failed.count > 0) { failed.names <- rownames(result)[failed.lookups] names.matrix <- matrix(rep(failed.names, failed.count), nrow=failed.count) result[failed.lookups,] <- names.matrix } geneIdType <- rep("ENTREZ", nrow(result)) geneIdType [failed.lookups] <- "FLYBASE" result <- cbind(result, geneIdType, stringsAsFactors=FALSE) result } # fbgnToIDs #------------------------------------------------------------------------------------------------------------------------