# MotifDb/inst/scripes/import/cisbp/import.R #------------------------------------------------------------------------------------------------------------------------ options (stringsAsFactors=FALSE) printf <- function(...) print(noquote(sprintf(...))) #------------------------------------------------------------------------------------------------------------------------ run = function (dataDir) { printf("cisbp matrix and metadata import"); createMotifDbArchiveFile(dataDir, "cisbp.RData", count=NA) } # run #------------------------------------------------------------------------------------------------------------------------ # cisbp distributes metadata in a MySQL database with many tables. i have cobbled together an sql query to # extract a metadata data.frame from this database. first, however, here are my notes on creating and filling # that database. # not that an alternative to all that fuss is to simply used this file, created by me and with all the # information we currenly need: # # file.path(dataDir, "cisbp", "cisbp-metadata-6559-matrices.Rdata") # # to create from scratch: # cd dataDir/cisbp/sqlTables # --- create accounts # bash> mysql -u root # mysql> create user 'pshannon'@'localhost'; # mysql> grant all privileges on *.* to 'pshannon'@'localhost'; # mysql> quit # bash> mysql -u pshannon # mysql> create database cisbp; # mysql> quit # --- load the data # bash> bash load.sh # createMetadataTableFromDatabase <- function(dataDir, user) { # we want metadata only for those matrices found in the cisbp pwms download directory # so begin by getting those nams pwm.directory <- file.path(dataDir, "cisbp", "pwms") matrix.names <- list.files(pwm.directory) matrix.names <- sub(".txt", "", matrix.names, fixed=TRUE) printf("matrix names: %d", length(matrix.names)) s <- paste(matrix.names, collapse="','") formatted.matrix.names.as.group <- sprintf("'%s'", s) require(RMySQL) db <- dbConnect(MySQL(), user='pshannon', dbname='cisbp') matrix.selector <- sprintf("where ma.Motif_ID in (%s)", formatted.matrix.names.as.group) select <- "select ma.ma_id, ma.tf_id, ma.motif_id, ma.species, tf.TF_Name, ms.PMID, fa.Family_Name, pr.DBID" from <- "from motif_all as ma, tfs as tf, motifs as mo, motif_sources as ms, tf_families as fa, proteins as pr" where <- paste(matrix.selector, "and ma.Motif_ID=mo.Motif_ID", "and ma.TF_ID = tf.TF_ID", "and ma.Evidence = 'D'", "and mo.MSource_ID = ms.MSource_ID", "and tf.Family_ID = fa.Family_ID", "and pr.TF_ID = tf.TF_ID" ) query <- paste(select, from, where, sep=" ") tbl <- dbGetQuery(db, query) print(dim(tbl)) print(dim(unique(tbl[, 1:8]))) # [1] 15694 8 print(dim(unique(tbl[, 1:7]))) # [1] 6559 7 dups <- which(duplicated(tbl[, 1:7])) if(length(dups) > 0) tbl <- tbl[-dups,] dim(tbl) filename <- file.path(dataDir, "cisbp", "cisbp-metadata-6559-matrices.Rdata") printf("saving metdata data.frame to %s", filename) save(tbl, file=filename) } # createMetadataTableFromDatabase #------------------------------------------------------------------------------------------------------------------------ readRawMatrices = function (dataDir) { # our convention is that there is a shared "dataDir" visible to # the importer, and that within that directory there is one # subdirectory for each data source. # for this example importer, that directory will be <dataDir>/test # within which we will look for one small file "sample.pcm" filename <- file.path(dataDir, "cisbp", "sample.pcm") printf("checking for readable matrix file:") printf(" %s", filename) stopifnot(file.exists(filename)) all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE) title.lines = grep ('^>', all.lines) title.line.count <<- length (title.lines) max = title.line.count - 1 pwms = list () for (i in 1:max) { start.line = title.lines [i] end.line = title.lines [i+1] - 1 new.pwm = parsePwm (all.lines [start.line:end.line]) pwms = c (pwms, list (new.pwm)) } # for i invisible (pwms) } # readRawMatrices #------------------------------------------------------------------------------------------------------------------------ extractMatrices = function (pwm.list) { matrices = sapply (pwm.list, function (element) element$matrix) matrix.names <- sapply (pwm.list, function (element) element$title) matrix.names <- sub("^>", "", matrix.names) names (matrices) <- matrix.names matrices } # extractMatrices #------------------------------------------------------------------------------------------------------------------------ # where incoming x is a list with elements like this # # $ma_id =[1] "MA0116870_1.02" # $tf_id = [1] "T025973_1.02" # $motif_id = [1] "M0316_1.02" # $species = [1] "Mus_musculus" # $TF_Name = [1] "Nfil3" # $PMID = [1] "25215497" # $Family_Name = [1] "bZIP" # $DBID = [1] "ENSMUSP00000065363" # translateMetadataToMotifDbStandardForm <- function(x) { checkTrue(all(c("tf_id", "motif_id", "species", "TF_Name", "PMID", "Family_Name", "DBID") %in% names(x))) standardizedOrganism <- standardizeOrganism(x$species) proteinIdType <- deduceProteinIdType(x$DBID, standardizedOrganism) std <- list(providerName=x$motif_id, providerId=x$ma_id, dataSource="cisbp_1.02", geneSymbol=x$TF_Name, geneId=NA, geneIdType=NA, proteinId=x$DBID, proteinIdType=proteinIdType, organism=standardizedOrganism, sequenceCount=NA, bindingSequence=NA, bindingDomain=NA, tfFamily=x$Family_Name, experimentType=NA, pubmedID=x$PMID) std } # translateMetadataToMotifDbStandardForm #------------------------------------------------------------------------------------------------------------------------ deduceProteinIdType <- function(id, organism) { if(substr(id, 1, 1) == "Y" & organism == "Scerevisiae") return("SGD") if(substr(id, 1, 2) == "EN") return("ENSEMBL") return(NA) } # deduceProteinIdType #------------------------------------------------------------------------------------------------------------------------ standardizeOrganism <- function(x) { #printf("standardizeOrganism: %s", x); tokens <- strsplit(x, "_")[[1]] if(length(tokens) != 2){ warning(sprintf("cisbp import could not standardize species name: '%s'", x)) return(x) } result <- sprintf("%s%s", substr(tokens[1], 1, 1), tokens[2]) result } # standardizeOrganism #------------------------------------------------------------------------------------------------------------------------ createMotifDbArchiveFile <- function(dataDir, RDataFileName, count=NA) { if(is.na(count)) file.names <- list.files(file.path(dataDir, "cisbp", "pwms")) else file.names <- head(list.files(file.path(dataDir, "cisbp", "pwms")), n=count) parseFunction <- function(filename){ full.path <- file.path(dataDir, "cisbp", "pwms", filename) checkTrue(file.exists(full.path)) text <- scan(full.path, sep="\n", quiet=TRUE, what=character(0)) pwm <- parsePwm(title, text) pwm$matrix } matrices <- lapply(file.names, parseFunction) titles <- unlist(lapply(file.names, function(filename) sub(".txt", "", filename))) names(matrices) <- titles if(!exists("tbl")){ metadata.file <- file.path(dataDir, "cisbp", "cisbp-metadata-6559-matrices.Rdata") load(metadata.file, env=.GlobalEnv) } count <- length(titles) tbl.md <- as.data.frame(list(providerName=vector("character", count), providerId=vector("character", count), dataSource=vector("character", count), geneSymbol=vector("character", count), geneId=vector("character", count), geneIdType=vector("character", count), proteinId=vector("character", count), proteinIdType=vector("character", count), organism=vector("character", count), sequenceCount=vector("character", count), bindingSequence=vector("character", count), bindingDomain=vector("character", count), tfFamily=vector("character", count), experimentType=vector("character", count), pubmedID=vector("character", count))) tbl.md.row = 0 for(title in titles){ tbl.md.row <- tbl.md.row + 1 if(!title %in% tbl$motif_id){ #printf("skipping matrix %d, no metadata for %s", tbl.md.row, title) next } row <- grep(title, tbl$motif_id) md <- as.list(tbl[row,]) md.fixed <- translateMetadataToMotifDbStandardForm(md) tbl.md[tbl.md.row,] <- as.data.frame(md.fixed) } # for title empties <- which(nchar(tbl.md$providerName) == 0) if(length(empties) > 0){ tbl.md <- tbl.md[-empties,] } rownames(tbl.md) <- paste(tbl.md$organism, tbl.md$dataSource, tbl.md$providerName, sep="-") matrices <- matrices[-empties] names(matrices) <- rownames(tbl.md) printf("saving %d matrices with metadata to %s", nrow(tbl.md), file.path(getwd(), RDataFileName)) save(tbl.md, matrices, file=RDataFileName) # rebuild & install MotifDb, then watch as it loads: # x <- MotifDb:::.MotifDb(TRUE, FALSE) # mcols(query(x, "csipb")) } # createMotifDbArchiveFile #------------------------------------------------------------------------------------------------------------------------ ## # files are named by Motif_ID, which also provides the database key used to create the metadata entries ## # for each motif's matrix, "M1093_1.02.txt" and "M1093_1.02" ## # cisbp at version 1.02 has 6559 matrices. each of these is annotated to different organisms ## # producing maybe 70k metadata table entries ## # M1093 ## createMetadataTable = function (dataDir, motifIDs) ## { ## ## select <- "select mb.Motif_ID, tf.TF_Name, Species, PMID" ## from <- "from motif_best as mb, tfs as tf, motifs as mo, motif_sources as ms" ## where <- "where mb.Motif_ID='M1903_1.02' and mb.Motif_ID=mo.Motif_ID and mb.TF_ID = tf.TF_ID and mo.MSource_ID=ms.MSource_ID" ## ## select <- "select mb.Motif_ID, tf.TF_Name, Species, PMID, Family_Name, pr.DBID" ## from <- "from motif_best as mb, tfs as tf, motifs as mo, motif_sources as ms, tf_families as fa, proteins as pr" ## where <- paste(sprintf("where mb.Motif_ID='%s' and ", "M1903_1.02"), ## "mb.Motif_ID=mo.Motif_ID and", ## "mb.TF_ID = tf.TF_ID and", ## "mo.MSource_ID = ms.MSource_ID and", ## "tf.Family_ID = fa.Family_ID and", ## "pr.TF_ID = tf.TF_ID") ## ## tbl <- dbGetQuery(db, paste(select, from, where, sep=" ")) ## # multiple protein identifiers are often returned; we need just one ## dups <- which(duplicated(tbl[, -(grep("DBID", colnames(tbl)))])) ## if(length(dups) > 0) ## tbl <- tbl[-dups,] ## ## filename <- file.path(dataDir, "cisbp", "md-raw.tsv") ## printf("checking for readable metadata file:") ## printf(" %s", filename) ## stopifnot(file.exists(filename)) ## ## tbl.raw <- read.table(filename, sep="\t", header=TRUE, as.is=TRUE) ## tbl.md = data.frame () ## matrix.ids = names (matrices) ## ## for (matrix.id in matrix.ids) { ## matrix <- matrices[[matrix.id]] ## short.matrix.name <- sub("\\..*$", "", matrix.id) ## stopifnot(length(grep(short.matrix.name, tbl.raw$ma.name)) == 1) ## md <- as.list(subset(tbl.raw, ma.name==short.matrix.name)) ## dataSource <- "cisbp" ## #browser() ## organism <- ncbiTaxonimicCodeToLinnaean(md$ncbi.tax.code) ## new.row = list (providerName=matrix.id, ## providerId=matrix.id, ## dataSource=dataSource, ## geneSymbol=md$gene.symbol, ## geneId=NA, ## geneIdType=NA, ## proteinId=md$uniprot, ## proteinIdType=guessProteinIdentifierType(md$uniprot), ## organism=ncbiTaxonimicCodeToLinnaean(md$ncbi.tax.code), ## sequenceCount=max(colSums(matrix)), ## bindingSequence=NA_character_, ## bindingDomain=NA, ## tfFamily=md$family, ## experimentType=md$type, ## pubmedID="24194598") ## tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE)) ## full.name = sprintf ('%s-%s-%s', organism, dataSource, matrix.id) ## rownames (tbl.md) [nrow (tbl.md)] = full.name ## } # for i ## ## invisible (tbl.md) ## ## } # createMetadataTable ## #------------------------------------------------------------------------------------------------------------------------ ## renameMatrices = function (matrices, tbl.md) ## { ## stopifnot (length (matrices) == nrow (tbl.md)) ## names (matrices) = rownames (tbl.md) ## invisible (matrices) ## ## } # renameMatrices ## #------------------------------------------------------------------------------------------------------------------------ ## # an empirical and not altogether trustworthy solution to identifying identifier types. ## guessProteinIdentifierType = function (moleculeName) ## { ## if (nchar (moleculeName) == 0) ## return (NA_character_) ## if (is.na (moleculeName)) ## return (NA_character_) ## ## first.char = substr (moleculeName, 1, 1) ## ## if (first.char == 'Y') ## return ('SGD') ## ## if (first.char %in% c ('P', 'Q', 'O', 'A', 'C')) ## return ('UNIPROT') ## ## if (length (grep ('^NP_', moleculeName)) == 1) ## return ('REFSEQ') ## ## return (NA_character_) ## ## } # guessProteinIdentifierType ## #------------------------------------------------------------------------------------------------------------------------ ## normalizeMatrices = function (matrices) ## { ## mtx.normalized = sapply (matrices, ## function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector))) ## ## invisible (mtx.normalized) ## ## } # normalizeMatrices ## #------------------------------------------------------------------------------------------------------------------------ parsePwm = function (title, text) { lines <- strsplit(text, '\t') stopifnot(lines[[1]] == c("Pos", "A", "C", "G", "T")) line.count <- length(lines) row.count <- length(lines) - 1 col.count <- 4 # our standard form is 4 rows (one per nucelotide) and n columns # cisbp matrices come in transposed: read them as-is, then transpose # to our format result <- matrix(nrow=row.count, ncol=col.count, dimnames=list(as.character(1:row.count), c("A","C", "G", "T"))) for(i in 1:row.count){ row <- i + 1 result [i,] = as.numeric (lines[[row]][2:5]) } # for i result = t (result) return (list (title=title, matrix=result)) } # parsePwm ## #---------------------------------------------------------------------------------------------------- ## ncbiTaxonimicCodeToLinnaean <- function(code) ## { ## code <- as.character(code) ## ## lookup <- list("3702" = "Arabidopsis thaliana", ## "3888" = "Pisum sativum", ## "4094" = "Nicontia sp.", ## "4102" = "Petunia hybrida", ## "4151" = "Antirrhinum majus", ## "4513" = "Hordeum vulgare", ## "4565" = "Triticum aestivam", ## "4577" = "Zea mays", ## "4932" = "Saccaromyces cerevisiae", ## "6239" = "Caenorhabditis elegans", ## "7227" = "Drosophila melanogaster", ## "7729" = "Halocynthia roretzi", ## "7742" = "Vertebrata", ## "8022" = "Onchorhynchus mykiss", ## "8355" = "Xenopus laevis", ## "8364" = "Silurana tropicalis", ## "9031" = "Gallus gallus", ## "9606" = "Homo sapiens", ## "9913" = "Bos taurus", ## "9986" = "Oryctolagus cuniculus", ## "10090" = "Mus musculus", ## "10116" = "Rattus norvegicus", ## "10117" = "Rattus rattus") ## ## if (code %in% names(lookup)) ## return(lookup[[code]]) ## ## NA ## ## } # ncbiTaxonimicCodeToLinnaean ## #---------------------------------------------------------------------------------------------------- ##