## This file will contain the code to make ONE Inparanoid DB from
## source files it downloads, unzips and imports (including metadata).
## The code to wrap that file into a package will still live in
## AnnotationForge for now.

## TODO add a table with taxIDs mapped onto the short names.
## this file is here: http://inparanoid.sbc.su.se/download/current/sequences/species.mapping.inparanoid8


## Helper to get a single dir listing.
##
## dir: URL (or file) of an HTML directory index.
## Returns a character vector holding the href of every <a> element on
## the page, minus the column-sorting links and the parent-dir link.
.getSubDirs <- function(dir){
    require(RCurl)
    require(XML)
    ## Handler factory: the closure accumulates the href of every <a>
    ## node that the parser visits (hence the deliberate `<<-`).
    getLinks <- function() {
        links <- character()
        list(a = function(node, ...) {
                 links <<- c(links, xmlGetAttr(node, "href"))
                 node
             },
             links = function() links)
    }
    ## now get the links (sub dirs)
    h1 <- getLinks()
    htmlTreeParse(dir, handlers = h1)
    res <- h1$links()  ## base result
    ## some filtering
    res[!(res %in% c("?C=N;O=D", "?C=M;O=A", "?C=S;O=A", "?C=D;O=A",
                     "/download/current/"))]
}


## Helper to get one file set.
##
## dir:     URL of the directory listing that holds the .tgz files.
## dataDir: local directory to download into and extract into.
## Returns the local paths of the downloaded tarballs.
.getAFileSet <- function(dir, dataDir){
    files <- .getSubDirs(dir)
    ## filter empty dirs (are not tarballs)
    files <- files[grepl("\\.tgz$", files)]
    ## URLs are built by plain concatenation, so make sure that there
    ## is a separator between the directory and the file name.
    baseUrl <- if (grepl("/$", dir)) dir else paste0(dir, "/")
    allPaths <- paste0(baseUrl, files)
    ## now actually download all these files (tarballs are binary)
    localFiles <- file.path(dataDir, files)
    mapply(download.file, url=allPaths, destfile=localFiles,
           MoreArgs=list(mode="wb"))
    ## untar all that stuff...
    lapply(localFiles, untar, exdir=dataDir)
    ## And then return the list of junk you just downloaded
    localFiles
}


## Helper for removing junk from data and outputting warnings as needed.
##
## file: path of a tab-delimited, header-less, six column source file.
## cols: the "critical" columns (indices or names); any row with an NA
##       in one of them is dropped.  Defaults to the first five columns,
##       which are the ones declared NOT NULL by .popInpTable().
## Returns the data.frame that was read, minus the rows that have an NA
## in a critical column.  One warning is raised per flawed column.
.prepareData <- function(file, cols = 1:5) {
    ## Some of their files have a period randomly stuck onto the end...
    ## so when the expected name is absent, look for anything that
    ## starts with it (globbing avoids a shell call and is portable).
    if(!file.exists(file)){
        message("Attempting to guess the fileName based on:", file)
        guesses <- Sys.glob(paste0(file, "*"))
        if(length(guesses) == 0L){
            stop("No file found matching: ", file, call. = FALSE)
        }
        ## read.delim() needs exactly one file name
        file <- guesses[1L]
        message("Our guess is that you wanted:", file)
    }
    message("reading in ", file)
    insVals <- read.delim(file=file, header=FALSE, sep="\t", quote="",
                          colClasses=c("integer", rep("character", 5)),
                          stringsAsFactors=FALSE)
    ## nothing to check when no critical columns were requested
    if(length(cols) == 0L){
        return(insVals)
    }
    ## check to see if any critical stuff is missing
    critVals <- insVals[, cols, drop=FALSE]
    NAColCnts <- vapply(critVals, function(col) sum(is.na(col)), integer(1))
    ## Then we need to make log entries for flaws that we find...
    for(i in which(NAColCnts > 0L)){
        warning(paste0("CRITICAL TABLE FLAW!  There were ", NAColCnts[i],
                       " NAs inside of critical col ", cols[i],
                       " inside the file named ", file, '\n'))
    }
    ## then scrub out the bad data rows (on crit cols).  The mask is
    ## computed once against the full table so that it always lines up
    ## with the rows, however many columns turn out to be flawed.
    insVals[stats::complete.cases(critVals), , drop=FALSE]
}


## Helper to get us a table name from an inparanoid short species name.
##
## shortName: short name such as the ones used in the download URLs.
## Returns the matching 'tableNames' value(s) from the species mapping
## file shipped in AnnotationForge.  An exact match on the
## 'inparanoidSpecies' column wins; otherwise the name is used as a
## regular expression, so more than one value can come back.
.lookupTableName <- function(shortName){
    allNames <- read.delim(system.file('extdata','inp8_Full_species_mapping',
                                       package='AnnotationForge'),
                           sep="\t", header=TRUE, stringsAsFactors=FALSE)
    lidx <- allNames$inparanoidSpecies %in% shortName
    if(!any(lidx)){
        lidx <- grepl(shortName, allNames$inparanoidSpecies)
    }
    allNames[lidx, 'tableNames']
    #gsub(" ", "_", gsub("-","_", gsub("\\.","",res)))
}


## Helper to populate one file to become a table in a database.
##
## con:     open connection to the SQLite DB being built.
## file:    path of the downloaded "InParanoid.<species>-<other>.tgz".
## species: short name of the species that the DB is about.
## dataDir: directory where the tarballs were extracted.
##
## NOTE(review): sqliteQuickSQL(), dbBeginTransaction() and
## dbGetPreparedQuery() look like the legacy RSQLite API; newer
## releases are believed to have replaced them with dbExecute() and
## dbBegin() - TODO confirm against the RSQLite version in use.
.popInpTable <- function(con, file, species, dataDir="."){
    ## cleanup because there is some peculiarity for some of their files.
    file <- sub("\\..tgz", ".tgz", file)
    ## extract the other species from the file name; the directory part
    ## is dropped first so that it cannot interfere with the parsing.
    fileSpecies <- sub("^InParanoid.+?-", "", basename(file), perl=TRUE)
    fileSpecies <- sub("\\.tgz$", "", fileSpecies)
    ## get tableName by translating the fileSpecies
    tableName <- .lookupTableName(fileSpecies)
    if(length(tableName) != 1L){
        stop("Expected exactly one table name for '", fileSpecies,
             "' but found ", length(tableName), call. = FALSE)
    }
    ## get the actual extracted src file name that I need.
    srcFile <- file.path(dataDir, paste0("sqltable.",species,"-",fileSpecies))

    ## Make a table
    message(paste0("Creating table: ", tableName))
    sql <- paste0("    CREATE TABLE IF NOT EXISTS ", tableName, " (
        clust_id INTEGER NOT NULL,
        clu2 VARCHAR(10) NOT NULL,
        species VARCHAR(15) NOT NULL,
        score VARCHAR(6) NOT NULL,
        inp_id VARCHAR(30) NOT NULL,
        seed_status CHAR(4));")
    sqliteQuickSQL(con, sql)

    ## Populate it with the contents of the filename
    message("Populating table: ", tableName)
    ## The first five columns are NOT NULL in the table, so rows that
    ## lack any of them have to be scrubbed before the insert.
    clnVals <- .prepareData(file=srcFile, cols=1:5)
    sql <- paste0("    INSERT into ", tableName,
                  " (clust_id,clu2,species,score,inp_id,seed_status)
                    VALUES (?,?,?,?,?,?)")
    dbBeginTransaction(con)
    ## do not leave a transaction open when the insert fails
    tryCatch({
        dbGetPreparedQuery(con, sql, clnVals)
        dbCommit(con)
    }, error = function(e){
        dbRollback(con)
        stop(e)
    })
}


## Helper to toss out all the generated files...
##
## files:   paths of the downloaded tarballs.
## dataDir: directory that holds the tarballs and what was extracted.
## Removes every file in dataDir whose name contains the
## ".<species>-<other>" part of a tarball name, plus any *.stdout file.
.cleanupFiles <- function(files, dataDir="."){
    ## basename() rather than a regex built from dataDir: a directory
    ## name is not a safe regular expression.
    fileBase <- sub("^InParanoid", "", sub("\\.tgz$", "", basename(files)))
    ## unlink allows wildcards
    unlink(file.path(dataDir, paste0("*", fileBase, "*")))
    unlink(file.path(dataDir, "*.stdout"))
}


## .makeMapCounts <- function(con, species){
##     tabs <- ## lookup function for "Full_species_mapping"
##     ## Then generate sql inserts
##     sqls <- paste0('INSERT INTO map_metadata ', tabs)
## }

## TODO: add Organism and species to the metadata!
## Helper for filling in the metadata table.
##
## con:     open connection to the SQLite DB being built.
## species: species name with underscores (as used for table names);
##          it is written, with spaces instead, as both the ORGANISM
##          and the SPECIES metadata values.
##
## NOTE(review): sqliteQuickSQL(), dbBeginTransaction() and
## dbGetPreparedQuery() look like the legacy RSQLite API; newer
## releases are believed to have replaced them with dbExecute() and
## dbBegin() - TODO confirm against the RSQLite version in use.
.makeBasicMetadata <- function(con, species){
    message("Creating metadata table")
    sql <- paste0("    CREATE TABLE if not exists metadata (
        name VARCHAR(80) PRIMARY KEY,
        value VARCHAR(255));")
    sqliteQuickSQL(con, sql)
    meta <- read.delim(system.file('extdata','inp8_metadata',
                                   package='AnnotationForge'),
                       sep="\t", header=TRUE, stringsAsFactors=FALSE)
    ## every underscore becomes a space (names can have more than one)
    species <- gsub("_", " ", species, fixed=TRUE)
    meta[meta$name=='ORGANISM','value'] <- species
    meta[meta$name=='SPECIES','value'] <- species
    sql <- paste0("INSERT INTO metadata VALUES (:name, :value)")
    dbBeginTransaction(con)
    ## do not leave a transaction open when the insert fails
    tryCatch({
        dbGetPreparedQuery(con, sql, bind.data = meta)
        dbCommit(con)
    }, error = function(e){
        dbRollback(con)
        stop(e)
    })
}

## map metadata is also something I don't need to inp8 (no more maps)
## .makeMapMetadata <- function(){
##     meta <- read.delim(system.file('extdata','inp8_metadata',
##                                    package='AnnotationForge'),
##                        sep="\t", header=TRUE, stringsAsFactors=FALSE)
## }

## .makeMetadata <- function(con, species){
##     ## make regular metadata.
##     .makeBasicMetadata(con)
##     ## make the map_metadata
##     .makeMapMetadata(con)
##     ## make the map_counts ??? - I am guessing that I don't even want this.
## }


## Function to make a set of tables.
##
## dir:     URL of the directory listing that holds the InParanoid
##          tarballs of one species.
## dataDir: local directory used for the downloads (default ".").
## Returns the file name of the SQLite database, which is created in
## the current working directory.
makeInpDb <- function(dir, dataDir="."){
    ## Start by getting all the data we need
    files <- .getAFileSet(dir, dataDir)
    ## temp hack if you don't want to re-DL the files each time
    ## files = dir('.', pattern='*.tgz')
    if(length(files) == 0L){
        stop("No InParanoid .tgz files were found at: ", dir, call. = FALSE)
    }

    ## set up for a database
    require("RSQLite")
    ## The species comes from the name of the first tarball; the
    ## directory part is dropped first because a '-' inside of dataDir
    ## would otherwise truncate the name.
    abbrevSpeciesName <- sub("-.*\\.tgz$", "", basename(files[1]))
    abbrevSpeciesName <- sub("^InParanoid.", "", abbrevSpeciesName)
    speciesTable <- .lookupTableName(abbrevSpeciesName)
    if(length(speciesTable) != 1L){
        stop("Expected exactly one table name for '", abbrevSpeciesName,
             "' but found ", length(speciesTable), call. = FALSE)
    }
    DBNAME <- paste0("hom.", speciesTable, ".inp8", ".sqlite")
    ## each DB will need a connection; close it even when a later step
    ## fails
    con <- dbConnect(SQLite(), dbname=DBNAME)
    on.exit(dbDisconnect(con), add=TRUE)

    for(i in seq_along(files)){
        ## Then start making tables...
        .popInpTable(con, file=files[i], species=abbrevSpeciesName,
                     dataDir=dataDir)
    }
    ## Don't forget the metadata...
    ## .makeMetadata(con, species=.lookupTableName(abbrevSpeciesName))
    .makeBasicMetadata(con, species=speciesTable)

    ## And end by unlinking all the data we don't need anymore...
    .cleanupFiles(files, dataDir)
    ## return the name of the DB for external saving
    DBNAME
}


## This part of the file will contain the code to make all the
## Inparanoid DBs by knowing about where the resources live online and
## then asking for each one in turn.

## makeInpDbs <- function(dataDir="."){
##     curLoc <- 'http://inparanoid.sbc.su.se/download/current/Orthologs/'
##     subDirs <- .getSubDirs(curLoc)
##     ## minor filtering
##     ## subDirs <- subDirs[!(subDirs %in% c('stderr/'))]
##     ## species <- sub("/","",subDirs)
##     ## allDirs <- paste0(curLoc, subDirs)
##
##     ## for each AllDir, we want to make a DB with:
##     ## lapply(allDirs, makeInpDb, dataDir)
##
##     ## JUST for now: lets just make the ones we supported before (for testing)
##     traditional <- c('A.thaliana','C.elegans','D.melanogaster','D.rerio','H.sapiens','M.musculus','R.norvegicus','S.cerevisiae')
##     allDirs <- paste0(curLoc, traditional,"/")
##     lapply(allDirs, makeInpDb, dataDir)
## }


## Test this out
## library(InparanoidBaseBuilder); makeInpDbs()


###############################################################################
## Then from the DB you can do this
## library(AnnotationDbi); hom.Homo_sapiens.inp8.db <- loadDb('hom.Homo_sapiens.inp8.sqlite')

## test keytypes and columns()
## keytypes(hom.Homo_sapiens.inp8.db)
## ## RESULTS are TOO short. :(

## This *appears* to work...
## k = head(keys(hom.Homo_sapiens.inp8.db, keytype="PONGO_ABELII"))

## This doesn't work right (in part, because of 5 letter code holdover stuff)
## I need to make some simpler methods for inparanoid8 stuff...
## select(hom.Homo_sapiens.inp8.db, keys=k, columns="MUS_MUSCULUS", keytype="PONGO_ABELII")


## TODO: add Organism and species to the metadata!


### TEST again:
## library(AnnotationForge);
## debug(makeInpDb);
## makeInpDb(dir="http://inparanoid.sbc.su.se/download/current/Orthologs/A.aegypti/", dataDir=".")

## debug(AnnotationForge:::.lookupTableName)

## library(AnnotationForge);
## debug(AnnotationForge:::.cleanupFiles);
## debug(makeInpDb);
## debug(AnnotationForge:::.popInpTable);
## example(makeInpDb)


## I need to have the db able to be saved
## library(AnnotationForge);
## db <- makeInpDb(dir="http://inparanoid.sbc.su.se/download/current/Orthologs/A.aegypti/", dataDir=tempdir())