#########################################################################
#########################################################################
####        Helper functions for generating a generic DB             ####
#########################################################################
#########################################################################

## This makes the central table of an EG DB.
.makeCentralTable <- function(entrez, con){
    message("Populating genes table:")
    sql<- paste("
    CREATE TABLE IF NOT EXISTS genes (
      _id INTEGER PRIMARY KEY,
      gene_id VARCHAR(10) NOT NULL UNIQUE           -- Entrez Gene ID
    );")
    sqliteQuickSQL(con, sql)

    gene_id <- data.frame(entrez) ## TODO: data.frame() necessary???
    sql<- paste("INSERT INTO genes(gene_id) VALUES(?);")
    dbBeginTransaction(con)
    dbGetPreparedQuery(con, sql, gene_id)
    dbCommit(con)
    message("genes table filled")
}

## The following takes a data.frame and produces a simple table from that. It
## expects that the 1st column of that data.frame will always be entrez gene
## IDs. All fields are assumed to be varchars of size equal to the values in
## fieldNameLens. TODO: The cols in data have to be named and of equal
## length. indFields is a character vector with the names of fields that we
## want indexed. By default only _id will be indexed.
.makeEmptySimpleTable <- function(con, table, tableFieldLines){
    sql<- paste("
    CREATE TABLE IF NOT EXISTS",table," (
      _id INTEGER NOT NULL,                         -- REFERENCES genes
      ",tableFieldLines,"
      FOREIGN KEY (_id) REFERENCES genes (_id)
    );")
    sqliteQuickSQL(con, sql)
}

.makeSimpleTable <- function(data, table, con, fieldNameLens=25,
                             indFields="_id"){
    message(paste("Populating",table,"table:"))
    tableFieldLines <- paste(paste(names(data)[-1]," VARCHAR(",
                                   fieldNameLens,") NOT NULL, -- data"),
                             collapse="\n ")
    ## For temp table, lets do it like this:
    if(dim(data)[1] == 0){
        ## if we don't have anything to put into the table, then we don't even
        ## want to make a table.
        warning(paste("no values found for table ",table,
                      " in this data chunk.", sep=""))
        ## Create our real table.
        .makeEmptySimpleTable(con, table, tableFieldLines)
        return()
    }else{
        dbWriteTable(con, "temp", data, row.names=FALSE)
        ## Create our real table.
        .makeEmptySimpleTable(con, table, tableFieldLines)
        selFieldLines <- paste(paste("t.",names(data)[-1],sep=""),collapse=",")
        sql<- paste("
    INSERT INTO ",table,"
     SELECT g._id as _id, ",selFieldLines,"
     FROM genes AS g, temp AS t
     WHERE g.gene_id=t.gene_id
     ORDER BY g._id;
     ", sep="")
        sqliteQuickSQL(con, sql)

        ## Add an index for each field in indFields (by default just "_id")
        for(i in seq_len(length(indFields))){
            sqliteQuickSQL(con,
                paste("CREATE INDEX IF NOT EXISTS ",
                      table,"_",indFields[i],"_ind ON ",table,
                      " (",indFields[i],");", sep=""))
        }

        ## drop the temp table
        sqliteQuickSQL(con, "DROP TABLE temp;")
    }
    message(paste(table,"table filled"))
}
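## Illustrative sketch (not part of the original source): .makeSimpleTable()
## is typically handed a data.frame whose first column holds Entrez gene IDs,
## e.g. something shaped like
##     chrs <- data.frame(gene_id    = c("1", "10", "100"),
##                        chromosome = c("19", "8", "20"))
##     .makeSimpleTable(chrs, table="chromosomes", con)
## This joins 'chrs' to the genes table on gene_id and writes a "chromosomes"
## table keyed by the internal _id, indexing only _id by default. The values
## above are made up for illustration.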
## used to gather ancestor nodes for GO terms
.expandGOFrame <- function(frame, AncestMap){
    if(dim(frame)[1] ==0){
        res <- data.frame(gene_id=0, go_id=0, evidence=0)
        return(res[FALSE,])
    }
    ## I want to apply through the original frame and call for the ancestor
    ancList <- mget(as.character(frame$go_id), AncestMap, ifnotfound=NA)
    names(ancList) <- frame$gene_id
    ## the second lookup is identical, but named by evidence code so that the
    ## evidence values are replicated alongside the expanded ancestor terms
    eviCodes <- mget(as.character(frame$go_id), AncestMap, ifnotfound=NA)
    names(eviCodes) <- frame$evidence
    expAncList <- unlist2(ancList)
    expEviCodes <- unlist2(eviCodes)
    extraRows <- data.frame(gene_id=names(expAncList), go_id=expAncList,
                            evidence=names(expEviCodes))
    ##remove rows where go_id="all"
    extraRows <- extraRows[extraRows$go_id != "all",]
    unique(rbind(frame,extraRows))
}

## used to make sure that GO IDs that are NOT in the current GO package do not
## end up in our DB
.filterGOFrame <- function(frame){
    message("Dropping GO IDs that are too new for the current GO.db")
    frame[frame[["go_id"]] %in% Lkeys(GOTERM),]
}

## TODO: modify this so that it no longer unwinds lists...
## used to make the 6 custom GO tables
.makeGOTablesFromNCBI <- function(con){
    bp <- .filterGOFrame(sqliteQuickSQL(con,
        paste("SELECT distinct gene_id, go_id, evidence FROM gene2go",
              "WHERE category = 'Process'")))
    mf <- .filterGOFrame(sqliteQuickSQL(con,
        paste("SELECT distinct gene_id, go_id, evidence FROM gene2go",
              "WHERE category = 'Function'")))
    cc <- .filterGOFrame(sqliteQuickSQL(con,
        paste("SELECT distinct gene_id, go_id, evidence FROM gene2go",
              "WHERE category = 'Component'")))

    headerNames = c("gene_id","go_id","evidence")
    names(bp) <- headerNames
    names(mf) <- headerNames
    names(cc) <- headerNames

    .makeSimpleTable(bp, table = "go_bp", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    .makeSimpleTable(mf, table = "go_mf", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    .makeSimpleTable(cc, table = "go_cc", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))

    ## Now expand the three data.frames to incl all ancestor terms
    bp_all <- .expandGOFrame(bp, GOBPANCESTOR)
    mf_all <- .expandGOFrame(mf, GOMFANCESTOR)
    cc_all <- .expandGOFrame(cc, GOCCANCESTOR)

    .makeSimpleTable(bp_all, table = "go_bp_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    .makeSimpleTable(mf_all, table = "go_mf_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
    .makeSimpleTable(cc_all, table = "go_cc_all", con, fieldNameLens=c(10,3),
                     indFields = c("_id", "go_id"))
}

#########################################################################
#########################################################################

.downloadAndSaveToTemp <- function(url, tmp){
    require(RCurl)
    if (!url.exists(url))
        stop("URL '", url, "' does not exist")
    binRes <- getBinaryURL(url)
    writeBin(binRes, con=tmp)
}

.tryDL <- function(url, tmp){
    times = 4 ## try this many times to DL
    for(i in 1:times){
        ## tryResult <- try( download.file(url, tmp, quiet=TRUE) , silent=TRUE)
        tryResult <- try( .downloadAndSaveToTemp(url, tmp) , silent=TRUE)
        if(is(tryResult,"try-error") && i < times){
            Sys.sleep(20)
        }else if(is(tryResult,"try-error") && i >= times){
            msg = paste("After",times,"attempts, AnnotationDbi is still not able",
                        "to access the following URL:", url,
                        "You might want to try again later.", sep=" ")
            stop(paste(strwrap(msg,exdent=2), collapse="\n"))
        }else{
            return()
        }
    }
}

.downloadData <- function (file, tax_id, NCBIFilesDir=NULL) {
    colClasses1 <- c("character",rep("NULL", times= length(unlist(file))-1))
    colClasses2 <- c(rep("character", times= length(unlist(file))))
    ## names(file) is something like: "gene2go.gz"
    message(paste("Getting data for ",names(file),sep=""))
    ## Where to DL from
    url <- paste("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/",names(file), sep="")
    if(is.null(NCBIFilesDir)){
        tmp <- tempfile()
        ##download.file(url, tmp, quiet=TRUE)
        .tryDL(url, tmp)
    }else{
        tmp <- paste(NCBIFilesDir, names(file), sep=.Platform$file.sep)
        if(!file.exists(tmp)){
            .tryDL(url, tmp)
        }## otherwise we already have the file saved
    }

    ## TODO: this check could be more robust...
    header <- unlist(strsplit(readLines(tmp, n=1),"\t"))
    if(all(unlist(file) %in% header)){ ## then we have been here before.
        message("reading from a pre-processed file")
        vals <- read.delim(tmp, header=TRUE, sep="\t", quote="",
                           stringsAsFactors=FALSE)
    }else{
        message("discarding data from other organisms")
        if("tax_id" %in% unlist(file)){ ## when there is a tax_id we need to subset
            tax_ids <- unlist(read.delim(tmp,header=FALSE,sep="\t",skip=1,
                                         stringsAsFactors=FALSE, quote="",
                                         colClasses = colClasses1)[,1])
            ind <- grep(paste("^",tax_id,"$",sep=""), tax_ids, perl=TRUE)
            if(length(ind) == 0){## ie there are no matching tax_ids...
                vals <- data.frame(t(1:length(unlist(file))),
                                   stringsAsFactors=FALSE)
                colnames(vals) <- unlist(file)
                vals <- vals[FALSE,]
            }else{
                vals <- read.delim(tmp, header=FALSE, sep="\t", skip=1+min(ind),
                                   nrows= max(ind) - min(ind) +1,
                                   stringsAsFactors=FALSE, quote="",
                                   colClasses = colClasses2)
            }
        }else{
            vals <- read.delim(tmp, header=FALSE, sep="\t", skip=1,
                               stringsAsFactors=FALSE, quote="",
                               colClasses = colClasses2)
        }
        ## The following just keeps unwanted data out of our temp DB tables:
        ## if there is a tax_id, then only return the piece that matches the
        ## organism in question
        colnames(vals) <- unlist(file)
        if(!is.null(vals[["tax_id"]])){
            vals <- vals[vals[["tax_id"]]==tax_id,]
        }
        ## Remove the following save line if you think there are problems
        ## with the tax_id filtering above.
        ## save the processed results ONLY
        if(!is.null(NCBIFilesDir)){
            write.table(vals, sep="\t", quote=FALSE, row.names=FALSE, file=tmp)
        }
    }
    vals
}

## helper to populate base tables
.populateBaseTable <- function(con, sql, data, table) {
    dbBeginTransaction(con)
    dbGetPreparedQuery(con, sql, data)
    dbCommit(con)
    message(paste("table ",table," filled",sep=""))
}

## Convenience function for quickly making indices
.makeIndex <- function(con,table, field){
    indName <-paste("ind",table,field,sep="_")
    sqliteQuickSQL(con,
        paste("CREATE INDEX IF NOT EXISTS",indName,"ON",table,"(",field,")"))
}

## metadata stuff
.createMetadataTables <- function(con){
    sqliteQuickSQL(con, paste("CREATE TABLE IF NOT EXISTS metadata (name",
                              "VARCHAR(80) PRIMARY KEY,value VARCHAR(255))"))
    sqliteQuickSQL(con, paste("CREATE TABLE IF NOT EXISTS map_metadata ",
                              "(map_name VARCHAR(80) NOT NULL,",
                              "source_name VARCHAR(80) NOT NULL,",
                              "source_url VARCHAR(255) NOT NULL,",
                              "source_date VARCHAR(20) NOT NULL)"))
    sqliteQuickSQL(con, paste("CREATE TABLE IF NOT EXISTS map_counts ",
                              "(map_name VARCHAR(80) PRIMARY KEY,",
                              "count INTEGER NOT NULL)"))
}

.addMeta <- function(con, name, value){
    data = data.frame(name,value,stringsAsFactors=FALSE)
    sql <- "INSERT INTO metadata (name,value) VALUES(?,?)"
    .populateBaseTable(con, sql, data, "metadata")
}

.addMetadata <- function(con, tax_id, genus, species){
    name <- c("DBSCHEMAVERSION","DBSCHEMA","ORGANISM","SPECIES","CENTRALID",
              "TAXID",
              "EGSOURCEDATE","EGSOURCENAME","EGSOURCEURL",
              "GOSOURCEDATE","GOSOURCENAME","GOSOURCEURL",
              "GOEGSOURCEDATE","GOEGSOURCENAME","GOEGSOURCEURL",
              "Db type","Supporting package")
    value<- c("2.1","ORGANISM_DB",paste(genus,species),paste(genus,species),
              "EG", tax_id,
              date(), "Entrez Gene","ftp://ftp.ncbi.nlm.nih.gov/gene/DATA",
              .getGODate(), "Gene Ontology",
              "ftp://ftp.geneontology.org/pub/go/godata",
              date(), "Entrez Gene","ftp://ftp.ncbi.nlm.nih.gov/gene/DATA",
              "OrgDb","AnnotationDbi")
    .addMeta(con, name, value)
}

.addMapMeta <- function(con, map_name, source_name, source_url, source_date){
    data = data.frame(map_name,source_name,source_url,
                      source_date,stringsAsFactors=FALSE)
    sql <- paste("INSERT INTO map_metadata (map_name,source_name,source_url,",
                 "source_date) VALUES(?,?,?,?)")
    .populateBaseTable(con, sql, data, "map_metadata")
}

.addMapMetadata <- function(con, tax_id, genus, species){
    map_name <- c("ENTREZID","GENENAME","SYMBOL","CHR","ACCNUM","REFSEQ",
                  "PMID","PMID2EG","UNIGENE","ALIAS2EG","GO2EG","GO2ALLEGS",
                  "GO","GO2ALLEGS")
    source_name <- c(rep("Entrez Gene",12),rep("Gene Ontology",2))
    source_url <- c(rep("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA",12),
                    rep("ftp://ftp.geneontology.org/pub/go/go",2))
    source_date <- c(rep(date(),12),rep(.getGODate(),2))
    .addMapMeta(con, map_name, source_name, source_url, source_date)
    ##clean up unwanted data
    ## if we used blast2GO, then we need to drop classic GO
    goSrcs <- sqliteQuickSQL(con,
        "SELECT source_name FROM map_metadata WHERE map_name='GO2EG'")
    if("Gene Ontology" %in% goSrcs & "blast2GO" %in% goSrcs){
        sqliteQuickSQL(con,paste("DELETE FROM map_metadata where map_name",
                                 "='GO2EG' AND source_name='Gene Ontology'"))
    }
}

## Use when there is no GO data
.getBlast2GOData <- function(tax_id, con) {
    url = paste("http://www.b2gfar.org/_media/species:data:",
                tax_id,".annot.zip",sep="")
    tmp <- tempfile()
    ##download.file(url, tmp, quiet=TRUE)
    .tryDL(url,tmp)
    vals <- read.delim(unzip(tmp), header=FALSE, sep="\t", quote="",
                       stringsAsFactors=FALSE)

    ## I will need to do extra stuff here to match up categories etc.
    ## (vals has to look like gene2go would, and I have to join to refseq and
    ## to accession just to get my EGs)
    RSVals <- vals[grep("refseq",vals[,4]),c(2,3)]
    GBVals <- vals[grep("genbank",vals[,4]),c(2,6)]
    colnames(GBVals) <- colnames(RSVals) <- c("go_id","accession")

    tables <- sqliteQuickSQL(con,"SELECT * FROM sqlite_master")$name
    if(!any(c("gene2refseq","gene2accession") %in% tables)){
        stop("It is impossible to match up blasted GO terms with NCBI accessions.")
    }else{
        if("gene2refseq" %in% tables){
            g2rs <- sqliteQuickSQL(con,
                "SELECT gene_id, protein_accession FROM gene2refseq")
            g2rs[,2] <- sub("\\.\\d+$","",g2rs[,2])
            vals <- merge(RSVals,g2rs, by.x="accession",
                          by.y="protein_accession")
        }
        if("gene2accession" %in% tables){
            g2ac <- sqliteQuickSQL(con,
                "SELECT gene_id, protein_accession FROM gene2accession")
            gb <- merge(GBVals,g2ac, by.x="accession",
                        by.y="protein_accession")
            if("gene2refseq" %in% tables){
                vals <- rbind(vals, gb)
            }else{
                vals <- gb
            }
        }
    }

    ## Add to the metadata:
    name <- c("BL2GOSOURCEDATE","BL2GOSOURCENAME","BL2GOSOURCEURL")
    value<- c(date(),"blast2GO","http://www.blast2go.de/")
    .addMeta(con, name, value)

    ## modify and then add back to map_metadata
    map_name <- c("GO2EG", "GO2ALLEGS")
    source_name <- c(rep("blast2GO",2))
    source_url <- c(rep("http://www.blast2go.de/",2))
    source_date <- c(rep(date(),2))
    .addMapMeta(con, map_name, source_name, source_url, source_date)

    ## then assemble the data back together to make it look like gene2go so
    ## it can be inserted there.
    rlen <- dim(vals)[1]
    data <-data.frame(tax_id = rep(tax_id, rlen),
                      gene_id = as.character(vals[["gene_id"]]),
                      go_id = vals[["go_id"]],
                      evidence = rep("IEA", rlen),
                      go_qualifier = rep("-", rlen),
                      go_description = rep("-", rlen),
                      pubmed_id = rep("-", rlen),
                      category = Ontology(vals[["go_id"]]),
                      stringsAsFactors=FALSE)
    data[,8] <- sub("BP","Process",data[,8])
    data[,8] <- sub("CC","Component",data[,8])
    data[,8] <- sub("MF","Function",data[,8])

    ## cleanup of file left behind by unzip() operation.
    if(file.exists(paste(tax_id,".annot",sep=""))){
        file.remove(paste(tax_id,".annot",sep=""))
    }
    ## return data
    data
}

## need to be able to generate the columns data from the files and the types.
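## Illustrative sketch (not part of the original source): the 'file' argument
## handled by .generateCols() and .generateTempTableINSERTStatement() below is
## a one-element named list, where the name is the NCBI file name and the
## value holds its column names, e.g.
##     file <- list("gene2pubmed.gz" = c("tax_id","gene_id","pubmed_id"))
## Assuming that input, .generateCols(file) should return
##     "tax_id INTEGER NOT NULL,gene_id INTEGER NOT NULL,pubmed_id INTEGER"
## and .generateTempTableINSERTStatement(file) should return
##     "INSERT INTO gene2pubmed VALUES(?,?,?);"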
.generateCols <- function (file) {
    types <- c("tax_id" = "INTEGER NOT NULL",
               "gene_id" = "INTEGER NOT NULL",
               "go_id" = "TEXT NOT NULL",
               "evidence" = "TEXT",
               "go_qualifier" = "TEXT",
               "go_description" = "TEXT",
               "pubmed_id" = "INTEGER",
               "category" = "TEXT",
               "status" = "TEXT",
               "rna_accession" = "TEXT",
               "rna_gi" = "INTEGER",
               "protein_accession" = "TEXT",
               "protein_gi" = "INTEGER",
               "genomic_dna_accession" = "TEXT",
               "genomic_dna_gi" = "INTEGER",
               "genomic_start" = "INTEGER",
               "genomic_end" = "INTEGER",
               "orientation" = "TEXT",
               "assembly" = "TEXT",
               "unigene_id" = "TEXT",
               "symbol" = "TEXT",
               "locus_tag" = "TEXT",
               "synonyms" = "TEXT",
               "dbXrefs" = "TEXT",
               "chromosome" = "TEXT",
               "map_location" = "TEXT",
               "description" = "TEXT",
               "gene_type" = "TEXT",
               "nomenclature_symbol" = "TEXT",
               "nomenclature_name" = "TEXT",
               "nomenclature_status" = "TEXT",
               "other_designations" = "TEXT",
               "mim_id" = "INTEGER NOT NULL",
               "relation_type" = "TEXT",
               "refseq_id" = "TEXT NOT NULL",
               "uniprot_id" = "TEXT NOT NULL"
               )
    cols <- character()
    for(i in seq_len(length(file[[1]]))){
        cols[i] <- paste(file[[1]][i],types[file[[1]]][i])
    }
    paste(cols, collapse =",")
}

## collapse the RNA and protein accession columns into a single 'accession'
## column, dropping missing values and version suffixes
.mergeAndCleanAccession <- function(ac){
    ## pair 1st col with 2nd, clean and rename
    acr <- ac[ac[,2]!="-",1:2]
    colnames(acr)[2] <- "accession"
    ## pair 1st col with 3rd, clean and rename
    acp <- ac[ac[,3]!="-",c(1,3)]
    colnames(acp)[2] <- "accession"
    ## merge these together
    ac <- rbind(acr,acp)
    ## Then clean off the period and trailing digits
    ac[,2] <- sub("\\.\\d+$","",ac[,2])
    ac
}

## also need something to generate the insert statement.
.generateTempTableINSERTStatement <- function (file) {
    Qs <- paste(rep("?",length(file[[1]])), collapse=",")
    paste("INSERT INTO ",sub(".gz$","",names(file)),
          " VALUES(",Qs,");",sep="")
}

## need code to generate a table for each of the downloaded files.
## 'file' below is like: files[1] or files[2]
.createTEMPNCBIBaseTable <- function (con, file, tax_id, NCBIFilesDir=NULL) {
    data <- .downloadData(file, tax_id, NCBIFilesDir=NCBIFilesDir)
    table <- sub(".gz$","",names(file))
    if(is.null(table)) stop("Unable to infer a table name.")
    cols <- .generateCols(file)
    sql<- paste("CREATE TABLE IF NOT EXISTS ",table," (",cols,");")
    sqliteQuickSQL(con, sql)

    message(paste("Populating ",table," table:", sep=""))
    sql <- .generateTempTableINSERTStatement(file)
    if(dim(data)[1] != 0){ ## ie. there has to be SOME data...
        .populateBaseTable(con, sql, data, table)
    }else if(dim(data)[1] == 0 && names(file)=="gene2go.gz"){
        ## we need blast2GO
        message(paste("Getting blast2GO data as a substitute for ",table,
                      sep=""))
        data <- .getBlast2GOData(tax_id, con)
        sql <- paste("INSERT INTO gene2go(tax_id, gene_id, go_id, evidence, ",
                     "go_qualifier, go_description,",
                     "pubmed_id, category) VALUES(?,?,?,?,?,?,?,?);")
        .populateBaseTable(con, sql, data, table)
    }else{
        message(paste("No data available for table ",table,sep=""))
    }
}

## loop through and make the tables
.makeBaseDBFromDLs <- function(files, tax_id, con, NCBIFilesDir=NULL){
    for(i in seq_len(length(files))){
        .createTEMPNCBIBaseTable(con, files[i], tax_id,
                                 NCBIFilesDir=NCBIFilesDir)
    }
    con
}

.dropOldTables <- function(con,fileNames){
    fileNames <- sub(".gz", "", fileNames)
    message(paste("dropping table",fileNames))
    for(i in seq_len(length(fileNames))){
        sqliteQuickSQL(con, paste("DROP TABLE IF EXISTS",fileNames[i]))
    }
}

.generateOrgDbName <- function(genus, species){
    ## e.g. "Homo" + "sapiens" becomes "org.Hsapiens.eg"
    specStr <- paste(toupper(substr(genus,1,1)),species,sep="")
    paste("org.",specStr,".eg",sep="")
}

.getGODate <- function(){
    GOinf <- dbInfo(GO_dbconn())
    GOinf[GOinf[["name"]]=="GOSOURCEDATE","value"]
}

## used when we want to know how many genes have a value for something
.computeSimpleEGMapCounts <- function(con, table, field){
    sql<- paste("SELECT count(DISTINCT g.gene_id) FROM ",table,
                " AS t, genes as g WHERE t._id=g._id AND t.",field,
                " NOT NULL", sep="")
    message(sql)
    sqliteQuickSQL(con,sql)
}

## when we want to know how many of something there is (reverse mapping?)
.computeSimpleMapCounts <- function(con, table, field){
    sql<- paste("SELECT count(DISTINCT t.",field,") FROM ",table,
                " AS t, genes as g WHERE t._id=g._id AND t.",field,
                " NOT NULL", sep="")
    message(sql)
    sqliteQuickSQL(con,sql)
}

## TODO: correct the counts as needed
.addMapCounts <- function(con, tax_id, genus, species){
    map_name <- c("GENENAME","SYMBOL","SYMBOL2EG","CHR","REFSEQ","REFSEQ2EG",
                  "UNIGENE","UNIGENE2EG","GO","GO2EG","GO2ALLEGS","ALIAS2EG",
                  "TOTAL")
    count <- c(.computeSimpleEGMapCounts(con, "gene_info", "gene_name"),
               .computeSimpleEGMapCounts(con, "gene_info", "symbol"),
               .computeSimpleMapCounts(con, "gene_info", "symbol"),
               .computeSimpleEGMapCounts(con, "chromosomes", "chromosome"),
               .computeSimpleEGMapCounts(con, "refseq", "accession"),
               .computeSimpleMapCounts(con, "refseq", "accession"),
               .computeSimpleEGMapCounts(con, "unigene", "unigene_id"),
               .computeSimpleMapCounts(con, "unigene", "unigene_id"),
               .computeSimpleEGMapCounts(con, "accessions", "accession"),
               .computeSimpleMapCounts(con, "accessions", "accession"),
               #GO ones
               .computeSimpleEGMapCounts(con, "alias", "alias_symbol"),
               sqliteQuickSQL(con,"SELECT count(DISTINCT gene_id) FROM genes")
               )
    data = data.frame(map_name,count,stringsAsFactors=FALSE)
    sql <- "INSERT INTO map_counts (map_name,count) VALUES(?,?)"
    .populateBaseTable(con, sql, data, "map_counts")
}

#########################################################################
## Generate the database using the helper functions:
#########################################################################
makeOrgDbFromNCBI <- function(tax_id, genus, species, NCBIFilesDir=NULL){
    require(RSQLite)
    require(GO.db)
    dbFileName <- paste(.generateOrgDbName(genus,species),".sqlite",sep="")
    if(file.exists(dbFileName)){ file.remove(dbFileName) }
    con <- dbConnect(SQLite(), dbFileName)
    .createMetadataTables(con) ## just makes the tables

    ## I need a list of files, along with their column names
    ## (needed for schema definitions later)
    ## IF ANY OF THESE get moved in the source files
    ## then things will be in the wrong place!
    files = list(
        "gene2pubmed.gz" = c("tax_id","gene_id", "pubmed_id"),
        "gene2accession.gz" = c("tax_id","gene_id","status","rna_accession",
            "rna_gi","protein_accession","protein_gi","genomic_dna_accession",
            "genomic_dna_gi","genomic_start","genomic_end","orientation",
            "assembly"),
        ## This one might be needed later
        "gene2refseq.gz" = c("tax_id","gene_id","status","rna_accession",
            "rna_gi","protein_accession","protein_gi","genomic_dna_accession",
            "genomic_dna_gi","genomic_start","genomic_end","orientation",
            "assembly"),
        "gene2unigene" = c("gene_id","unigene_id"),
        "gene_info.gz" = c("tax_id","gene_id","symbol","locus_tag",
            "synonyms","dbXrefs","chromosome","map_location","description",
            "gene_type","nomenclature_symbol","nomenclature_name",
            "nomenclature_status","other_designations", "modification_date"),
        ## "mim2gene.gz" = c("mim_id","gene_id","relation_type"),
        ## "gene_refseq_uniprotkb_collab.gz" = c("refseq_id","uniprot_id"),
        "gene2go.gz" = c("tax_id","gene_id","go_id","evidence",
            "go_qualifier", "go_description","pubmed_id","category")
        )

    .makeBaseDBFromDLs(files, tax_id, con, NCBIFilesDir=NCBIFilesDir)

    ## Add metadata:
    .addMetadata(con, tax_id, genus, species)
    ## Add map_metadata:
    .addMapMetadata(con, tax_id, genus, species)

    ## Make the central table
    egs <- sqliteQuickSQL(con, "SELECT distinct gene_id FROM gene_info")[,1]
    .makeCentralTable(egs, con)

    ## Make the other tables:
    ## gene_info
    symbs <- sqliteQuickSQL(con,
        "SELECT distinct gene_id, description, symbol FROM gene_info")
    colnames(symbs) <- c("gene_id", "gene_name", "symbol")
    ## constraint requires that we always have BOTH a symbol and a name.
    symbs <- symbs[!is.na(symbs[,2]) & !is.na(symbs[,3]),] # strict constraint!
    ## Only 10 entries fail this strict constraint for human, all because they
    ## have no gene symbol (though they do have a name).
    ## TODO: fix this so that I can make this table without
    ## such a strict constraint
    .makeSimpleTable(symbs, table="gene_info_temp", con,
                     fieldNameLens=c(255,80))

    ## alias
    ## requires sub-parsing.
    alias <- sqliteQuickSQL(con,
        "SELECT distinct gene_id, synonyms FROM gene_info")
    aliases <- sapply(alias[,2], strsplit, "\\|")
    numAlias <- sapply(aliases, length)
    alGenes <- rep(alias[,1], numAlias)
    alias <- data.frame(gene_id=alGenes,alias_symbol=unlist(aliases))
    .makeSimpleTable(alias, table="alias", con)

    ## chr
    chrs <- sqliteQuickSQL(con,
        "SELECT distinct gene_id, chromosome FROM gene_info")
    .makeSimpleTable(chrs, table="chromosomes", con)

    ## pubmed
    pm <- sqliteQuickSQL(con,
        "SELECT distinct gene_id, pubmed_id FROM gene2pubmed")
    .makeSimpleTable(pm, table="pubmed", con)

    ## refseq
    ## requires sub-parsing.
    rs <- sqliteQuickSQL(con,
        "SELECT distinct gene_id,rna_accession,protein_accession FROM gene2refseq")
    rs <- .mergeAndCleanAccession(rs)
    .makeSimpleTable(rs, table="refseq", con)

    ## accessions
    ac <- sqliteQuickSQL(con,
        paste("SELECT distinct gene_id,rna_accession,protein_accession",
              "FROM gene2accession"))
    ac <- .mergeAndCleanAccession(ac)
    .makeSimpleTable(ac, table="accessions", con)

    ## unigene
    ug <- sqliteQuickSQL(con,
        "SELECT distinct g.gene_id, u.unigene_id FROM gene2unigene as u, gene_info as g WHERE u.gene_id=g.gene_id")
    .makeSimpleTable(ug, table="unigene", con)

    ## Make the GO tables:
    .makeGOTablesFromNCBI(con)

    ## Drop all the older tables (which will include the original "gene_info").
    .dropOldTables(con,names(files))
    ## Rename "gene_info_temp" to be just "gene_info":
    sqliteQuickSQL(con,"ALTER TABLE gene_info_temp RENAME TO gene_info")
    ## add GO views to the DB
    makeGOViews(con)

    ## Add map_counts: (must happen at end)
    .addMapCounts(con, tax_id, genus, species)
}


## For argument checking:
.isSingleString <- function(x){
    is.atomic(x) && length(x) == 1L && is.character(x)
}
.isSingleStringOrNA <- function(x) {
    is.atomic(x) && length(x) == 1L && (is.character(x) || is.na(x))
}
.isSingleStringOrNull <- function(x) {
    (is.atomic(x) && length(x) == 1L && is.character(x)) ||
    (is.atomic(x) && length(x) == 0L && is.null(x))
}

## function to make the package:
makeOrgPackageFromNCBI <- function(version, maintainer, author,
                                   outputDir = ".", tax_id, genus, species,
                                   NCBIFilesDir=NULL){
    ## Argument checking:
    if(!.isSingleString(version))
        stop("'version' must be a single string")
    if(!.isSingleString(maintainer))
        stop("'maintainer' must be a single string")
    if(!.isSingleString(author))
        stop("'author' must be a single string")
    if(outputDir!="." && file.access(outputDir)[[1]]!=0){
        stop("Selected outputDir '", outputDir,"' does not exist.")
    }
    if(!.isSingleString(tax_id))
        stop("'tax_id' must be a single string")
    if(!.isSingleString(genus))
        stop("'genus' must be a single string")
    if(!.isSingleString(species))
        stop("'species' must be a single string")
    if(!.isSingleStringOrNull(NCBIFilesDir))
        stop("'NCBIFilesDir' argument needs to be a single string or NULL")

    ## 'outputDir' is not passed to makeOrgDbFromNCBI(). Hence the db file
    ## is always created in ".". Maybe that could be revisited.
    makeOrgDbFromNCBI(tax_id=tax_id, genus=genus, species=species,
                      NCBIFilesDir=NCBIFilesDir)

    dbName <- .generateOrgDbName(genus,species)
    dbfile <- paste(dbName, ".sqlite", sep="")

    seed <- new("AnnDbPkgSeed",
                Package= paste(dbName,".db",sep=""),
                Version=version,
                Author=author,
                Maintainer=maintainer,
                PkgTemplate="ORGANISM.DB",
                AnnObjPrefix=dbName,
                organism = paste(genus, species),
                species = paste(genus, species),
                biocViews = "annotation",
                manufacturerUrl = "no manufacturer",
                manufacturer = "no manufacturer",
                chipName = "no manufacturer")

    makeAnnDbPkg(seed, dbfile, dest_dir=outputDir)

    ## cleanup
    file.remove(dbfile)
}
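## Usage sketch (illustrative only; the argument values below are hypothetical
## and the call downloads data from NCBI, so it is left commented out):
##
## makeOrgPackageFromNCBI(version = "0.1",
##                        maintainer = "Some One <so@someplace.org>",
##                        author = "Some One <so@someplace.org>",
##                        outputDir = ".",
##                        tax_id = "59729",
##                        genus = "Taeniopygia",
##                        species = "guttata")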