## This makes the special genes table of an EG DB. .makeGenesTable <- function(genes, con){ message("Populating genes table:") sql<- paste(" CREATE TABLE IF NOT EXISTS genes ( _id INTEGER PRIMARY KEY, GID VARCHAR(10) NOT NULL UNIQUE -- Gene ID );") sqliteQuickSQL(con, sql) geneid <- data.frame(genes) ## TODO: data.frame() necessary??? sql<- paste("INSERT INTO genes(GID) VALUES(?);") dbBeginTransaction(con) dbGetPreparedQuery(con, sql, geneid) dbCommit(con) sqliteQuickSQL(con, "CREATE INDEX IF NOT EXISTS genes__id_ind ON genes (_id)") sqliteQuickSQL(con, "CREATE INDEX IF NOT EXISTS genes_GID_ind ON genes (GID)") message("genes table filled") } .makeTable <- function(data, table, con, fieldNameLens=25){ ## indFields tracks the things to index (not GID but _id, plus the rest) indFields <- c(names(data)[!(names(data) %in% "GID")],"_id") message(paste("Populating",table,"table:")) tableFieldLines <- paste(paste(names(data)[-1]," VARCHAR(", fieldNameLens,") NOT NULL, -- data"), collapse="\n ") ## For temp table, lets do it like this: if(dim(data)[1] == 0){ ## if we don't have anything to put into the table, then we don't even ## want to make a table. warning(paste("no values found for table ",table, " in this data chunk.", sep="")) ## Create our real table. .makeEmptySimpleTable(con, table, tableFieldLines) return() }else{ dbWriteTable(con, "temp", data, row.names=FALSE) ## Create our real table. .makeEmptySimpleTable(con, table, tableFieldLines) selFieldLines <- paste(paste("t.",names(data)[-1],sep=""),collapse=",") sql<- paste0(" INSERT INTO ",table," SELECT g._id as _id, ",selFieldLines," FROM genes AS g, temp AS t WHERE g.GID=t.GID ORDER BY g._id;") sqliteQuickSQL(con, sql) ## Add index to all fields in indFields (default is all) for(i in seq_len(length(indFields))){ sqliteQuickSQL(con, paste0("CREATE INDEX IF NOT EXISTS ", table,"_",indFields[i],"_ind ON ",table, " (",indFields[i],");")) } ## drop the temp table sqliteQuickSQL(con, "DROP TABLE temp;") } message(paste(table,"table filled")) } ## helper to add most basic of metadata .addEssentialMetadata <- function(con, tax_id, genus, species, schema="NOSCHEMA_DB", type="OrgDb", centralID="GID"){ name <- c("DBSCHEMAVERSION","DBSCHEMA","ORGANISM","SPECIES","CENTRALID", "TAXID", "Db type","Supporting package") value<- c("2.1",schema,paste(genus,species),paste(genus,species), centralID,tax_id, type,"AnnotationDbi") AnnotationForge:::.addMeta(con, name, value) } .addOntologyData <- function(data){ ## And then make the new ones. require(GO.db) goOnts <- select(GO.db, as.character(data$GO), "ONTOLOGY") if(dim(goOnts)[1] == dim(data)[1]){ data <- cbind(data, goOnts) }else{stop("Ontology should be 1:1 with GOIDs")} ## and re-shuffle data[,c("GID","GO","EVIDENCE","ONTOLOGY")] } ## helper to prepare/filter data for two GO tables. .makeNewGOTables <- function(con, goTable, goData){ ## So 1st drop the old go table sqliteQuickSQL(con, paste0("DROP TABLE ",goTable,";")) ## add Ontologies data goData <- .addOntologyData(goData) ## now filter that for terms that are "too new" message("Dropping GO IDs that are too new for the current GO.db") goData <- goData[goData[["GO"]] %in% Lkeys(GOTERM),] ## Then make the 1st table .makeTable(goData, "go", con=con) ## Then prepare data for the 2nd (go_all) table gbp <- goData[goData$ONTOLOGY=="BP",c("GID","GO","EVIDENCE")] gcc <- goData[goData$ONTOLOGY=="CC",c("GID","GO","EVIDENCE")] gmf <- goData[goData$ONTOLOGY=="MF",c("GID","GO","EVIDENCE")] names(gbp) <- names(gcc) <- names(gmf) <- c("gene_id","go_id","evidence") ## then recycle older expand function bpAll <- .expandGOFrame(gbp, GOBPANCESTOR) mfAll <- .expandGOFrame(gmf, GOMFANCESTOR) ccAll <- .expandGOFrame(gcc, GOCCANCESTOR) ## then combine goAllData <- rbind(bpAll,ccAll,mfAll) names(goAllData) <- c("GID","GO","EVIDENCE") goAllData <- .addOntologyData(goAllData) names(goAllData) <- c("GID","GOALL","EVIDENCEALL","ONTOLOGYALL") ## Then make the 2nd table .makeTable(goAllData, "go_all", con=con) } ## function to put together the database. ## This takes a named list of data.frames. makeOrgDbFromDataFrames <- function(data, tax_id, genus, species, dbFileName, goTable){ ## set up DB connection require(RSQLite) if(file.exists(dbFileName)){ file.remove(dbFileName) } con <- dbConnect(SQLite(), dbFileName) AnnotationForge:::.createMetadataTables(con) ## TODO: why can't I drop these (investigate) # sqliteQuickSQL(con, "DROP TABLE map_counts;") # sqliteQuickSQL(con, "DROP TABLE map_metadata;") ## gather all GIDs together and make the genes table genes <- unique(unlist(unname(lapply(data, "[", 'GID')))) AnnotationForge:::.makeGenesTable(genes, con) ## Then do each data.frame in turn mapply(FUN=.makeTable, data, names(data), MoreArgs=list(con=con)) ## Add metadata but keep it very basic AnnotationForge:::.addEssentialMetadata(con, tax_id, genus, species) ## when we have a goTable, we make special GO tables if(goTable %in% names(data)){ ## Extra checks for go table (when specified) goData <- data[[goTable]] if(!all(names(goData) == c("GID", "GO", "EVIDENCE"))) stop("'goTable' must have three columns called 'GID','GO' and 'EVIDENCE'") if(any(!grepl("^GO:", as.character(goData$GO)))) stop("'goTable' GO Ids must be formatted like 'GO:XXXXXXX'") .makeNewGOTables(con, goTable, goData) } } ## TODO: change the function so it takes ... instead of list (so that ## the arguments are named). This can be switched from the list later ## by just trapping what is in ... and passing it on as a named list. ## like this: data <- list(...) ## TODO: this should return the path to the created dir so that the ## user can just call install.packages(on that filepath). ## function to make the package: makeOrgPackage <- function(..., version, maintainer, author, outputDir = ".", tax_id, genus, species, goTable=NA){ ## get all the arguments into a list data <- list(...) ## drop any rownames on all data.frames data <- lapply(data, function(x){rownames(x) <- NULL; x}) ## Data has to meet some strict criteria. ## check that it's a list of data.frames. dataClasses <- unique(sapply(data, class)) if(!is.list(data) || dataClasses!="data.frame") stop("'...' must be a named set of data.frame objects") ## There must be unique names for each data.frame. ( if(length(unique(names(data))) != length(data)) stop("All elements of '...' must be a named") ## None of the list names is allowed to be "genes", "metadata" blackListedNames <- c("genes","metadata") if(any(names(data) %in% blackListedNames)) stop("'genes' and 'metadata' are reserved. Please choose different names for elements of '...'.") ## The data.frames should NOT be allowed to have missing/redundant rows... lengthsUni <- sapply(data, function(x){dim(unique(x))[1]}) lengthsRaw <- sapply(data, function(x){dim(x)[1]}) if(any(lengthsUni != lengthsRaw)) stop("data.frames passed in to '...' should not contain redundant rows") ## There must be colnames for each data.frame.(they must each be ## unique) .noGID <- function(x){x[!(x %in% "GID")]} colnamesUni <- unique(.noGID(unlist(sapply(data, colnames)))) colnamesAll <- .noGID(unlist(sapply(data, colnames))) names(colnamesAll) <- NULL if(any(colnamesUni != colnamesAll)) stop("data.frames should have completely unique names for all fields that are not the primary gene id 'GID'") ## The 1st column of each data.frame must be a gene ID (GID) colnameGIDs <- sapply(data, function(x){colnames(x)[1]}) if(any(colnameGIDs != "GID")) stop("The 1st column must always be the gene ID 'GID'") ## The GID columns all have to be the same type. GIDCols <- unique(sapply(data, function(x){class(x[["GID"]])})) if(length(GIDCols) >1) stop("The type of data in the 'GID' columns must be the same for all data.frames") ## check other arguments if(!.isSingleString(version)) stop("'version' must be a single string") if(!.isSingleString(maintainer)) stop("'maintainer' must be a single string") if(!.isSingleString(author)) stop("'author' must be a single string") if(outputDir!="." && file.access(outputDir)[[1]]!=0){ stop("Selected outputDir '", outputDir,"' does not exist.")} if(!.isSingleString(tax_id)) stop("'tax_id' must be a single string") if(!.isSingleString(genus)) stop("'genus' must be a single string") if(!.isSingleString(species)) stop("'species' must be a single string") if(!.isSingleStringOrNA(goTable)) stop("'goTable' argument needs to be a single string or NA") if(!is.na(goTable) && !(goTable %in% names(data))) stop("When definined, 'goTable' needs to be a table name from the named data.frames passed in to '...'") ## generate name from the genus and species dbName <- .generateOrgDbName(genus,species) ## this becomes the file name for the DB dbFileName <- file.path(outputDir,paste0(dbName, ".sqlite")) ## Then make the DB makeOrgDbFromDataFrames(data, tax_id, genus, species, dbFileName, goTable) seed <- new("AnnDbPkgSeed", Package= paste0(dbName,".db"), Version=version, Author=author, Maintainer=maintainer, PkgTemplate="NOSCHEMA.DB", AnnObjPrefix=dbName, organism = paste(genus, species), species = paste(genus, species), biocViews = "annotation", manufacturerUrl = "no manufacturer", manufacturer = "no manufacturer", chipName = "no manufacturer") makeAnnDbPkg(seed, dbFileName, dest_dir=outputDir) ## cleanup message("Now deleting temporary database file") file.remove(dbFileName) ## return the path to the dir that was just created. file.path(outputDir,paste0(dbName,".db")) }