## Copyright 2013-2021 Ramon Diaz-Uriarte ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## You should have received a copy of the GNU General Public License ## along with this program. If not, see <http://www.gnu.org/licenses/>. ## Say which are drivers: populate the drv vector. ## // In R, the user says which are the drivers. If does not say anuthing, ## // the default (NULL) then drivers are all in poset, epist, restrict. The ## // user can pass a vector with the names of the genes (not modules). Allow ## // also for empty, so this is faster if not needed. And check that if we ## // use a stopping rule based on drivers that drv vectors is not empty. ## - Say that a user can use a "0" as a gene name, but that is BAD idea. ## - Modules and order effects can be kind of funny? ## Gene names can contain no spaces, commas, or ">" check.gm <- function(gm) { ## Yes, Root: we want no ambiguities. ## Actually, that sucks. So we do not require it, but check for ## consistency. if(any(gm == "Root") && (gm[1] != "Root") ) stop("If Root is in the module table, it must be the first") if(any(names(gm) == "Root") && (names(gm)[1] != "Root") ) stop("If the name Root is in the module table, it must be the first") if( (names(gm)[1] == "Root") && (gm[1] != "Root") ) stop("The name Root is in the module table, but is not of Root") if( (gm[1] == "Root") && (names(gm)[1] != "Root") ) stop("Root is in the module table, but with a different name") ## if(gm[1] != "Root") ## stop("First value of a module table must be Root") ## if(names(gm)[1] != "Root") ## stop("First name of a module table must be Root") if(length(unique(names(gm))) != length(gm)) stop("Number of unique module names different from length of vector") if(gm[1] != "Root") gm <- c("Root" = "Root", gm) return(gm) } gtm2 <- function(x) { data.frame(cbind(nice.vector.eo(x, ","), x), stringsAsFactors = TRUE) } nice.vector.eo <- function(z, sep, rm.sign = FALSE) { ## with epistasis, maybe we want sorted? if(! rm.sign) return(setdiff(unlist(lapply(strsplit(z, " "), function(u) strsplit(u, sep))), "")) else ## remove the " ", the -, and the sep return(setdiff(unlist(lapply(strsplit(z, " "), function(u) strsplit(unlist(strsplit(u, "-")), sep))), "")) } gm.to.geneModuleL <- function(gm, one.to.one) { ## the table will be sorted by gene name gm <- check.gm(gm) ## the named vector with the mapping into the long geneModule df geneMod <- as.data.frame(rbindlist(lapply(gm, gtm2)), stringsAsFactors = TRUE) ## this stringsAsFactors is key geneMod$Module <- names(gm)[geneMod[, 2]] ## reverse lookup table colnames(geneMod)[1] <- c("Gene") geneMod <- geneMod[, -2] geneMod$Gene <- as.character(geneMod$Gene) ## geneMod$Module <- as.character(geneMod$Module) ## already a char geneMod <- geneMod[c(1, order(geneMod$Gene[-1]) + 1), ] geneMod$GeneNumID <- 0:(nrow(geneMod) - 1) ## this assumes sorted! and they need not be ## rlem <- rle(geneMod$Module) ## geneMod$ModuleNumID <- rep( 0:(length(rlem$values) - 1), rlem$lengths) if(one.to.one) { geneMod$ModuleNumID <- geneMod$GeneNumID if(!(identical(geneMod$Gene, geneMod$Module))) stop("Impossible: we are supposed to be in one-to-one for Module-Gene.") } else { ## Works, but does not give the most ordered module names. But ## keeps implicit order given by user. idm <- seq.int(length(gm) - 1) idm <- c("Root" = 0L, idm) names(idm) <- names(gm) ## This sorts by the names but is not optimal either ## idm1 <- seq.int(length(gm) - 1) ## idm <- c(0L, idm1) ## names(idm) <- c("Root", sort(setdiff(names(gm), "Root"))) geneMod$ModuleNumID <- idm[geneMod[, "Module"]] } if(length(unique(geneMod$Gene)) != nrow(geneMod)) { stop("Are there identical gene names in different modules?") } ## I think this is unreacheable now. Caught earlier. if(length(unique(geneMod$GeneNumID)) != nrow(geneMod)) { stop("Are there identical gene names in different modules?") } rownames(geneMod) <- 1:nrow(geneMod) geneMod } geneModuleNull <- function(namesM) { v <- c("Root", setdiff(namesM, "Root")) names(v) <- v return(v) } list.of.deps <- function(x) { ## lookupTypeDep <- c("MN" = 1, "monotone" = 1, ## "SM" = 2, "semimonotone" = 2) lookupTypeDep <- c("MN" = "monotone", "AND" = "monotone", "monotone" = "monotone", "CMPN" = "monotone", "OR" = "semimonotone", "SM" = "semimonotone", "semimonotone" = "semimonotone", "DMPN" = "semimonotone", "XOR" = "xmpn", "xmpn" = "xmpn", "XMPN" = "xmpn", "--" = "--", "-" = "--") ## FIXME: check values of typeDep if(length(x) > 1) { if(length(unique(x$s))!= 1) stop("Not all s identical within a child") if(length(unique(x$sh))!= 1) stop("Not all sh identical within a child") if(length(unique(x$typeDep))!= 1) stop("Not all typeDep identical within a child") if(length(unique(x$child))!= 1) stop("child not unique") } typeDep <- lookupTypeDep[unique(x$typeDep)] if(any(is_null_na(typeDep))) stop("typeDep value incorrect. Check spelling.") return(list( child = unique(x$child), s = unique(x$s), sh = unique(x$sh), typeDep = typeDep, parents = unlist(x$parent))) } to.long.rt <- function(rt, idm) { ## We now do this inconditionally, so that we do not need to use the if(!("Root" %in% rt$parent)) stop("Root must be one parent node") srt <- rt[order(rt$child), ] long.rt <- lapply(split(srt, srt$child), list.of.deps) ## idm is just a look up table for the id of the module ## add integer IDs addIntID <- function(z, idm) { z$childNumID <- idm[z$child] z$parentsNumID <- idm[z$parents] if( any(is.na(z$parentsNumID)) || any(is.na(z$childNumID)) ) { ## I think this can no longer be reached ever. Caught before. stop(paste("An ID is NA:", "Is a gene part of two different modules?", "(That includes being by itself and part", "of a module.)")) } ## These checks could be somewhere else instead of here. if(length(unique(z$parentsNumID)) != length(z$parentsNumID)) stop("No node can have the same parent multiple times") if(length(unique(z$parents)) != length(z$parents)) stop("No node can have the same parent multiple times") if(length(z$child) > 1) stop("Child nodes can have one, and only one, member") ## sort the parents, always. o <- order(z$parentsNumID) z$parentsNumID <- z$parentsNumID[o] z$parents <- as.character(z$parents[o]) return(z) } long.rt <- lapply(long.rt, function(x) addIntID(x, idm = idm)) return(long.rt) } epist.order.element <- function(x, y, sep, rm.sign = FALSE) { list(ids = nice.vector.eo(x, sep = sep, rm.sign = rm.sign), s = y) } oe.to.df <- function(x) { ma <- matrix(ncol = 2, nrow = 1 + length(x) - 2) if(length(x) == 2) { ma[1, 1] <- x[1] ma[1, 2] <- x[2] } else { ma[, 1] <- x[-length(x)] ma[, 2] <- x[-1] } return(data.frame(ma, stringsAsFactors = FALSE)) } epist.order.to.pairs.modules <- function(x, sep, rm.sign = TRUE) { ## We discard, do not even accept, the coefficient tmp <- epist.order.element(x, -99, sep = sep, rm.sign = rm.sign)$ids if(length(tmp) > 1) { ## if a single gene, as when we specify all genotypes, we do not ## want this if(sep == ":") return(data.frame(combinations(n = length(tmp), r = 2, v = tmp), stringsAsFactors = FALSE)) else if(sep == ">") { return(oe.to.df(tmp)) } } } to.pairs.modules <- function(x, sep, rm.sign = TRUE) { df <- data.frame(rbindlist( lapply(names(x), function(z) epist.order.to.pairs.modules(z, sep))), stringsAsFactors = TRUE) if(nrow(df) == 0L) { ## if only single genes in epist, we get nothing here. return(df) } else { colnames(df) <- c("parent", "child") if(sep == ":") df$typeDep <- "epistasis" else if(sep == ">") df$typeDep <- "orderEffect" return(unique(df)) } } to.long.epist.order <- function(epor, sep, rm.sign = FALSE) { ## just vectors for now long <- Map(function(x, y) epist.order.element(x, y, sep, rm.sign), names(epor), epor) names(long) <- NULL return(long) } addIntID.epist.order <- function(z, idm, sort, sign) { if( sort && (!sign)) warning("sort is TRUE but sign is FALSE. You sure?") if((!sort) && sign) warning("sort is FALSE but sign is TRUE. You sure?") ## Adds the integer, but takes care of sign if sign = TRUE if(!sign) { z$NumID <- idm[z$ids] signs <- grep("^-", z$ids) if(length(signs)) stop("You have a negative sign and you said sign = FALSE") } else { unsigned <- setdiff(unlist(lapply(z$ids, function(z) strsplit(z, "^-"))), "") NumID <- idm[unsigned] signs <- grep("^-", z$ids) if(length(signs)) { NumID[signs] <- -1 * NumID[signs] } z$NumID <- NumID } if(sort) { ## Essential for epistasis, but never do for order effects o <- order(z$NumID) z$NumID <- z$NumID[o] z$ids <- z$ids[o] } if(length(unique(z$NumID)) != length(z$NumID)) stop("No node can have the same id multiple times") if(length(unique(z$ids)) != length(z$ids)) stop("No node can have the same id multiple times") return(z) } checkRT <- function(mdeps) { if(ncol(mdeps) != 5) stop("mdeps must be of exactly 5 columns") if(!identical(colnames(mdeps), c("parent", "child", "s", "sh", "typeDep"))) stop(paste("Column names of mdeps not of appropriate format. ", "Should be parent, child, s, sh, typeDep")) } getNamesID <- function(fp) { ## Return a lookup table for names based on numeric IDs idname <- c(fp$geneModule$GeneNumID, fp$long.geneNoInt$GeneNumID, fp$fitnessLandscape_gene_id$GeneNumID) names(idname) <- c(fp$geneModule$Gene, fp$long.geneNoInt$Gene, fp$fitnessLandscape_gene_id$Gene) return(idname[-1]) ## remove Root!! } getGeneIDNum <- function(geneModule, geneNoInt, fitnessLandscape_gene_id, drv, sort = TRUE) { ## It returns the genes, as NumID, in the given vector with names drv ## initMutant uses this, for simplicity, without sorting, but noInt ## are always sorted ## Also used for the drivers with sort = TRUE ## Yes, we must do it twice because we do not know before hand which ## is which. This makes sure no NA. Period. if(any(is.na( match(drv, c(geneModule$Gene, geneNoInt$Gene, fitnessLandscape_gene_id$Gene))))) { stop(paste("For driver or initMutant you have passed genes", "not in the fitness table.")) } indicesM <- as.vector(na.omit(match( drv, geneModule$Gene))) indicesI <- as.vector(na.omit(sort(match( drv, geneNoInt$Gene)))) indicesF <- as.vector(na.omit(sort(match( drv, fitnessLandscape_gene_id$Gene)))) if(sort) { indicesM <- sort(indicesM) } return(c( geneModule$GeneNumID[indicesM], geneNoInt$GeneNumID[indicesI], fitnessLandscape_gene_id$GeneNumID[indicesF]) ) } ## genotFitnes and frequency type -> fitnesLanscapeVariables for FDF and ## fitness with numbers, not names ## Done in a single function since both operations make ## the same assumptions create_flvars_fitvars <- function(genotFitness, frequencyType) { x <- genotFitness[, -ncol(genotFitness), drop = FALSE] pasted <- apply(x, 1, function(z) paste(sort(which(z == 1)), collapse = "_")) npasted <- apply(x, 1, function(z) paste(sort(colnames(x)[which(z == 1)]), collapse = "_")) if(frequencyType == "abs") { prefix <- "n_" prefixre <- "^n_" } else { prefix <- "f_" prefixre <- "^f_" } flvars <- paste0(prefix, pasted) names(flvars) <- npasted ## make sure we get f_1_2 and not f_2_1, etc flvars2 <- flvars names(flvars2) <- paste0(prefix, names(flvars)) rmwt <- which(flvars2 == prefix) if(length(rmwt)) flvars2 <- flvars2[-rmwt] ## rm this. ## Need to rev the vector, to ensure larger patterns come first ## and to place "f_" as last. rflvars2 <- rev(flvars2) count_seps <- stringr::str_count(rflvars2, stringr::fixed("_")) if(any(diff(count_seps) > 0)) { warning("flvars not ordered?", "Check the conversion of gene names to numbers in fitness spec") rflvars2 <- rflvars2[order(count_seps, decreasing = TRUE)] } ## Users can pass in many possible orderings. Get all. full_rflvars <- all_orders_fv(rflvars2, prefix, prefixre) Fitness_as_fvars <- stringr::str_replace_all(genotFitness$Fitness, stringr::fixed(full_rflvars)) return(list(flvars = flvars, Fitness_as_fvars = Fitness_as_fvars)) } ## named vector, prefixes -> all possible combinations of the name all_orders_fv <- function(x, prefix, prefixre) { f1 <- function(zz, prefix, prefixre) { z <- gsub(prefixre, "", names(zz)) spl <- strsplit(z, "_")[[1]] if(length(spl) <= 1) { return(zz) } else { pp <- gtools::permutations(length(spl), length(spl), spl) ppo <- apply(pp, 1, function(u) paste(u, collapse = "_")) ppoo <- rep(zz, length(ppo)) names(ppoo) <- paste0(prefix, ppo) return(ppoo) } } ## I cannot use lapply, as it strips the names out <- vector(mode = "character", length = 0) for(i in 1:length(x)) { out <- c(out, f1(x[i], prefix, prefixre)) } return(out) } fVariablesN <- function (g, frequencyType) { if (is.null(g) | g == 0) stop("Number of genes must be integer > 0") combinationsList <- list() for (i in 0:g) { combinationsList <- append(combinationsList, combn(1:g, i, list, simplify = TRUE)) } if (frequencyType == "abs"){ fsVector <-sapply(sapply(combinationsList, function(x) paste0(x, collapse = "_")), function(x) paste0("n_", x)) }else{ fsVector <-sapply(sapply(combinationsList, function(x) paste0(x, collapse = "_")), function(x) paste0("f_", x)) } return (fsVector) } allFitnessORMutatorEffects <- function(rT = NULL, epistasis = NULL, orderEffects = NULL, noIntGenes = NULL, geneToModule = NULL, drvNames = NULL, keepInput = TRUE, genotFitness = NULL, ## refFE = NULL, calledBy = NULL, frequencyDependentFitness = FALSE, frequencyType = "freq_dep_not_used"){ #spPopSizes = NULL) { ## From allFitnessEffects. Generalized so we deal with Fitness ## and mutator. ## restrictions: the usual rt ## epistasis: as it says, with the ":" ## orderEffects: the ">" ## All of the above can be genes or can be modules (if you pass a ## geneToModule) ## rest: rest of genes, with fitness ## For epistasis and order effects we create the output object but ## missing the numeric ids of genes. With rT we do it in one go, as we ## already know the mapping of genes to numeric ids. We could do the ## same in epistasis and order, but we would be splitting twice ## (whereas for rT extracting the names is very simple). ## called appropriately? if( !(calledBy %in% c("allFitnessEffects", "allMutatorEffects") )) stop("How did you call this function?. Bug.") if(calledBy == "allMutatorEffects") { ## very paranoid check if( !is.null(rT) || !is.null(orderEffects) || !is.null(drvNames) || !is.null(genotFitness)) stop("allMutatorEffects called with forbidden arguments.", "Is this an attempt to subvert the function?") } if(!frequencyDependentFitness) { rtNames <- NULL epiNames <- NULL orNames <- NULL if(!is.null(rT)) { ## This is really ugly, but to prevent the stringsAsFactors I need it here: rT$parent <- as.character(rT$parent) rT$child <- as.character(rT$child) rT$typeDep <- as.character(rT$typeDep) rtNames <- unique(c(rT$parent, rT$child)) } #if(!is.null(spPopSizes)) #warning("spPopSizes will be considered NULL if frequencyDependentFitness = FALSE") if(!is.null(epistasis)) { long.epistasis <- to.long.epist.order(epistasis, ":") ## epiNames <- unique(unlist(lapply(long.epistasis, function(x) x$ids))) ## deal with the possible negative signs epiNames <- setdiff(unique( unlist(lapply(long.epistasis, function(x) lapply(x$ids, function(z) strsplit(z, "^-"))))), "") } else { long.epistasis <- list() } if(!is.null(orderEffects)) { long.orderEffects <- to.long.epist.order(orderEffects, ">") orNames <- unique(unlist(lapply(long.orderEffects, function(x) x$ids))) } else { long.orderEffects <- list() } allModuleNames <- unique(c(rtNames, epiNames, orNames)) if(is.null(geneToModule)) { gMOneToOne <- TRUE geneToModule <- geneModuleNull(allModuleNames) } else { gMOneToOne <- FALSE if(any(is.na(match(setdiff(names(geneToModule), "Root"), allModuleNames)))) stop(paste("Some values in geneToModule not present in any of", " rT, epistasis, or order effects")) if(any(is.na(match(allModuleNames, names(geneToModule))))) stop(paste("Some values in rT, epistasis, ", "or order effects not in geneToModule")) } geneModule <- gm.to.geneModuleL(geneToModule, one.to.one = gMOneToOne) idm <- unique(geneModule$ModuleNumID) names(idm) <- unique(geneModule$Module) if(!is.null(rT)) { checkRT(rT) long.rt <- to.long.rt(rT, idm) } else { long.rt <- list() ## yes, we want an object of length 0 } ## Append the numeric ids to epistasis and order if(!is.null(epistasis)) { long.epistasis <- lapply(long.epistasis, function(x) addIntID.epist.order(x, idm, sort = TRUE, sign = TRUE)) } if(!is.null(orderEffects)) { long.orderEffects <- lapply(long.orderEffects, function(x) addIntID.epist.order(x, idm, sort = FALSE, sign = FALSE)) } if(!is.null(noIntGenes)) { if(inherits(noIntGenes, "character")) { wm <- paste("noIntGenes is a character vector.", "This is probably not what you want, and will", "likely result in an error downstream.", "You can get messages like", " 'not compatible with requested type', and others.", "We are stopping.") stop(wm) } mg <- max(geneModule[, "GeneNumID"]) gnum <- seq_along(noIntGenes) + mg if(!is.null(names(noIntGenes))) { ng <- names(noIntGenes) if( any(grepl(",", ng, fixed = TRUE)) || any(grepl(">", ng, fixed = TRUE)) || any(grepl(":", ng, fixed = TRUE)) ) stop("The name of some noIntGenes contain a ',' or a '>' or a ':'") if(any(ng %in% geneModule[, "Gene"] )) stop("A gene in noIntGenes also present in the other terms") if(any(duplicated(ng))) stop("Duplicated gene names in geneNoInt") if(any(is.na(ng))) stop("In noIntGenes some genes have names, some don't.", " Name all of them, or name none of them.") } else { ng <- gnum } geneNoInt <- data.frame(Gene = as.character(ng), GeneNumID = gnum, s = noIntGenes, stringsAsFactors = FALSE) } else { geneNoInt <- data.frame() } if(is.null(genotFitness)) { genotFitness <- matrix(NA, nrow = 0, ncol = 1) fitnessLandscape_df <- data.frame() fitnessLandscape_gene_id <- data.frame() } else { ## Yes, I am duplicating stuff for now. ## This makes life simpler in C++: ## In the map, the key is the genotype name, as ## cnn <- colnames(genotFitness)[-ncol(genotFitness)] cnn <- 1:(ncol(genotFitness) - 1) gfn <- apply(genotFitness[, -ncol(genotFitness), drop = FALSE], 1, function(x) paste(cnn[as.logical(x)], collapse = ", ")) ## rownames(genotFitness) <- gfn fitnessLandscape_df <- data.frame(Genotype = gfn, Fitness = genotFitness[, ncol(genotFitness)], stringsAsFactors = FALSE) fitnessLandscape_gene_id <- data.frame( Gene = colnames(genotFitness)[-ncol(genotFitness)], GeneNumID = cnn, stringsAsFactors = FALSE) } if( (length(long.rt) + length(long.epistasis) + length(long.orderEffects) + nrow(geneNoInt) + nrow(genotFitness)) == 0) stop("You have specified nothing!") if(calledBy == "allFitnessEffects") { if((length(long.rt) + length(long.epistasis) + length(long.orderEffects)) > 1) { graphE <- fitnessEffectsToIgraph(rT, epistasis, orderEffects) } else { graphE <- NULL } } else { graphE <- NULL } if(!is.null(drvNames)) { drv <- unique(getGeneIDNum(geneModule, geneNoInt, fitnessLandscape_gene_id, drvNames)) ## drivers should never be in the geneNoInt; Why!!!??? ## Catch the problem. This is an overkill, ## so since we catch the issue, we could leave the geneNoInt. But ## that should not be there in this call. ## if(any(drvNames %in% geneNoInt$Gene)) { ## stop(paste("At least one gene in drvNames is a geneNoInt gene.", ## "That is not allowed.", ## "If that gene is a driver, pass it as gene in the epistasis", ## "component.")) ## } ## drv <- getGeneIDNum(geneModule, NULL, drvNames) } else { ## we used to have this default ## drv <- geneModule$GeneNumID[-1] drv <- vector(mode = "integer", length = 0L) } if(!keepInput) { rT <- epistasis <- orderEffects <- noIntGenes <- NULL } out <- list(long.rt = long.rt, long.epistasis = long.epistasis, long.orderEffects = long.orderEffects, long.geneNoInt = geneNoInt, geneModule = geneModule, gMOneToOne = gMOneToOne, geneToModule = geneToModule, graph = graphE, drv = drv, rT = rT, epistasis = epistasis, orderEffects = orderEffects, noIntGenes = noIntGenes, fitnessLandscape = genotFitness, fitnessLandscape_df = fitnessLandscape_df, fitnessLandscape_gene_id = fitnessLandscape_gene_id, fitnessLandscapeVariables = vector(mode = "character", length = 0L), frequencyDependentFitness = frequencyDependentFitness, frequencyType = frequencyType) #spPopSizes = vector(mode = "integer", length = 0L) if(calledBy == "allFitnessEffects") { class(out) <- c("fitnessEffects") } else if(calledBy == "allMutatorEffects") { class(out) <- c("mutatorEffects") } } else { ## Frequency-dependent fitness if(is.null(genotFitness)) { #genotFitness <- matrix(NA, nrow = 0, ncol = 1) #fitnessLandscape_df <- data.frame() #fitnessLandscape_gene_id <- data.frame() stop("You have a null genotFitness in a frequency dependent fitness situation.") } else { cnn <- 1:(ncol(genotFitness) - 1) gfn <- apply(genotFitness[, -ncol(genotFitness), drop = FALSE], 1, function(x) paste(cnn[as.logical(x)], collapse = ", ")) ## rownames(genotFitness) <- gfn fitnessLandscape_df <- data.frame(Genotype = gfn, Fitness = genotFitness[, ncol(genotFitness)], stringsAsFactors = FALSE) attr(fitnessLandscape_df,'row.names') <- as.integer(attr(fitnessLandscape_df,'row.names')) fitnessLandscape_gene_id <- data.frame( Gene = colnames(genotFitness)[-ncol(genotFitness)], GeneNumID = cnn, stringsAsFactors = FALSE) if(frequencyType == "auto"){ ch <- paste(as.character(fitnessLandscape_df[, ncol(fitnessLandscape_df)]), collapse = "") #print(ch) if( grepl("f_", ch, fixed = TRUE) ){ frequencyType = "rel" } else{ frequencyType = "abs" } } else { frequencyType = frequencyType } ## Wrong: assumes all genotypes in fitness landscape ## fitnessLandscapeVariables = fVariablesN(ncol(genotFitness) - 1, frequencyType) stopifnot(identical(genotFitness$Fitness, fitnessLandscape_df$Fitness)) flvars_and_fitvars <- create_flvars_fitvars(genotFitness, frequencyType) fitnessLandscapeVariables <- flvars_and_fitvars$flvars Fitness_as_fvars <- flvars_and_fitvars$Fitness_as_fvars } if(!is.null(drvNames)) { drv <- unique(getGeneIDNum(geneModule, geneNoInt, fitnessLandscape_gene_id, drvNames)) } else { drv <- vector(mode = "integer", length = 0L) } defaultGeneModuleDF <- data.frame(Gene = "Root", Module = "Root", GeneNumID = 0, ModuleNumID = 0, stringsAsFactors = FALSE) ## Create, for the user, a single data frame with everything. ## This is what C++ should consume ## This ought to allow to pass fitness spec as letters. Preserve original Fitness_original_as_letters <- fitnessLandscape_df$Fitness fitnessLandscape_df$Fitness <- Fitness_as_fvars full_FDF_spec <- cbind(genotFitness[, -ncol(genotFitness)] , Genotype_as_numbers = fitnessLandscape_df$Genotype , Genotype_as_letters = genotype_letterslabel(genotFitness[, -ncol(genotFitness)]) , Genotype_as_fvars = fitnessLandscapeVariables ## used in C++ , Fitness_as_fvars = Fitness_as_fvars , Fitness_as_letters = Fitness_original_as_letters ) rownames(full_FDF_spec) <- 1:nrow(full_FDF_spec) ## fitnessLanscape and fitnessLandscape_df are now redundant given ## full_FDF_spec. Remove them later. In the mean time, ensure a ## single canonical object used. rm(fitnessLandscape_df) suppressWarnings(try(rm(fitnessLandscape), silent = TRUE)) rm(fitnessLandscapeVariables) rm(Fitness_as_fvars) rm(Fitness_original_as_letters) fitnessLandscape <- full_FDF_spec[, c(fitnessLandscape_gene_id$Gene, "Fitness_as_fvars")] colnames(fitnessLandscape)[ncol(fitnessLandscape)] <- "Fitness" fitnessLandscape_df <- full_FDF_spec[, c("Genotype_as_numbers", "Fitness_as_fvars")] colnames(fitnessLandscape_df) <- c("Genotype", "Fitness") out <- list(long.rt = list(), long.epistasis = list(), long.orderEffects = list(), long.geneNoInt = data.frame(), geneModule = defaultGeneModuleDF, ##Trick to pass countGenesFe>2, gMOneToOne = TRUE, geneToModule = c(Root = "Root"), graph = list(), drv = drv, rT = NULL, epistasis = NULL, orderEffects = NULL, noIntGenes = NULL, fitnessLandscape = genotFitness, ## redundant fitnessLandscape_df = fitnessLandscape_df, ## redundant fitnessLandscape_gene_id = fitnessLandscape_gene_id, ## fitnessLandscapeVariables = NULL, ## now part of full_FDF_spec frequencyDependentFitness = frequencyDependentFitness, frequencyType = frequencyType, full_FDF_spec = full_FDF_spec #spPopSizes = spPopSizes ) class(out) <- c("fitnessEffects") } return(out) } allFitnessEffects <- function(rT = NULL, epistasis = NULL, orderEffects = NULL, noIntGenes = NULL, geneToModule = NULL, drvNames = NULL, genotFitness = NULL, keepInput = TRUE, frequencyDependentFitness = FALSE, frequencyType = NA) { #spPopSizes = NULL) { if(!frequencyDependentFitness) { if(!is.na(frequencyType)){ warning("frequencyType set to NA") } ## this is a kludge, but we must pass something not NA and not NULL ## to the C++ code frequencyType = "freq_dep_not_used" if(!is.null(genotFitness)) { if(!is.null(rT) || !is.null(epistasis) || !is.null(orderEffects) || !is.null(noIntGenes) || !is.null(geneToModule)) { stop("You have a non-null genotFitness.", " If you pass the complete genotype to fitness mapping", " you cannot pass any of rT, epistasis, orderEffects", " noIntGenes or geneToModule.") } genotFitness_std <- to_genotFitness_std(genotFitness, frequencyDependentFitness = FALSE, frequencyType = frequencyType, simplify = TRUE) ## epistasis <- from_genotype_fitness(genotFitness) } else { genotFitness_std <- NULL } allFitnessORMutatorEffects( rT = rT, epistasis = epistasis, orderEffects = orderEffects, noIntGenes = noIntGenes, geneToModule = geneToModule, drvNames = drvNames, keepInput = keepInput, genotFitness = genotFitness_std, calledBy = "allFitnessEffects", frequencyDependentFitness = FALSE, frequencyType = frequencyType) #spPopSizes = spPopSizes) } else { if(!(frequencyType %in% c('abs', 'rel', 'auto'))){ #set frequencyType = "auto" in case you did not specify 'rel' or 'abs' frequencyType = "auto" message("frequencyType set to 'auto'") } if(is.null(genotFitness)) { stop("You have a null genotFitness in a frequency dependent fitness situation.") } else { genotFitness_std <- to_genotFitness_std(genotFitness, frequencyDependentFitness = TRUE, frequencyType = frequencyType, simplify = TRUE) allFitnessORMutatorEffects( rT = rT, epistasis = epistasis, orderEffects = orderEffects, noIntGenes = noIntGenes, geneToModule = geneToModule, drvNames = drvNames, keepInput = keepInput, genotFitness = genotFitness_std, calledBy = "allFitnessEffects", frequencyDependentFitness = TRUE, frequencyType = frequencyType) #spPopSizes = spPopSizes) } } } evalGenotypeORMut <- function(genotype, fmEffects, spPopSizes = spPopSizes, verbose = FALSE, echo = FALSE, model = "", calledBy_= NULL, currentTime = currentTime) { ## genotype can be a vector of integers, that are the exact same in ## the table of fmEffects or a vector of strings, or a vector (a ## string) with genes separated by "," or ">" if( !(calledBy_ %in% c("evalGenotype", "evalGenotypeMut") )) stop("How did you call this function?. Bug.") ## fmEffects could be a mutator effect if(!exists("fitnessLandscape_gene_id", where = fmEffects)) { fmEffects$fitnessLandscape_df <- data.frame() fmEffects$fitnessLandscape_gene_id <- data.frame() } if( (model %in% c("Bozic", "bozic1", "bozic2")) && (nrow(fmEffects$fitnessLandscape_df) > 0)) { warning("Bozic model passing a fitness landscape will not work", " for now.") } ## This will avoid errors is evalRGenotype where spPopSizes = NULL if (!fmEffects$frequencyDependentFitness) { spPopSizes = 0 } else { spPopSizes <- match_spPopSizes(spPopSizes, fmEffects) } if(echo) cat(paste("Genotype: ", genotype)) if(is.character(genotype)) { if(length(grep(">", genotype))) { genotype <- nice.vector.eo(genotype, ">") } else if(length(grep(",", genotype))) { genotype <- nice.vector.eo(genotype, ",") } all.g.nums <- c(fmEffects$geneModule$GeneNumID, fmEffects$long.geneNoInt$GeneNumID, fmEffects$fitnessLandscape_gene_id$GeneNumID) all.g.names <- c(fmEffects$geneModule$Gene, fmEffects$long.geneNoInt$Gene, fmEffects$fitnessLandscape_gene_id$Gene) if((length(genotype) == 1) && (genotype %in% c("", "WT") )) { genotype <- 0 } else { genotype <- all.g.nums[match(genotype, all.g.names)] } } else { all.g.nums <- c(fmEffects$geneModule$GeneNumID, fmEffects$long.geneNoInt$GeneNumID, fmEffects$fitnessLandscape_gene_id$GeneNumID) if(!all(sapply(genotype, function(x) x %in% all.g.nums))){ stop("Genotype as vector of numbers contains genes not in fitnessEffects/mutatorEffects.") } } ## FIXME: About the WT: in fmEffects$geneModule$Gene we have "Root". We might ## want to refactor that and turn all those to WT. It would avoid the need for ## the if above to catch the WT or "" if(!fmEffects$frequencyDependentFitness){ if( any(is.na(genotype)) ){ stop("Genotype contains NAs or genes not in fitnessEffects/mutatorEffects") } if((!length(genotype))){ stop("Genotypes must have at least one mutated gene") } if(any(genotype < 0)) { stop(paste("Genotypes cannot contain negative values.", "If you see this message, you found a bug.")) } if(length(genotype) == 1 && genotype == 0){ ## We do not handle WT for now stop("Genotype cannot be 0. We do not handle WT on its own in non-freq-dep: its fitness is 1 by decree.") } if(any(genotype == 0)){ stop("Genotype cannot contain any 0.") } } else { if(length(genotype) == 1 && is.na(genotype)){ stop("Genotype contains NA or a gene not in fitnessEffects/mutatorEffects") } else if(length(genotype) == 1 && genotype == 0) { ## for WT genotype <- vector(mode = "integer", length = 0L) } else if(length(genotype) > 1){ if( any(is.na(genotype)) ){ stop("Genotype contains NAs or genes not in fitnessEffects/mutatorEffects") } if(any(genotype == 0)){ stop("Genotype cannot contain any 0 if its length > 1") } } } if(model %in% c("Bozic", "bozic1", "bozic2") ) prodNeg <- TRUE else prodNeg <- FALSE ff <- evalRGenotype(rG = genotype, rFE = fmEffects, spPop = spPopSizes, verbose = verbose, prodNeg = prodNeg, calledBy_ = calledBy_, currentTime = currentTime) if(echo) { if(calledBy_ == "evalGenotype") { if(!prodNeg) cat(" Fitness: ", ff, "\n") else cat(" Death rate: ", ff, "\n") } else if(calledBy_ == "evalGenotypeMut") { cat(" Mutation rate product :", ff, "\n") } } return(ff) } evalGenotype <- function(genotype, fitnessEffects, spPopSizes = NULL, verbose = FALSE, echo = FALSE, model = "", currentTime = 0 ) { if(inherits(fitnessEffects, "mutatorEffects")) stop("You are trying to get the fitness of a mutator specification. ", "You did not pass an object of class fitnessEffects.") if (fitnessEffects$frequencyDependentFitness) { if (is.null(spPopSizes)) stop("You have a NULL spPopSizes") if (!(length(spPopSizes) == nrow(fitnessEffects$fitnessLandscape))) stop("spPopSizes must be as long as number of genotypes") } evalGenotypeORMut(genotype = genotype, fmEffects = fitnessEffects, spPopSizes = spPopSizes, verbose = verbose, echo = echo, model = model , calledBy_= "evalGenotype", currentTime = currentTime) } evalGenotypeFitAndMut <- function(genotype, fitnessEffects, mutatorEffects, spPopSizes = NULL, verbose = FALSE, echo = FALSE, model = "", currentTime = 0 ) { ## Must deal with objects from previous, pre flfast, modifications if(!exists("fitnessLandscape_gene_id", where = fitnessEffects)) { fitnessEffects$fitnessLandscape_df <- data.frame() fitnessEffects$fitnessLandscape_gene_id <- data.frame() } if( (model %in% c("Bozic", "bozic1", "bozic2")) && (nrow(fitnessEffects$fitnessLandscape_df) > 0)) { warning("Bozic model passing a fitness landscape will not work", " for now.") } if(fitnessEffects$frequencyDependentFitness) { if (is.null(spPopSizes)) stop("You have a NULL spPopSizes") if (!(length(spPopSizes) == nrow(fitnessEffects$fitnessLandscape))) stop("spPopSizes must be as long as number of genotypes") spPopSizes <- match_spPopSizes(spPopSizes, fitnessEffects) } # This will avoid errors is evalRGenotype where spPopSizes = NULL if (!fitnessEffects$frequencyDependentFitness) { spPopSizes = 0 } prodNeg <- FALSE ## Next is from evalGenotypeAndMut if(echo) cat(paste("Genotype: ", genotype)) if(is.character(genotype)) { if(length(grep(">", genotype))) { genotype <- nice.vector.eo(genotype, ">") } else if(length(grep(",", genotype))) { genotype <- nice.vector.eo(genotype, ",") } all.g.nums <- c(fitnessEffects$geneModule$GeneNumID, fitnessEffects$long.geneNoInt$GeneNumID, fitnessEffects$fitnessLandscape_gene_id$GeneNumID) all.g.names <- c(fitnessEffects$geneModule$Gene, fitnessEffects$long.geneNoInt$Gene, fitnessEffects$fitnessLandscape_gene_id$Gene) genotype <- all.g.nums[match(genotype, all.g.names)] } else { all.g.nums <- c(fitnessEffects$geneModule$GeneNumID, fitnessEffects$long.geneNoInt$GeneNumID, fitnessEffects$fitnessLandscape_gene_id$GeneNumID) if(!all(sapply(genotype, function(x)x %in% all.g.nums))){ stop("Genotype as vector of numbers contains genes not in fitnessEffects/mutatorEffects.") } } if(!fitnessEffects$frequencyDependentFitness){ if( any(is.na(genotype)) ){ stop("Genotype contains NAs or genes not in fitnessEffects/mutatorEffects") } if((!length(genotype))){ stop("Genotypes must have at least one mutated gene") } if(any(genotype < 0)) { stop(paste("Genotypes cannot contain negative values.", "If you see this message, you found a bug.")) } if(length(genotype) == 1 && genotype == 0){ stop("Genotype cannot be 0.") } if(any(genotype == 0)){ stop("Genotype cannot contain any 0.") } }else{ if(length(genotype) == 1 && is.na(genotype)){ stop("Genotype contains NA or a gene not in fitnessEffects/mutatorEffects") }else if(length(genotype) == 1 && genotype == 0){ genotype <- vector(mode = "integer", length = 0L) }else if(length(genotype) > 1){ if( any(is.na(genotype)) ){ stop("Genotype contains NAs or genes not in fitnessEffects/mutatorEffects") } if(any(genotype == 0)){ stop("Genotype cannot contain any 0 if its length > 1") } } } full2mutator_ <- matchGeneIDs(mutatorEffects, fitnessEffects)$Reduced if(model %in% c("Bozic", "bozic1", "bozic2") ) prodNeg <- TRUE else prodNeg <- FALSE evalRGenotypeAndMut(genotype, fitnessEffects, mutatorEffects, spPopSizes, full2mutator_, verbose = verbose, prodNeg = prodNeg, currentTime = currentTime) } ## fitnesEffects from FDF, spPopSizes -> reordered spPopSizes ## Verify if named. If not, issue a warning. ## If yes, check matches genotypes and reorder match_spPopSizes <- function(sp, fe){ if(!fe$frequencyDependentFitness) return(sp) if(is.null(names(sp))) { warning("spPopSizes unnamed: cannot check genotype names.") return(sp) } nns <- names(sp) nns <- sort_genes_genots(nns) names(sp) <- nns nnf <- fe$full_FDF_spec$Genotype_as_letters if( ( length(nns) != length(nnf) ) || (!(identical(sort(nnf), sort(nns))) ) ) stop("Genotype names in spPopSizes differ w.r.t. fitness landscape") return(sp[nnf]) } ## genotype names -> genotype names with names sorted sort_genes_genots <- function(x) { return( unlist( lapply( lapply(strsplit(x, ", "), sort), function(x) paste(x, collapse = ", ")))) ## Or this: ## lapply(names(nn3), function(x) paste(sort(strsplit(x, ", ")[[1]]), ## collapse = ", ")) } evalAllGenotypesORMut <- function(fmEffects, order = FALSE, max = 256, addwt = FALSE, model = "", spPopSizes = spPopSizes, calledBy_ = "", currentTime = currentTime) { ## minimal = FALSE) { if( !(calledBy_ %in% c("evalGenotype", "evalGenotypeMut") )) stop("How did you call this function?. Bug.") if( (calledBy_ == "evalGenotype" ) && (!inherits(fmEffects, "fitnessEffects"))) stop("You are trying to get the fitness of a mutator specification. ", "You did not pass an object of class fitnessEffects.") if( (calledBy_ == "evalGenotypeMut" ) && (!inherits(fmEffects, "mutatorEffects"))) stop("You are trying to get the mutator effects of a fitness specification. ", "You did not pass an object of class mutatorEffects.") if (fmEffects$frequencyDependentFitness) { if (is.null(spPopSizes)) stop("You have a NULL spPopSizes") if (!(length(spPopSizes) == nrow(fmEffects$fitnessLandscape))) stop("spPopSizes must be as long as number of genotypes") spPopSizes <- match_spPopSizes(spPopSizes, fmEffects) } # This will avoid errors is evalRGenotype where spPopSizes = NULL if (!fmEffects$frequencyDependentFitness) { spPopSizes = 0 } if(model %in% c("Bozic", "bozic1", "bozic2") ) prodNeg <- TRUE else prodNeg <- FALSE ## For non-FDF we do not evaluate WT; we rbind it as a 1, because it ## is 1 by decree. For FDF WT is evaluated on its own with evalWT ## below, and rbinded to the rest. allg <- generateAllGenotypes(fitnessEffects = fmEffects, order = order, max = max) allf <- vapply(allg$genotNums, function(x) evalRGenotype(x, fmEffects, spPopSizes, FALSE, prodNeg, calledBy_, currentTime), 1.1) if (fmEffects$frequencyDependentFitness){ evalWT <- evalRGenotype(vector(mode = "integer", length = 0L), fmEffects, spPopSizes, FALSE, prodNeg, calledBy_, currentTime) allf <- c(evalWT, allf) genotypes <- c("WT", allg$genotNames) } else { genotypes <- allg$genotNames } df <- data.frame(Genotype = genotypes, Fitness = allf, stringsAsFactors = FALSE) ## Why am I doing this? I am not computing the genotype. I test the ## evaluation of the empty genotype in the tests. But its evaluation ## generates warnings that are not needed. And by decree (even in the ## C++) this thing has a fitness of 1, a mutator effect of 1 since ## there are no terms. if(addwt & !fmEffects$frequencyDependentFitness) df <- rbind(data.frame(Genotype = "WT", Fitness = 1, stringsAsFactors = FALSE), df) if(calledBy_ == "evalGenotype") { if(prodNeg) colnames(df)[match("Fitness", colnames(df))] <- "Death_rate" class(df) <- c("evalAllGenotypes", class(df)) } else if (calledBy_ == "evalGenotypeMut") { colnames(df)[match("Fitness", colnames(df))] <- "MutatorFactor" class(df) <- c("evalAllGenotypesMut", class(df)) } return(df) } evalAllGenotypes <- function(fitnessEffects, order = FALSE, max = 256, addwt = FALSE, model = "", spPopSizes = NULL, currentTime = 0) { ## Must deal with objects from previous, pre flfast, modifications if(!exists("fitnessLandscape_gene_id", where = fitnessEffects)) { fitnessEffects$fitnessLandscape_df <- data.frame() fitnessEffects$fitnessLandscape_gene_id <- data.frame() } if( (model %in% c("Bozic", "bozic1", "bozic2")) && (nrow(fitnessEffects$fitnessLandscape_df) > 0)) { warning("Bozic model passing a fitness landscape will not work", " for now.") } evalAllGenotypesORMut( fmEffects = fitnessEffects, order = order, max = max, addwt = addwt, model = model, spPopSizes = spPopSizes, calledBy_= "evalGenotype", currentTime = currentTime) } generateAllGenotypes <- function(fitnessEffects, order = TRUE, max = 256) { if(order) tot <- function(n) {sum(sapply(seq.int(n), function(x) choose(n, x) * factorial(x)))} else tot <- function(n) {2^n} nn <- nrow(fitnessEffects$geneModule) -1 + nrow(fitnessEffects$long.geneNoInt) + nrow(fitnessEffects$fitnessLandscape_gene_id) tnn <- tot(nn) if(tnn > max) { m <- paste("There are", tnn, "genotypes.") m <- paste(m, "This is larger than max.") m <- paste(m, "Adjust max and rerun if you want") stop(m) } ## With mutator, the ids of genes need not go from 1:n vid <- allNamedGenes(fitnessEffects)$GeneNumID if(order) { f1 <- function(n) { lapply(seq.int(n), function(x) permutations(n = n, r = x, v = vid)) } } else { f1 <- function(n) { lapply(seq.int(n), function(x) combinations(n = n, r = x, v = vid))} } genotNums <- f1(nn) list.of.vectors <- function(y) { ## there's got to be a simpler way lapply(unlist(lapply(y, function(x) {apply(x, 1, list)}), recursive = FALSE), function(m) m[[1]]) } genotNums <- list.of.vectors(genotNums) names <- c(fitnessEffects$geneModule$Gene[-1], fitnessEffects$long.geneNoInt$Gene, fitnessEffects$fitnessLandscape_gene_id$Gene) genotNames <- unlist(lapply(lapply(genotNums, function(x) names[x]), function(z) paste(z, collapse = if(order){" > "} else {", "} ))) ## This ain't the best, as we repeatedly read and convert ## fitnessEffects. If this were slow, prepare C++ function that takes ## vectors of genotypes and uses same fitnessEffects. return(list(genotNums = genotNums, genotNames = genotNames )) } evalAllGenotypesFitAndMut <- function(fitnessEffects, mutatorEffects, order = FALSE, max = 256, addwt = FALSE, model = "", spPopSizes = NULL, currentTime = 0){ ## minimal = FALSE) { if(model %in% c("Bozic", "bozic1", "bozic2") ) { prodNeg <- TRUE } else { prodNeg <- FALSE } ## Must deal with objects from previous, pre flfast, modifications if(!exists("fitnessLandscape_gene_id", where = fitnessEffects)) { fitnessEffects$fitnessLandscape_df <- data.frame() fitnessEffects$fitnessLandscape_gene_id <- data.frame() } if( (model %in% c("Bozic", "bozic1", "bozic2")) && (nrow(fitnessEffects$fitnessLandscape_df) > 0)) { warning("Bozic model passing a fitness landscape will not work", " for now.") } if(fitnessEffects$frequencyDependentFitness) { if (is.null(spPopSizes)) stop("You have a NULL spPopSizes") if (!(length(spPopSizes) == nrow(fitnessEffects$fitnessLandscape))) stop("spPopSizes must be as long as number of genotypes") spPopSizes <- match_spPopSizes(spPopSizes, fitnessEffects) } # This will avoid errors is evalRGenotype where spPopSizes = NULL if (!fitnessEffects$frequencyDependentFitness) { spPopSizes = 0 } ## if(!minimal) allg <- generateAllGenotypes(fitnessEffects = fitnessEffects, order = order, max = max) ## else ## allg <- generateAllGenotypes_minimal(fitnessEffects = fitnessEffects, ## max = max) full2mutator_ <- matchGeneIDs(mutatorEffects, fitnessEffects)$Reduced allf <- t(vapply(allg$genotNums, function(x) evalRGenotypeAndMut(x, rFE = fitnessEffects, muEF= mutatorEffects, spPop = spPopSizes, full2mutator_ = full2mutator_, verbose = FALSE, prodNeg = prodNeg, currentTime = currentTime), c(1.1, 2.2))) if(fitnessEffects$frequencyDependentFitness){ evalWT <- evalRGenotypeAndMut(vector(mode = "integer", length = 0L), rFE = fitnessEffects, muEF = mutatorEffects, spPop = spPopSizes, full2mutator_ = full2mutator_, verbose = FALSE, prodNeg = prodNeg, currentTime = currentTime) allf <- rbind(evalWT, allf) genotypes <- c("WT", allg$genotNames) }else{ genotypes <- allg$genotNames } df <- data.frame(Genotype = genotypes, Fitness = allf[, 1], MutatorFactor = allf[, 2], stringsAsFactors = FALSE) if(fitnessEffects$frequencyDependentFitness == FALSE && addwt) df <- rbind(data.frame(Genotype = "WT", Fitness = 1, MutatorFactor = 1, stringsAsFactors = FALSE), df) if(prodNeg) colnames(df)[match("Fitness", colnames(df))] <- "Death_rate" class(df) <- c("evalAllGenotypesFitAndMut", class(df)) return(df) } fitnessEffectsToIgraph <- function(rT, epistasis, orderEffects) { df0 <- df1 <- df2 <- data.frame() if(!is.null(rT)) { df0 <- rT[, c("parent", "child", "typeDep")] } if(!is.null(epistasis)) { df1 <- to.pairs.modules(epistasis, ":") } if(!is.null(orderEffects)) { df2 <- to.pairs.modules(orderEffects, ">") } df <- rbind(df0, df1, df2) ## for special case of simple epi effects if(nrow(df) == 0) return(NULL) g1 <- graph.data.frame(df) E(g1)$color <- "black" E(g1)$color[E(g1)$typeDep == "SM"] <- "blue" E(g1)$color[E(g1)$typeDep == "OR"] <- "blue" E(g1)$color[E(g1)$typeDep == "semimonotone"] <- "blue" E(g1)$color[E(g1)$typeDep == "XMPN"] <- "red" E(g1)$color[E(g1)$typeDep == "XOR"] <- "red" E(g1)$color[E(g1)$typeDep == "xmpn"] <- "red" E(g1)$color[E(g1)$typeDep == "epistasis"] <- "orange" E(g1)$color[E(g1)$typeDep == "orderEffect"] <- "orange" E(g1)$lty <- 1 E(g1)$lty[E(g1)$typeDep == "epistasis"] <- 2 E(g1)$lty[E(g1)$typeDep == "orderEffect"] <- 3 E(g1)$arrow.mode <- 2 E(g1)$arrow.mode[E(g1)$typeDep == "epistasis"] <- 0 E(g1)$arrow.mode[E(g1)$typeDep == "orderEffect"] <- 2 return(g1) } ## FIXME: in the help: say we cannot properly show 3- or higher order ## interactions. plot.fitnessEffects <- function(x, type = "graphNEL", layout = NULL, expandModules = FALSE, autofit = FALSE, scale_char = ifelse(type == "graphNEL", 1/10, 5), return_g = FALSE, lwdf = 1, ## multiplier for lwd for graphNEL ...) { ## some other layouts I find OK ## layout.circle ## layout.reingold.tilford if really a tree ## o.w. it will use the default g <- x$graph if(is.null(g)) stop("This fitnessEffects object can not be ploted this way.", " It is probably one with fitness landscape specification, ", " so you might want to plot the fitness landscape instead.") if(type == "igraph") { if(expandModules && (!x$gMOneToOne)) { ## vlabels <- fe$geneToModule[vertex.attributes(g)$name] vlabels <- x$geneToModule[V(g)$name] V(g)$label <- vlabels } if(autofit) { plot(0, type = "n", axes = FALSE, ann = FALSE) ## ideas from http://stackoverflow.com/questions/14472079/match-vertex-size-to-label-size-in-igraph ## vsize <- (strwidth(V(g)$label) + strwidth("oo")) * 200 ## but this is a kludge. vsize <- (nchar(V(g)$label) + 1) * scale_char plot.igraph(g, vertex.size = vsize, vertex.shape = "rectangle", layout = layout) } else { plot.igraph(g, layout = layout) } if(return_g) return(g) } else if (type == "graphNEL") { g1 <- igraph.to.graphNEL(g) c1 <- unlist(lapply(edgeData(g1), function(x) x$color)) names(c1) <- sub("|", "~", names(c1), fixed = TRUE) s1 <- unlist(lapply(edgeData(g1), function(x) x$lty)) names(s1) <- names(c1) a1 <- unlist(lapply(edgeData(g1), function(x) max(x$arrow.mode - 1, 0))) names(a1) <- names(c1) lwd <- s1 lwd[lwd == 2] <- 2 ## o.w. too thin lwd[lwd == 3] <- 2 ## o.w. too thin lwd <- lwd * lwdf nAttrs <- list() if(expandModules && (!x$gMOneToOne)) { nnodes <- x$geneToModule[nodes(g1)] names(nnodes) <- nodes(g1) nAttrs$label <- nnodes } if(autofit) { nAttrs$width <- (nchar(nAttrs$label) + 1) * scale_char names(nAttrs$width) <- names(nAttrs$label) plot(g1, edgeAttrs = list(arrowsize = a1, lty = s1, lwd = lwd, color = c1), attrs=list(node=list(shape = "rectangle")), nodeAttrs = nAttrs) } else { plot(g1, edgeAttrs = list(arrowsize = a1, lty = s1, lwd = lwd, color = c1), nodeAttrs = nAttrs) } if(return_g) return(g1) } else { stop("plot type not recognized") } } nr_oncoSimul.internal <- function(rFE, birth, death, mu, initSize, sampleEvery, detectionSize, finalTime, initSize_species, initSize_iter, seed, verbosity, speciesFS, ratioForce, typeFitness, max.memory, mutationPropGrowth, initMutant, max.wall.time, keepEvery, K, detectionDrivers, onlyCancer, errorHitWallTime, max.num.tries, errorHitMaxTries, minDetectDrvCloneSz, extraTime, keepPhylog, detectionProb, AND_DrvProbExit, MMUEF = NULL, ## avoid partial matching, and set default fixation = NULL ## avoid partial matching ) { default_min_successive_fixation <- 50 ## yes, set at this for now if(!inherits(rFE, "fitnessEffects")) stop(paste("rFE must be an object of class fitnessEffects", "as created, for instance, with function", "allFitnessEffects")) if(countGenesFe(rFE) < 2) { stop("There must be at least two genes (loci) in the fitness effects. ", "If you only care about a degenerate case with just one,", "you can enter a second gene (locus)", "with fitness effect of zero.") } if( (length(mu) == 1) && !(is.null(names(mu)))) { stop("A length 1 mutation, but named. ", "This is ambiguous. ", "If you want per-gene mutation rates, each gene", "must have its entry in the mu vector. ", "(And regardless of per-gene mutation rates ", " there must be at least two gene/loci).") } ## Must deal with objects from previous, pre flfast, modifications ## Could move this way down to the bottom, right before ## .Call if(!exists("fitnessLandscape_gene_id", where = rFE)) { rFE$fitnessLandscape_df <- data.frame() rFE$fitnessLandscape_gene_id <- data.frame() } namedGenes <- allNamedGenes(rFE) if( length(mu) > 1) { if(is.null(names(mu))) stop("When using per-gene mutation rates the ", "mu vector must be named ", "(and if you have noIntGenes, those must have names).") if(length(mu) != countGenesFe(rFE)) stop("When using per-gene mutation rates, ", "there must be the same number of genes in the ", "mu vector and the fitness effects object.") if(!identical(sort(namedGenes$Gene), sort(names(mu)))) stop("When using per-gene mutation rates, ", "names of genes must match those in the ", "restriction table.") mu <- mu[order(match(names(mu), namedGenes$Gene))] ## Hyperparanoid check. Should never, ever, happen. if(!identical(names(mu), namedGenes$Gene)) stop("Names of re-ordered mu do not match names of genes") minmu <- 1e-40 if(any(mu <= minmu)) stop(paste("At least one per-gene mutation rate is negative", "or less than", minmu,". Remember that the per-base", "mutation rate in the human genome is about 1e-11 to 1e-9.")) } if(!is.null(initMutant)) { if(!all(unlist(lapply(initMutant, is.character)))) stop("initMutant must be a (list of) character string(s)") ## FIXME: remove in the future initMutantString <- initMutant if(length(grep(">", initMutant))) { initMutant <- lapply(initMutant, function(x) nice.vector.eo(x, ">")) } else if(length(grep(",", initMutant))) { initMutant <- lapply(initMutant, function(x) nice.vector.eo(x, ",")) } initMutant <- lapply(initMutant, function(x) { if( (length(x) == 1) && ((x == "") || (x == "WT"))) return(vector(mode = "integer", length = 0)) else return(getGeneIDNum(rFE$geneModule, rFE$long.geneNoInt, rFE$fitnessLandscape_gene_id, x, FALSE)) } ) if(max(unlist(lapply(initMutant, length))) > countGenesFe(rFE)) { ## if(length(initMutant) > countGenesFe(rFE)) { stop("For initMutant you passed more genes mutated ", "than the number of genes in the genotype (fitness effects).") } } else { initMutant <- vector(mode = "integer") initMutantString <- "" } if(any(initSize <= 0)) stop("At least one initSize <= 0") if(length(initMutant)) { if(length(initSize) != length(initMutant)) stop("Lengths of initSize and initMutant differ") } ## these are never user options ## if(initSize_species < 10) { ## warning("initSize_species too small?") ## } ## if(initSize_iter < 100) { ## warning("initSize_iter too small?") ## } if(typeFitness %in% c("bozic1", "bozic2")) { if(nrow(rFE$fitnessLandscape_df) > 0) warning("Bozic model passing a fitness landscape most likely will not work", " for now.") ## FIXME: bozic and fitness landscape ## the issue is that in the C++ code we directly do ## s = birth rate - 1 ## but we would need something different ## Can be done going through epistasis, etc. thesh <- unlist(lapply(rFE$long.rt, function(x) x$sh)) ## thes <- unlist(lapply(rFE$long.rt, function(x) x$s)) thes <- unlist(c(lapply(rFE$long.rt, function(x) x$s), lapply(rFE$long.epistasis, function(x) x$s), lapply(rFE$long.orderEffects, function(x) x$s), rFE$long.geneNoInt$s)) if(any(thes > 1 )) { m <- paste("You are using a Bozic model with", "the new restriction specification, and you have", "at least one s > 1.", "But that is the same as setting that to 1:", "we obviously cannot allow negative death rates,", "nor problems derived from multiplying odd or even", "numbers of negative numbers.", "You can set those > 1 to 1, but then you will", "eventually get the simulations to abort when the", "death rate becomes 0.") stop(m) } if(any( (thesh <= -1) & (thesh > -Inf))) { m <- paste("You are using a Bozic model with", "the new restriction specification, and you have", "at least one sh <= -1. Maybe you mean -Inf?") warning(m) } if(any(thes == 1 )) { m <- paste("You are using a Bozic model with", "the new restriction specification, and you have", "at least one s of 1. If that gene is mutated, ", "this will lead to a death rate of 0", "and the simulations will abort when you get a non finite", "value.") warning(m) } } if(!is.null(MMUEF)) { if(!inherits(MMUEF, "mutatorEffects")) stop("muEF must be a mutatorEffects object") full2mutator_ <- matchGeneIDs(MMUEF, rFE) if(any(is.na(full2mutator_$Full))) stop("Genes in mutatorEffects not present in fitnessEffects") if(any(is.na(full2mutator_))) stop("full2mutator failed for unknown reasons") full2mutator_ <- full2mutator_$Reduced } else { full2mutator_ <- vector(mode = "numeric", length = 0) ## muEF <- emptyFitnessEffects() } dpr <- detectionProbCheckParse(detectionProb, sum(initSize), verbosity) ## if( !is.null(cPDetect) && (sum(!is.null(p2), !is.null(n2)) >= 1 )) ## stop("Specify only cPDetect xor both of p2 and n2") ## if( (is.null(p2) + is.null(n2)) == 1 ) ## stop("If you pass one of n2 or p2, you must also pass the other. ", ## "Otherwise, we would not know what to do.") ## stopifnot(PDBaseline >= 0) ## stopifnot(n2 > PDBaseline) ## stopifnot(p2 < 1) ## stopifnot(p2 > 0) ## if( is.null(cPDetect) ) cPDetect <- -9 ## if( is.null(p2)) p2 <- 9 ## if( is.null(n2)) n2 <- -9 ## call <- match.call() ## Process the fixed list, if any if(!is_null_na(fixation)) { ng <- namedGenes ## rownames(ng) <- namedGenes[, "Gene"] ## Proposed extension to have exact matching of genotypes ng <- rbind( data.frame(Gene = "_", GeneNumID = -9, stringsAsFactors = FALSE), ng) rownames(ng) <- ng[, "Gene"] ## FIXME ## Later, accept a last argument, called tolerance. ## If not present, set to 0 ## and then, at at the head of fixation_list below if(is.list(fixation)) { if( (is.null(fixation[["fixation_tolerance"]])) || (is.na(fixation[["fixation_tolerance"]]))) { fixation_tolerance <- 0 } else { fixation_tolerance <- as.numeric(fixation[["fixation_tolerance"]]) fixation <- fixation[-which(names(fixation) == "fixation_tolerance")] } if( (is.null(fixation[["min_successive_fixation"]])) || (is.na(fixation[["min_successive_fixation"]]))) { min_successive_fixation <- default_min_successive_fixation } else { min_successive_fixation <- as.integer(fixation[["min_successive_fixation"]]) fixation <- fixation[-which(names(fixation) == "min_successive_fixation")] } if( (is.null(fixation[["fixation_min_size"]])) || (is.na(fixation[["fixation_min_size"]]))) { fixation_min_size <- 0 } else { fixation_min_size <- as.integer(fixation[["fixation_min_size"]]) fixation <- fixation[-which(names(fixation) == "fixation_min_size")] } } else { if(is_null_na(fixation["fixation_tolerance"])) { fixation_tolerance <- 0 } else { fixation_tolerance <- as.numeric(fixation["fixation_tolerance"]) fixation <- fixation[-which(names(fixation) == "fixation_tolerance")] } if(is_null_na(fixation["min_successive_fixation"])) { min_successive_fixation <- default_min_successive_fixation } else { min_successive_fixation <- as.integer(fixation["min_successive_fixation"]) fixation <- fixation[-which(names(fixation) == "min_successive_fixation")] } if(is_null_na(fixation["fixation_min_size"])) { fixation_min_size <- 0 } else { fixation_min_size <- as.integer(fixation["fixation_min_size"]) fixation <- fixation[-which(names(fixation) == "fixation_min_size")] } } if( (fixation_tolerance > 1) || (fixation_tolerance < 0) ) stop("Impossible range for fixation tolerance") if( (min_successive_fixation < 0) ) stop("Impossible range for min_successive_fixation") if( (fixation_min_size < 0) ) stop("Impossible range for fixation_min_size") ## Usual genotype specification and might allow ordered vectors ## in the future fixation_b <- lapply(fixation, nice.vector.eo, sep = ",") ulf <- unlist(fixation_b) if(any(ulf == ">")) stop("Order effects not allowed (yet?) in fixation.") ulfg <- ng[ulf, 1] if(any(is.na(ulfg))) stop(paste("The 'fixation' list contains genes that are not present", " in the fitness effects.")) ## Sorting here is crucial!! fixation_list <- lapply(fixation_b, function(x) sort(ng[x, 2])) fixation_list <- list(fixation_list = fixation_list, fixation_tolerance = fixation_tolerance, min_successive_fixation = min_successive_fixation, fixation_min_size = fixation_min_size) } else { fixation_list <- list() } return(c( nr_BNB_Algo5(rFE = rFE, mu_ = mu, death = death, initSize_ = initSize, sampleEvery = sampleEvery, detectionSize = detectionSize, finalTime = finalTime, initSp = initSize_species, initIt = initSize_iter, seed = seed, verbosity = verbosity, speciesFS = speciesFS, ratioForce = ratioForce, typeFitness_ = typeFitness, maxram = max.memory, mutationPropGrowth = as.integer(mutationPropGrowth), initMutant_ = initMutant, maxWallTime = max.wall.time, keepEvery = keepEvery, K = K, detectionDrivers = detectionDrivers, onlyCancer = onlyCancer, errorHitWallTime = errorHitWallTime, maxNumTries = max.num.tries, errorHitMaxTries = errorHitMaxTries, minDetectDrvCloneSz = minDetectDrvCloneSz, extraTime = extraTime, keepPhylog = keepPhylog, MMUEF = MMUEF, full2mutator_ = full2mutator_, n2 = dpr["n2"], p2 = dpr["p2"], PDBaseline = dpr["PDBaseline"], cPDetect_i= dpr["cPDetect"], checkSizePEvery = dpr["checkSizePEvery"], AND_DrvProbExit = AND_DrvProbExit, fixation_list = fixation_list), Drivers = list(rFE$drv), ## but when doing pops, these will be repeated geneNames = list(names(getNamesID(rFE))), InitMutant = initMutantString )) } countGenesFe <- function(fe) { ## recall geneModule has Root always ## We want to be able to use objects that did not have ## the fitness landscape component if(exists("fitnessLandscape_gene_id", where = fe)) return(nrow(fe$geneModule) + nrow(fe$long.geneNoInt) + nrow(fe$fitnessLandscape_gene_id) - 1) else return(nrow(fe$geneModule) + nrow(fe$long.geneNoInt) - 1) } allNamedGenes <- function(fe){ ## Returns a data frame with genes and their names and verifies all ## genes have names. ## Root should always be first, but just in case ## avoid assuming it ## This does is not a good idea as it assumes the user did not use ## "keepInput = FALSE". ## lni <- length(fe$noIntGenes) ## ## FIXME:test ## if((lni > 0) && ## (is.null(names(fe$noIntGenes)))) ## stop("When using per-gene mutation rates the ", ## "no interaction genes must be named ", ## "(i.e., the noIntGenes vector must have names).") ## accommodate objects w/o fitnessLandscape if(!is.null(fe$fitnessLandscape) && nrow(fe$fitnessLandscape)) { gn <- gtools::mixedsort( colnames(fe$fitnessLandscape)[-ncol(fe$fitnessLandscape)]) v1 <- data.frame(Gene = gn, GeneNumID = seq.int(length(gn)), stringsAsFactors = FALSE) } else { v1 <- fe$geneModule[, c("Gene", "GeneNumID")] if(nrow(fe$long.geneNoInt)) { v1 <- rbind(v1, fe$long.geneNoInt[, c("Gene", "GeneNumID")]) } if(any(v1[, "Gene"] == "Root")) v1 <- v1[-which(v1[, "Gene"] == "Root"), ] } rownames(v1) <- NULL if(any(v1$Gene == "WT")) { warning("A gene is named WT. You can expect problems ", "because we use WT to denote the wildtype ", "genotype. You might want to change it.") } return(v1) } get.gene.counts <- function(x) { # , timeSample = "last", # typeSample = "whole") { ## From get.mut.vector. Used for now just for testing timeSample <- "last" the.time <- get.the.time.for.sample(x, timeSample, NULL) if(the.time < 0) { counts <- rep(0, length(x$geneNames)) names(counts) <- x$geneNames freq <- rep(NA, length(x$geneNames)) names(freq) <- x$geneNames return(list(counts = counts, freq = freq, popSize = 0)) } pop <- x$pops.by.time[the.time, -1] if(all(pop == 0)) { stop("You found a bug: this should never happen") } ## if(typeSample %in% c("wholeTumor", "whole")) { popSize <- x$PerSampleStats[the.time, 1] counts <- as.vector(tcrossprod(pop, x$Genotypes)) names(counts) <- x$geneNames return(list(counts = counts, freq = counts/popSize, popSize = popSize)) ## return( (tcrossprod(pop, ## x$Genotypes)/popSize) ) ## } else if (typeSample %in% c("singleCell", "single")) { ## return(x$Genotypes[, sample(seq_along(pop), 1, prob = pop)]) ## } else { ## stop("Unknown typeSample option") ## } } geneCounts <- function(x) { if(inherits(x, "oncosimulpop")) { return( do.call(rbind, lapply(x, function(z) get.gene.counts(z)$counts)) ) } else { return( get.gene.counts(x)$counts) } } matchGeneIDs <- function(x, refFE) { n1 <- allNamedGenes(x) n2 <- allNamedGenes(refFE) colnames(n1)[2] <- "Reduced" colnames(n2)[2] <- "Full" ## Non standard evaluation thing and a note being generated in check. ## See also http://stackoverflow.com/questions/33299845/when-writing-functions-for-an-r-package-should-i-avoid-non-standard-evaluation ## But does not work well with replace. So use the NULL trick Reduced <- NULL dplyr::full_join(n2, n1, by = "Gene") %>% mutate(Reduced = replace(Reduced, is.na(Reduced), -9)) } detectionProbCheckParse <- function(x, initSize, verbosity) { default_p2 <- 0.1 default_n2 <- 2 * initSize default_PDBaseline <- 1.2 * initSize default_checkSizePEvery <- 20 ## No default cPDetect. That is done from p2 and n2 in C++. if(is_null_na(x)) { ## if((length(x) == 1) && (is.na(x))) { y <- vector() y["cPDetect"] <- -9 y["p2"] <- 9 y["n2"] <- -9 y["PDBaseline"] <- -9 y["checkSizePEvery"] <- Inf return(y) } else if((length(x) == 1) && (x == "default")) { ## Populate with defaults y <- vector() y["p2"] <- default_p2 y["n2"] <- default_n2 y["PDBaseline"] <- default_PDBaseline y["checkSizePEvery"] <- default_checkSizePEvery x <- y } if(length(x) >= 1) { if( !all(names(x) %in% c("cPDetect", "p2", "n2", "PDBaseline", "checkSizePEvery"))) stop("Names of some components of detectionProb are not recognized.", " Check for possible typos.") } ## This ain't conditional. If not given, always check if( !is_null_na(x["cPDetect"]) && (sum(!is_null_na(x["p2"]), !is_null_na(x["n2"])) >= 1 )) stop("Specify only cPDetect xor both of p2 and n2") if( (is_null_na(x["p2"]) + is_null_na(x["n2"])) == 1 ) stop("If you pass one of n2 or p2, you must also pass the other. ", "Otherwise, we would not know what to do.") if(is_null_na(x["PDBaseline"])) { x["PDBaseline"] <- default_PDBaseline if(verbosity > -1) message("\n PDBaseline set to default value of ", default_PDBaseline, "\n") } if(is_null_na(x["checkSizePEvery"])) { x["checkSizePEvery"] <- default_checkSizePEvery if(verbosity > -1) message("\n checkSizePEvery set to default value of ", default_checkSizePEvery, "\n") } ## If we get here, we checked consistency of whether cPDetect or p2/n2. ## Fill up with defaults the missing values if(is_null_na(x["cPDetect"])) { if(is_null_na(x["p2"])) { if(!is_null_na(x["n2"])) stop("Eh? no p2 but n2? Bug") x["n2"] <- default_n2 x["p2"] <- default_p2 if(verbosity > -1) message("\n n2, p2 set to default values of ", default_n2, ", ", default_p2, "\n") } } if(x["checkSizePEvery"] <= 0) stop("checkSizePEvery <= 0") if(x["PDBaseline"] <= 0) stop("PDBaseline <= 0") if(!is_null_na(x["n2"])) { if(x["n2"] <= x["PDBaseline"]) stop("n2 <= PDBaseline") if(x["p2"] >= 1) stop("p2 >= 1") if(x["p2"] <= 0) stop("p2 <= 0") x["cPDetect"] <- -9 } else { ## only if x["cPDetect"] is not NA if(is_null_na(x["cPDetect"])) stop("eh? you found a bug")## paranoia x["n2"] <- -9 x["p2"] <- -9 if(verbosity > -1) message("\n Using user-specified cPDetect as n2, p2 not given.\n") } return(x) } sampledGenotypes <- function(y, genes = NULL) { ## From a popSample object, or a matrix for that matter, ## show the sampled genotypes and their frequencies if(!is.null(genes)) { cols <- which(colnames(y) %in% genes ) y <- y[, cols] } ## nn <- colnames(y) genotlabel <- function(u, nn = colnames(y)) { if(any(is.na(u))) return(NA) else { return(paste(sort(nn[as.logical(u)]), collapse = ", ")) } } ## df <- data.frame(table( ## apply(y, 1, function(z) paste(sort(nn[as.logical(z)]), collapse = ", ") ) ## )) df <- data.frame(table( apply(y, 1, genotlabel), useNA = "ifany"), stringsAsFactors = TRUE) gn <- as.character(df[, 1]) gn[gn == ""] <- "WT" df <- data.frame(Genotype = gn, Freq = df[, 2], stringsAsFactors = FALSE) attributes(df)$ShannonI <- shannonI(na.omit(df)$Freq) class(df) <- c("sampledGenotypes", class(df)) return(df) } print.sampledGenotypes <- function(x, ...) { print.data.frame(x, ...) cat("\n Shannon's diversity (entropy) of sampled genotypes: ") cat(attributes(x)$ShannonI, "\n") } list_g_matches_fixed <- function(x, y) { ## Internal function, for testing the fixation output. ## x and y are vectors ## x is the set of output genotypes, y the set of fixed ## genotypes/subset of genotypes. ## Yes, this function has tests too in test.fixation.R ## All genotypes in x satisfy that they are supersets of at least one ## in y? That is true if, for every element in x, at least one y in ## that x. if(is.list(y)) y <- unlist(y) y.nice <- lapply(y, nice.vector.eo, sep = ",") x.nice <- lapply(x, nice.vector.eo, sep = ",") fu <- function(u, y.nice) any(unlist(lapply(y.nice, function(z) all(z %in% u)))) return(all(unlist(lapply(x.nice, fu, y.nice)))) } ## From genotlable above. But this is very useful ## fitnessLandscape (only genes) as columns with 0, 1 -> genotypes names as letters genotype_letterslabel <- function(df) { genotlabel <- function(u, nn = colnames(df)) { if(any(is.na(u))) return(NA) else { return(paste(sort(nn[as.logical(u)]), collapse = ", ")) } } tmp <- apply(df, 1, genotlabel) tmp[tmp == ""] <- "WT" return(tmp) } ###################################################################### ###################################################################### ###################################################################### ###################################################################### ## character vector, named replacement -> replaced vector ## named_replace: names are the (fixed) pattern, value the replacement ## stringr::str_replace_all probably better ## my_mgsub <- function(x, named_replace) { ## nn <- names(named_replace) ## xr <- x ## for(i in 1:length(named_replace)) { ## xr <- gsub(nn[i], named_replace[i], xr, fixed = TRUE) ## } ## return(xr) ## }