\name{oncoSimulIndiv} \alias{oncoSimulIndiv} \alias{oncoSimulPop} \alias{print.oncosimul} \alias{print.oncosimulpop} \alias{summary.oncosimul} \alias{summary.oncosimulpop} \alias{oncoSimulSample} \title{ Simulate tumor progression for one or more individuals, optionally returning just a sample in time. } \description{ Simulate tumor progression including possible restrictions in the order of driver mutations. Optionally add passenger mutations. Simulation is done using the BNB algorithm of Mather et al., 2012. } \usage{ oncoSimulIndiv(fp, model = "Exp", numPassengers = 30, mu = 1e-6, detectionSize = 1e8, detectionDrivers = 4, sampleEvery = ifelse(model \%in\% c("Bozic", "Exp"), 1, 0.025), initSize = 500, s = 0.1, sh = -1, K = initSize/(exp(1) - 1), keepEvery = sampleEvery, minDetectDrvCloneSz = "auto", extraTime = 0, finalTime = 0.25 * 25 * 365, onlyCancer = TRUE, keepPhylog = FALSE, max.memory = 2000, max.wall.time = 200, max.num.tries = 500, errorHitWallTime = TRUE, errorHitMaxTries = TRUE, verbosity = 0, initMutant = NULL, seed = NULL) oncoSimulPop(Nindiv, fp, model = "Exp", numPassengers = 30, mu = 1e-6, detectionSize = 1e8, detectionDrivers = 4, sampleEvery = ifelse(model \%in\% c("Bozic", "Exp"), 1, 0.025), initSize = 500, s = 0.1, sh = -1, K = initSize/(exp(1) - 1), keepEvery = sampleEvery, minDetectDrvCloneSz = "auto", extraTime = 0, finalTime = 0.25 * 25 * 365, onlyCancer = TRUE, keepPhylog = FALSE, max.memory = 2000, max.wall.time = 200, max.num.tries = 500, errorHitWallTime = TRUE, errorHitMaxTries = TRUE, initMutant = NULL, verbosity = 0, mc.cores = detectCores(), seed = "auto") oncoSimulSample(Nindiv, fp, model = "Exp", numPassengers = 0, mu = 1e-6, detectionSize = round(runif(Nindiv, 1e5, 1e8)), detectionDrivers = { if(inherits(fp, "fitnessEffects")) { if(length(fp$drv)) { nd <- (2: round(0.75 * length(fp$drv))) } else { nd <- 0 } } else { nd <- (2 : round(0.75 * max(fp))) } if (length(nd) == 1) nd <- c(nd, nd) sample(nd, Nindiv, replace = TRUE) }, sampleEvery = ifelse(model \%in\% c("Bozic", "Exp"), 1, 0.025), initSize = 500, s = 0.1, sh = -1, K = initSize/(exp(1) - 1), minDetectDrvCloneSz = "auto", extraTime = 0, finalTime = 0.25 * 25 * 365, onlyCancer = TRUE, keepPhylog = FALSE, max.memory = 2000, max.wall.time.total = 600, max.num.tries.total = 500 * Nindiv, typeSample = "whole", thresholdWhole = 0.5, initMutant = NULL, verbosity = 1, seed = "auto") } \arguments{ \item{Nindiv}{Number of individuals or number of different trajectories to simulate.} \item{fp}{ Either a poset that specifies the order restrictions (see \code{\link{poset}} if you want to use the specification as in v.1. Otherwise, a fitnessEffects object (see \code{\link{allFitnessEffects}}). Other arguments below (s, sh) make sense only if you use a poset, as they are included in the fitnessEffects object. } \item{model}{ One of "Bozic", "Exp", "McFarlandLog" (the last one can be abbreviated to "McFL"). } \item{numPassengers}{ The number of passenger genes.The total number of genes (drivers plus passengers) must be smaller than 64. All driver genes should be included in the poset (even if they depend on no one and no one depends on them), and will be numbered from 1 to the total number of driver genes. Thus, passenger genes will be numbered from (number of driver genes + 1):(number of drivers + number of passengers). } \item{mu}{ Mutation rate. } \item{detectionSize}{ What is the minimal number of cells for cancer to be detected. For \code{oncoSimulSample} this can be a vector. } \item{detectionDrivers}{ The minimal number of drivers present in any clone for cancer to be detected. For \code{oncoSimulSample} this can be a vector. The default in this case is a vector of drivers from a uniform between 2 and 0.75 the total number of drivers } \item{sampleEvery}{ How often the whole population is sampled. This is not the same as the interval between successive samples that keep stored (for that, see \code{keepEvery}). For very fast growing clones, you might need to have a small value here to minimize possible numerical problems (such as huge increase in population size between two successive samples that can then lead to problems for random number generators). Likewise, for models with density dependence (such as McF) this value should be very small. } \item{initSize}{ Initial population size. } \item{s}{ Selection coefficient for drivers. Only relevant if using a poset as this is included in the fitnessEffects object. } \item{sh}{ Selection coefficient for drivers with restrictions not satisfied. A value of 0 means there are no penalties for a driver appearing in a clone when its restrictions are not satisfied. To specify "sh=Inf" (in Diaz-Uriarte, 2014) use sh = -1. Only relevant if using a poset as this is included in the fitnessEffects object. } \item{K}{ Initial population equilibrium size in the McFarland models. } \item{keepEvery}{ Time interval between successive whole population samples that are actually stored. This must be larger or equal to sampleEvery. If keepEvery is not a multiple integer of sampleEvery, the keepEvery in use will be the smallest multiple integer of keepEvery larger than the specified keepEvery. If you want nice plots, set \code{sampleEvery} and \code{keepEvery} to small values (say, 1 or 0.5). Otherwise, you can use a \code{sampleEvery} of 1 but a \code{keepEvery} of 15, so that the return objects are not huge. } %% \item{endTimeEvery}{ %% If endTimeEvery is > 0, even if conditions for finishing a simulation %% (number of drivers or population size) are met at time \emph{t}, the %% simulation will run at least until \emph{t + endTimeEvery} and %% conditions will be checked again. Only if conditions for finishing a %% simulation are still met, will the simulation end. %% The reason for this parameter is to ensure that, say, a clone with a %% certain number of drivers that would cause the simulation to end has %% not just appeared but then gone extinct shortly after. Beware, though, %% that in simulations with very fast growth, setting large endTimeEvery %% can result in the simulations taking a long time to finish or hitting %% the wall time limit.} \item{minDetectDrvCloneSz}{ A value of 0 or larger than 0 (by default equal to \code{initSize} in the McFarland model). If larger than 0, when checking if we are done with a simulation, we verify that the sum of the population sizes of all clones that have a number of mutated drivers larger or equal to \code{detectionDrivers} is larger or equal to this \code{minDetectDrvCloneSz}. The reason for this parameter is to ensure that, say, a clone with a certain number of drivers that would cause the simulation to end has not just appeared and is present in only one individual that might then immediately go extinct. This can be relevant in secenarios such as the McFarland model. See also \code{extraTime}. } \item{extraTime}{ A value larger than zero waits those many additional time periods before exiting after having reached the exit condition (population size, number of drivers). The reason for this setting is to prevent the McFL models from always exiting at a time when one clone is increasing its size quickly (see \code{minDetectDrvCloneSz}). By setting an \code{extraTime} larger than 0, we can sample at points when we are at the plateau. } \item{finalTime}{ What is the maximum number of time units that the simulation can run. } \item{onlyCancer}{ Return only simulations that reach cancer? If set to TRUE, only simulations that satisfy the \code{detectionDrivers} or the \code{detectionSize} requirements will be returned: the simulation will be repeated, within the limits set by \code{max.num.tries} and \code{max.wall.time} (and, for \code{oncoSimulSample} also \code{max.num.tries.total} and \code{max.wall.time.total}), until one which meets the \code{detectionDrivers} or \code{detectionSize} is obtained. Otherwise, the simulation is returned regardless of final population size or number of drivers in any clone and this includes simulations where the population goes extinct. } \item{keepPhylog}{ If TRUE, keep track of when and from which clone each clone is created. See also \code{\link{plotClonePhylog}}. } %% \item{seed}{The seed to use for the C++ random number generator. If not %% passed explicitly (the default) then chosen from R.} \item{initMutant}{For v.2, a string with the mutations of the initial mutant, if any. This is the same format as for \code{\link{evalGenotype}}. For v.1, the single mutation of the initial clone for the simulations. The default (if you pass nothing) is to start the simulation from the wildtype genotype with nothing mutated.} \item{max.num.tries}{Only applies when \code{onlyCancer = TRUE}. What is the maximum number of times, for an individual simulation, we can repeat the simulation for it to reach cancer? There are certain parameter settings where reaching cancer is extremely unlikely and you might not want to run forever in those cases. } \item{max.num.tries.total}{Only applies when \code{onlyCancer = TRUE} and for \code{oncoSimulSample}. What is the maximum number of times, over all simulations for all individuals in a population sample, that we can repeat the simulations so that cancer is reached for all individuals? The idea is to set a limit on the average minimal probability of reaching cancer for a set of simulations to be accepted.} \item{max.wall.time}{ Maximum wall time for each individual simulation run. If the simulation is not done in this time, it is aborted. } \item{max.wall.time.total}{ Maximum wall time for all the simulations (when using \code{oncoSimulSample}), in seconds. If the simulation is not completed in this time, it is aborted. To prevent problems from a single individual simulation going wild, this limit is also enforced per simulation (so the run can be aborted directly from C++). } \item{errorHitMaxTries}{ If TRUE (the default) a simulation that reaches the maximum number of repetitions allowed is considered not to have succesfully finished and, thus, an error, and no output from it will be reported. This is often what you want.%% But if you want to see %% what is happening for some set of parameters that seem to never %% finish, set this to FALSE, and check what the output looks like. See \code{Details}. } \item{errorHitWallTime}{ If TRUE (the default) a simulation that reaches the maximum wall time is considered not to have succesfully finished and, thus, an error, and no output from it will be reported. This is often what you want. %% But if you want to see what is happening for some set of %% parameters that seem to never finish, set this to FALSE, and check %% what the output looks like. See \code{Details}. } \item{max.memory}{ The largest size (in MB) of the matrix of Populations by Time. If it creating it would use more than this amount of memory, it is not created. This prevents you from accidentally passing parameters that will return an enormous object. } \item{verbosity}{ If 0, run as silently as possible. Otherwise, increasing values of verbosity provide progressively more information about intermediate steps, possible numerical notes/warnings from the C++ code, etc. } \item{typeSample}{ "singleCell" (or "single") for single cell sampling, where the probability of sampling a cell (a clone) is directly proportional to its population size. "wholeTumor" (or "whole") for whole tumor sampling (i.e., this is similar to a biopsy being the entire tumor). See \code{\link{samplePop}}. } \item{thresholdWhole}{ In whole tumor sampling, whether a gene is detected as mutated depends on thresholdWhole: a gene is considered mutated if it is altered in at least thresholdWhole proportion of the cells in that individual. See \code{\link{samplePop}}. } \item{mc.cores}{Number of cores to use when simulating more than one individual (i.e., when calling oncoSimulPop).} \item{seed}{The seed for the C++ PRNG. You can pass a value. If you set it to NULL, then a seed will be generated in R and passed to C++. If you set it to "auto", then if you are using v.1, the behavior is the same as if you set it to NULL (a seed will be generated in R and passed to C++) but if you are using v.2, a random seed will be produced in C++. %% using the randutils code from M.E.O'Neill. If you need reproducibility, either pass a value or set it to NULL (setting it to NULL will make the C++ seed reproducible if you use the same seed in R via \code{set.seed}). However, even using the same value of \code{seed} is unlikely to give the exact same results between platforms and compilers. Moreover, note that the defaults for \code{seed} are not the same in \code{oncoSimulIndiv}, \code{oncoSimulPop} and \code{oncoSimulSample}. } } \details{ The basic simulation algorithm implemented is the BNB one of Mather et al., 2012, where I have added modifications to fitness based on the restrictions in the order of mutations. Full details about the algorithm are provided in Mather et al., 2012. The evolutionary models, including references, and the rest of the parameters are explained in Diaz-Uriarte, 2014, especially in the Supplementary Material. The model called "Bozic" is based on Bozic et al., 2010, and the model called "McFarland" in McFarland et al., 2013. oncoSimulPop simply calls oncoSimulIndiv multiple times. When run on POSIX systems, it can use multiple cores (via mclapply). The \code{summary} methods for these classes return some of the return values (see next) as a one-row (for class oncosimul) or multiple row (for class oncosimulpop) data frame. The \code{print} methods for these classes simply print the summary. Changing options \code{errorHitMaxTries} and \code{errorHitWallTime} can be useful when conducting many simulations, as in the call to \code{oncoSimulPop}: setting them to TRUE means nothing is recorded for those simulations where ending conditions are not reached but setting them to FALSE would allow you to record the output; this would potentially result in a mixture where some simulations would not have reached the ending condition, but this might sometimes be what you want. Note, however, that \code{oncoSimulSample} always has both them to TRUE, as it could not be otherwise. \code{GenotyesWDistinctOrderEff} provides the information about order effects that is missing from \code{Genotyoes}. When there are order effects, the \code{Genotypes} matrix can contain genotypes that are not distinguishable. Suppose there are two genes, the first and the second. In the \code{Genotype} output you can get two columns where there is a 1 in both genes: those two columns correspond to the two possible orders (first gene mutated first, or first gene mutated after the second). \code{GenotypesWDistinctOrderEff} disambiguates this. The same is done by \code{GenotypeLabels}; this is easier to decode for a human (a string of gene labels) but a little bit harder to parse automatically. } \value{ For \code{oncoSimulIndiv} a list, of class "oncosimul", with the following components: \item{pops.by.time }{A matrix of the population sizes of the clones, with clones in columns and time in row. Not all clones are shown here, only those that were present in at least on of the keepEvery samples.} \item{NumClones }{Total number of clones in the above matrix. This is not the total number of distinct clones that have appeared over all simulations (which is likely to be larger or much larger).} \item{TotalPopSize}{Total population size at the end.} \item{Genotypes}{A matrix of genotypes. For each of the clones in the pops.by.time matrix, its genotype, with a 0 if the gene is not mutated and a 1 if it is mutated.} \item{MaxNumDrivers}{The largest number of mutated driver genes ever seen in the simulation in any clone.} \item{MaxDriversLast}{The largest number of mutated drivers in any clone at the end of the simulation.} \item{NumDriversLargestPop}{The number of mutated driver genes in the clone with largest population size. } \item{LargestClone}{Population size of the clone with largest number of population size.} \item{PropLargestPopLast}{Ratio of LargestClone/TotalPopSize} \item{FinalTime}{The time (in time units) at the end of the simulation.} \item{NumIter}{The number of iterations of the BNB algorithm.} \item{HittedWallTime}{TRUE if we reached the limit of max.wall.time. FALSE otherwise.} \item{TotalPresentDrivers}{The total number of mutated driver genes, whether or not in the same clone. The number of elements in \code{OccurringDrivers}, below.} \item{CountByDriver}{A vector of length number of drivers, with the count of the number of clones that have that driver mutated.} \item{OccurringDrivers}{The actual number of drivers mutated.} \item{PerSampleStats}{A 5 column matrix with a row for each sampling period. The columns are: total population size, population size of the largest clone, the ratio of the two, the largest number of drivers in any clone, and the number of drivers in the clone with the largest population size.} \item{other}{A list that contains statistics for an estimate of the simulation error when using the McFarland model as well as other statistics. For the McFarland model, the relevant value is errorMF, which is -99 unless in the McFarland model. For the McFarland model it is the largest difference of successive death rates. The entries names minDMratio and minBMratio are the smallest ratio, over all simulations, of death rate to mutation rate or birth rate to mutation rate. The BNB algorithm thrives when those are large.} For \code{oncoSimulPop} a list of length \code{Nindiv}, and of class \code{"oncosimulpop"}, where each element of the list is itself a list, of class \code{oncosimul}, with components as described above. In v.2, the output is of both class "oncosimul" and "oncosimul2". The oncoSimulIndiv return object differs in \item{GenotypesWDistinctOrderEff}{ A list of vectors, where each vector corresponds to a genotype in the \code{Genotypes}, showing (where it matters) the order of mutations. Each vector shows the genotypes, with the numeric codes, showing explicitly the order when it matters. So if you have genes 1, 2, 7 for which order relationships are given, and genes 3, 4, 5, 6 for which other interactions exist, any mutations in 1, 2, 7 are shown first, and in the order they occurred, before showing the rest of the mutations. See details.} \item{GenotypesLabels}{The genotypes, as character vectors with the original labels provided (i.e., not the integer codes). As before, mutated genes, for those where order matters, come first, and are separated by the rest by a "_". See details.} \item{OccurringDrivers}{This is the same as in v.1, but we use the labels, not the numeric id codes. Of course, if you entered integers as labels for the genes, you will see numbers (however, as a character string).} %% Fixme: mention probCancer is not a binomial, but a negative binomial. } \references{ Bozic, I., et al., (2010). Accumulation of driver and passenger mutations during tumor progression. \emph{ Proceedings of the National Academy of Sciences of the United States of America\/}, \bold{107}, 18545--18550. Diaz-Uriarte, R. (2015). Identifying restrictions in the order of accumulation of mutations during tumor progression: effects of passengers, evolutionary models, and sampling \url{http://www.biomedcentral.com/1471-2105/16/41/abstract} Gerstung et al., 2011. The Temporal Order of Genetic and Pathway Alterations in Tumorigenesis. \emph{PLoS ONE}, 6. McFarland, C.~D. et al. (2013). Impact of deleterious passenger mutations on cancer progression. \emph{Proceedings of the National Academy of Sciences of the United States of America\/}, \bold{110}(8), 2910--5. Mather, W.~H., Hasty, J., and Tsimring, L.~S. (2012). Fast stochastic algorithm for simulating evolutionary population dynamics. \emph{Bioinformatics (Oxford, England)\/}, \bold{28}(9), 1230--1238. } \note{Please, note that the meaning of the fitness effects in the McFarland model is not the same as in the original paper; the fitness coefficients are transformed to allow for a simpler fitness function as a product of terms. This differs with respect to v.1. See the vignette for details.} \author{Ramon Diaz-Uriarte} \seealso{ \code{\link{plot.oncosimul}}, \code{\link{examplePosets}}, \code{\link{samplePop}}, \code{\link{allFitnessEffects}} } \examples{ ################################# ##### ##### Examples using v.1 ##### ################################# ## use poset p701 data(examplePosets) p701 <- examplePosets[["p701"]] ## Bozic Model b1 <- oncoSimulIndiv(p701) summary(b1) plot(b1, addtot = TRUE) ## McFarland; use a small sampleEvery, but also a reasonable ## keepEvery. ## We also modify mutation rate to values similar to those in the ## original paper. ## Note that detectionSize will play no role ## finalTime is large, since this is a slower process ## initSize is set to 4000 so the default K is larger and we are likely ## to reach cancer. Alternatively, set K = 2000. m1 <- oncoSimulIndiv(p701, model = "McFL", mu = 5e-7, initSize = 4000, sampleEvery = 0.025, finalTime = 15000, keepEvery = 10, onlyCancer = FALSE) plot(m1, addtot = TRUE, log = "") ## Simulating 4 individual trajectories ## (I set mc.cores = 2 to comply with --as-cran checks, but you ## should either use a reasonable number for your hardware or ## leave it at its default value). p1 <- oncoSimulPop(4, p701, keepEvery = 10, mc.cores = 2) summary(p1) samplePop(p1) p2 <- oncoSimulSample(4, p701) ######################################### ###### ###### Examples using v.2: ###### ######################################### #### A model similar to the one in McFarland. We use 2070 genes. set.seed(456) nd <- 70 np <- 2000 s <- 0.1 sp <- 1e-3 spp <- -sp/(1 + sp) mcf1 <- allFitnessEffects(noIntGenes = c(rep(s, nd), rep(spp, np)), drv = seq.int(nd)) mcf1s <- oncoSimulIndiv(mcf1, model = "McFL", mu = 1e-7, detectionSize = 1e8, detectionDrivers = 100, sampleEvery = 0.02, keepEvery = 2, initSize = 2000, finalTime = 1000, onlyCancer = FALSE) plot(mcf1s, addtot = TRUE, lwdClone = 0.6, log = "") summary(mcf1s) plot(mcf1s) #### Order effects with modules, and 5 genes without interactions #### with fitness effects from an exponential distribution oi <- allFitnessEffects(orderEffects = c("F > D" = -0.3, "D > F" = 0.4), noIntGenes = rexp(5, 10), geneToModule = c("Root" = "Root", "F" = "f1, f2, f3", "D" = "d1, d2") ) oiI1 <- oncoSimulIndiv(oi, model = "Exp") oiI1$GenotypesLabels oiI1 ## note the order and separation by "_" oiP1 <- oncoSimulPop(2, oi, keepEvery = 10, mc.cores = 2) summary(oiP1) ## Even if order exists, this cannot reflect it; ## G1 to G10 are d1, d2, f1..,f3, and the 5 genes without ## interaction samplePop(oiP1) oiS1 <- oncoSimulSample(2, oi) ## The output contains only the summary of the runs AND ## the sample: oiS1 ## And their sizes do differ object.size(oiS1) object.size(oiP1) ######## Using a poset for pancreatic cancer from Gerstung et al. ### (s and sh are made up for the example; only the structure ### and names come from Gerstung et al.) pancr <- allFitnessEffects(data.frame(parent = c("Root", rep("KRAS", 4), "SMAD4", "CDNK2A", "TP53", "TP53", "MLL3"), child = c("KRAS","SMAD4", "CDNK2A", "TP53", "MLL3", rep("PXDN", 3), rep("TGFBR2", 2)), s = 0.05, sh = -0.3, typeDep = "MN")) plot(pancr) ### Use an exponential growth model pancr1 <- oncoSimulIndiv(pancr, model = "Exp") pancr1 summary(pancr1) plot(pancr1) pancr1$GenotypesLabels ## Pop and Sample pancrPop <- oncoSimulPop(4, pancr, keepEvery = 10, mc.cores = 2) summary(pancrPop) pancrSPop <- samplePop(pancrPop) pancrSPop pancrSamp <- oncoSimulSample(2, pancr) pancrSamp } \keyword{misc} \keyword{iteration}