git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/OncoSimulR@119231 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,8 +1,8 @@ |
1 | 1 |
Package: OncoSimulR |
2 | 2 |
Type: Package |
3 | 3 |
Title: Forward Genetic Simulation of Cancer Progresion with Epistasis |
4 |
-Version: 2.3.8 |
|
5 |
-Date: 2016-07-08 |
|
4 |
+Version: 2.3.9 |
|
5 |
+Date: 2016-07-09 |
|
6 | 6 |
Authors@R: c(person("Ramon", "Diaz-Uriarte", role = c("aut", "cre"), |
7 | 7 |
email = "rdiaz02@gmail.com"), |
8 | 8 |
person("Mark", "Taylor", role = "ctb", email = "ningkiling@gmail.com")) |
... | ... |
@@ -17,12 +17,13 @@ Description: Functions for forward population genetic simulation in |
17 | 17 |
rates can differ between genes, and we can include mutator/antimutator |
18 | 18 |
genes (to model mutator phenotypes). Simulations |
19 | 19 |
use continuous-time models and can include driver and passenger genes |
20 |
- and modules. Also included are functions for simulating random DAGs |
|
20 |
+ and modules. Also included are functions for: simulating random DAGs |
|
21 | 21 |
of the type found in Oncogenetic Tress, Conjunctive Bayesian Networks, |
22 |
- and other tumor progression models, and for plotting and sampling from |
|
22 |
+ and other tumor progression models; plotting and sampling from |
|
23 | 23 |
single or multiple realizations of the simulations, including |
24 |
- single-cell sampling, as well as functions for plotting the parent-child |
|
25 |
- relationships of the clones. |
|
24 |
+ single-cell sampling; plotting the parent-child relationships of the |
|
25 |
+ clones; generating random fitness landscapes (Rough Mount Fuji, House |
|
26 |
+ of Cards, and additive models) and plotting them. |
|
26 | 27 |
biocViews: BiologicalQuestion, SomaticMutation |
27 | 28 |
License: GPL (>= 3) |
28 | 29 |
URL: https://github.com/rdiaz02/OncoSimul, https://popmodels.cancercontrol.cancer.gov/gsr/packages/oncosimulr/ |
... | ... |
@@ -22,6 +22,8 @@ plotFitnessLandscape <- function(x, show_labels = TRUE, |
22 | 22 |
col = c("green4", "red", "yellow"), |
23 | 23 |
lty = c(1, 2, 3), use_ggrepel = FALSE, |
24 | 24 |
log = FALSE, max_num_genotypes = 2000, |
25 |
+ only_accessible = FALSE, |
|
26 |
+ accessible_th = 0, |
|
25 | 27 |
...) { |
26 | 28 |
|
27 | 29 |
## FIXME future: |
... | ... |
@@ -37,7 +39,6 @@ plotFitnessLandscape <- function(x, show_labels = TRUE, |
37 | 39 |
## adjacency matrix of genotype i go at row i and column i. Follow back |
38 | 40 |
## all entries in row i, and their previous, and forward all column i. |
39 | 41 |
|
40 |
- |
|
41 | 42 |
## gfm: genotype fitness matrix |
42 | 43 |
## afe: all fitness effects |
43 | 44 |
|
... | ... |
@@ -52,11 +53,17 @@ plotFitnessLandscape <- function(x, show_labels = TRUE, |
52 | 53 |
## get the string representation, etc. And this is for use |
53 | 54 |
## with OncoSimul. |
54 | 55 |
|
56 |
+ |
|
55 | 57 |
tfm <- to_Fitness_Matrix(x, max_num_genotypes = max_num_genotypes) |
56 | 58 |
|
57 |
- |
|
58 | 59 |
mutated <- rowSums(tfm$gfm[, -ncol(tfm$gfm)]) |
59 | 60 |
gaj <- genot_to_adj_mat(tfm$gfm) |
61 |
+ if(only_accessible) { |
|
62 |
+ gaj <- filter_inaccessible(gaj, accessible_th) |
|
63 |
+ remaining <- as.numeric(colnames(gaj)) |
|
64 |
+ mutated <- mutated[remaining] |
|
65 |
+ tfm$afe <- tfm$afe[remaining, , drop = FALSE] |
|
66 |
+ } |
|
60 | 67 |
vv <- which(!is.na(gaj), arr.ind = TRUE) |
61 | 68 |
|
62 | 69 |
## plot(x = mutated, y = e1$Fitness, ylab = "Fitness", |
... | ... |
@@ -171,6 +178,44 @@ plot.evalAllGenotypes <- plot.evalAllGenotypesMut <- |
171 | 178 |
###################################################################### |
172 | 179 |
|
173 | 180 |
|
181 |
+## wrap_filter_inaccessible <- function(x, max_num_genotypes, accessible_th) { |
|
182 |
+## ## wrap it, for my consumption |
|
183 |
+## tfm <- to_Fitness_Matrix(x, max_num_genotypes = max_num_genotypes) |
|
184 |
+## mutated <- rowSums(tfm$gfm[, -ncol(tfm$gfm)]) |
|
185 |
+## gaj <- genot_to_adj_mat(tfm$gfm) |
|
186 |
+## gaj <- filter_inaccessible(gaj, accessible_th) |
|
187 |
+## remaining <- as.numeric(colnames(gaj)) |
|
188 |
+## mutated <- mutated[remaining] |
|
189 |
+## tfm$afe <- tfm$afe[remaining, , drop = FALSE] |
|
190 |
+## return(list(remaining = remaining, |
|
191 |
+## mutated = mutated, |
|
192 |
+## tfm = tfm)) |
|
193 |
+## } |
|
194 |
+ |
|
195 |
+count_accessible_g <- function(gfm, accessible_th) { |
|
196 |
+ gaj <- genot_to_adj_mat(gfm) |
|
197 |
+ gaj <- filter_inaccessible(gaj, accessible_th) |
|
198 |
+ return(ncol(gaj) - 1) |
|
199 |
+} |
|
200 |
+ |
|
201 |
+filter_inaccessible <- function(x, accessible_th) { |
|
202 |
+ ## Return an adjacency matrix with only accessible paths. x is the gaj |
|
203 |
+ ## matrix created in the plots. The difference between genotypes |
|
204 |
+ ## separated by a hamming distance of 1 |
|
205 |
+ colnames(x) <- rownames(x) <- 1:ncol(x) |
|
206 |
+ while(TRUE) { |
|
207 |
+ ## remove first column |
|
208 |
+ ## We use fact that all(na.omit(x) < u) is true if all are NA |
|
209 |
+ ## so inaccessible rows are removed and thus destination columns |
|
210 |
+ wrm <- which(apply(x[, -1, drop = FALSE], 2, |
|
211 |
+ function(y) {all(na.omit(y) < accessible_th)})) + 1 |
|
212 |
+ if(length(wrm) == 0) break; |
|
213 |
+ x <- x[-wrm, -wrm, drop = FALSE] |
|
214 |
+ } |
|
215 |
+ x[x < 0] <- NA |
|
216 |
+ return(x) |
|
217 |
+} |
|
218 |
+ |
|
174 | 219 |
|
175 | 220 |
generate_matrix_genotypes <- function(g) { |
176 | 221 |
## FIXME future: do this for order too? Only if rfitness for order. |
... | ... |
@@ -6,10 +6,11 @@ rfitness <- function(g, c= 0.5, |
6 | 6 |
## reference. If "max" this is rep(1, g) |
7 | 7 |
scale = NULL, ## a two-element vector: min and max |
8 | 8 |
wt_is_1 = TRUE, ## wt has fitness 1 |
9 |
- log = FALSE ## log only makes sense if all values > |
|
9 |
+ log = FALSE, ## log only makes sense if all values > |
|
10 | 10 |
## 0. scale with min > 0, and/or set |
11 | 11 |
## wt_is_1 = TRUE |
12 |
- ) { |
|
12 |
+ min_accessible_genotypes = 0, |
|
13 |
+ accessible_th = 0) { |
|
13 | 14 |
## Like Franke et al., 2011 and others of Krug. Very similar to Greene |
14 | 15 |
## and Crona, 2014. And this allows moving from HoC to purely additive |
15 | 16 |
## changing c and sd. |
... | ... |
@@ -17,42 +18,57 @@ rfitness <- function(g, c= 0.5, |
17 | 18 |
## FIXME future: do this with order too? |
18 | 19 |
## - do not generate a matrix of genotypes but a matrix of ordered genot. |
19 | 20 |
m <- generate_matrix_genotypes(g) |
20 |
- f_r <- rnorm(nrow(m), mean = 0, sd = sd) |
|
21 |
- if(inherits(reference, "character") && length(reference) == 1) { |
|
22 |
- if(reference == "random") { |
|
23 |
- reference <- m[sample(nrow(m), 1), ] |
|
24 |
- } else if(reference == "max") { |
|
25 |
- reference <- rep(1, g) |
|
21 |
+ done <- FALSE |
|
22 |
+ while(!done) { |
|
23 |
+ f_r <- rnorm(nrow(m), mean = 0, sd = sd) |
|
24 |
+ if(inherits(reference, "character") && length(reference) == 1) { |
|
25 |
+ if(reference == "random") { |
|
26 |
+ referenceI <- m[sample(nrow(m), 1), ] |
|
27 |
+ } else if(reference == "max") { |
|
28 |
+ referenceI <- rep(1, g) |
|
29 |
+ } |
|
30 |
+ } else { |
|
31 |
+ referenceI <- reference |
|
32 |
+ } |
|
33 |
+ d_reference <- apply(m, 1, function(x) sum(abs(x - referenceI))) |
|
34 |
+ f_det <- -c * d_reference |
|
35 |
+ ## f_det <- rowSums(m) * slope/nrow(m) ## this is Greene and Krona |
|
36 |
+ fi <- f_r + f_det |
|
37 |
+ |
|
38 |
+ if(!is.null(scale)) { |
|
39 |
+ fi <- (fi - min(fi))/(max(fi) - min(fi)) |
|
40 |
+ fi <- scale[1] + fi * (scale[2] - scale[1]) |
|
26 | 41 |
} |
27 |
- } |
|
28 |
- d_reference <- apply(m, 1, function(x) sum(abs(x - reference))) |
|
29 |
- f_det <- -c * d_reference |
|
30 |
- ## f_det <- rowSums(m) * slope/nrow(m) ## this is Greene and Krona |
|
31 |
- fi <- f_r + f_det |
|
32 |
- |
|
33 |
- if(!is.null(scale)) { |
|
34 |
- fi <- (fi - min(fi))/(max(fi) - min(fi)) |
|
35 |
- fi <- scale[1] + fi * (scale[2] - scale[1]) |
|
36 |
- } |
|
37 |
- if(wt_is_1) { |
|
38 |
- ## We need to shift to avoid ratios involving negative numbers and |
|
39 |
- ## we need to avoid having any fitness at 0, specially the wt. If |
|
40 |
- ## any negative value, add the min, and shift by the magnitude of |
|
41 |
- ## the min to avoid any at 0. |
|
42 |
+ if(wt_is_1) { |
|
43 |
+ ## We need to shift to avoid ratios involving negative numbers and |
|
44 |
+ ## we need to avoid having any fitness at 0, specially the wt. If |
|
45 |
+ ## any negative value, add the min, and shift by the magnitude of |
|
46 |
+ ## the min to avoid any at 0. |
|
42 | 47 |
|
43 |
- ## If you use scale and wt_is_1, this will move the scale. It is |
|
44 |
- ## not possible to obtain a linear transformation that will keep |
|
45 |
- ## the min and max of the scale, and wt at 1. |
|
46 |
- min_fi <- min(fi) |
|
47 |
- if(min_fi < 0) |
|
48 |
- fi <- fi + 2 * abs(min(fi)) |
|
49 |
- fi <- fi/fi[1] |
|
50 |
- } |
|
51 |
- |
|
52 |
- if(log) { |
|
53 |
- fi <- log(fi/fi[1]) + 1 |
|
48 |
+ ## If you use scale and wt_is_1, this will move the scale. It is |
|
49 |
+ ## not possible to obtain a linear transformation that will keep |
|
50 |
+ ## the min and max of the scale, and wt at 1. |
|
51 |
+ min_fi <- min(fi) |
|
52 |
+ if(min_fi < 0) |
|
53 |
+ fi <- fi + 2 * abs(min(fi)) |
|
54 |
+ fi <- fi/fi[1] |
|
55 |
+ } |
|
56 |
+ |
|
57 |
+ if(log) { |
|
58 |
+ fi <- log(fi/fi[1]) + 1 |
|
59 |
+ } |
|
60 |
+ m <- cbind(m, Fitness = fi) |
|
61 |
+ if(min_accessible_genotypes > 0) { |
|
62 |
+ if(count_accessible_g(m, accessible_th) >= min_accessible_genotypes) { |
|
63 |
+ done <- TRUE |
|
64 |
+ } else { |
|
65 |
+ ## Cannot start again with a fitness column |
|
66 |
+ m <- m[, -ncol(m), drop = FALSE] |
|
67 |
+ } |
|
68 |
+ } else { |
|
69 |
+ done <- TRUE |
|
70 |
+ } |
|
54 | 71 |
} |
55 |
- m <- cbind(m, Fitness = fi) |
|
56 | 72 |
class(m) <- c(class(m), "genotype_fitness_matrix") |
57 | 73 |
return(m) |
58 | 74 |
} |
... | ... |
@@ -16,28 +16,40 @@ |
16 | 16 |
|
17 | 17 |
\usage{ |
18 | 18 |
plotFitnessLandscape(x, show_labels = TRUE, |
19 |
- col = c("green4", "red", "yellow"), |
|
20 |
- lty = c(1, 2, 3), |
|
21 |
- use_ggrepel = FALSE, |
|
22 |
- log = FALSE, max_num_genotypes = 2000, ...) |
|
19 |
+ col = c("green4", "red", "yellow"), |
|
20 |
+ lty = c(1, 2, 3), |
|
21 |
+ use_ggrepel = FALSE, |
|
22 |
+ log = FALSE, max_num_genotypes = 2000, |
|
23 |
+ only_accessible = FALSE, |
|
24 |
+ accessible_th = 0, |
|
25 |
+ ...) |
|
23 | 26 |
|
24 | 27 |
\method{plot}{genotype_fitness_matrix}(x, show_labels = TRUE, |
25 | 28 |
col = c("green4", "red", "yellow"), |
26 | 29 |
lty = c(1, 2, 3), |
27 | 30 |
use_ggrepel = FALSE, |
28 |
- log = FALSE, max_num_genotypes = 2000, ...) |
|
31 |
+ log = FALSE, max_num_genotypes = 2000, |
|
32 |
+ only_accessible = FALSE, |
|
33 |
+ accessible_th = 0, |
|
34 |
+ ...) |
|
29 | 35 |
|
30 | 36 |
\method{plot}{evalAllGenotypes}(x, show_labels = TRUE, |
31 | 37 |
col = c("green4", "red", "yellow"), |
32 | 38 |
lty = c(1, 2, 3), |
33 | 39 |
use_ggrepel = FALSE, |
34 |
- log = FALSE, max_num_genotypes = 2000, ...) |
|
40 |
+ log = FALSE, max_num_genotypes = 2000, |
|
41 |
+ only_accessible = FALSE, |
|
42 |
+ accessible_th = 0, |
|
43 |
+ ...) |
|
35 | 44 |
|
36 | 45 |
\method{plot}{evalAllGenotypesMut}(x, show_labels = TRUE, |
37 | 46 |
col = c("green4", "red", "yellow"), |
38 | 47 |
lty = c(1, 2, 3), |
39 | 48 |
use_ggrepel = FALSE, |
40 |
- log = FALSE, max_num_genotypes = 2000, ...) |
|
49 |
+ log = FALSE, max_num_genotypes = 2000, |
|
50 |
+ only_accessible = FALSE, |
|
51 |
+ accessible_th = 0, |
|
52 |
+ ...) |
|
41 | 53 |
} |
42 | 54 |
|
43 | 55 |
\arguments{ |
... | ... |
@@ -89,6 +101,18 @@ plotFitnessLandscape(x, show_labels = TRUE, |
89 | 101 |
types of input, we make a call to \code{\link{evalAllGenotypes}}, and |
90 | 102 |
use this as the maximum.} |
91 | 103 |
|
104 |
+\item{only_accessible}{If TRUE, show only accessible paths. A path is |
|
105 |
+ considered accesible if, at each mutational step (i.e., with the |
|
106 |
+ addition of each mutation) fitness increases by at least |
|
107 |
+ \code{accessible_th}. If you set \code{only_accessible = TRUE}, the |
|
108 |
+ number of genotypes displayed can be much smaller than the number of |
|
109 |
+ existing genotypes if many of those genotypes are not accessible via |
|
110 |
+ any path.} |
|
111 |
+ |
|
112 |
+\item{accessible_th}{The threshold for the minimal change in fitness at |
|
113 |
+ each mutation step (i.e., between successive genotypes) to be used if |
|
114 |
+ \code{only_accessible = TRUE}.} |
|
115 |
+ |
|
92 | 116 |
\item{\dots}{ |
93 | 117 |
Other arguments passed to \code{plot}. Not used for now. |
94 | 118 |
} |
... | ... |
@@ -4,16 +4,15 @@ |
4 | 4 |
|
5 | 5 |
\title{Generate random fitness.} |
6 | 6 |
|
7 |
-\description{ |
|
8 |
- Generate random fitness under a House of Cards, Rough Mount Fuji, or |
|
9 |
- additive model. |
|
10 |
-} |
|
7 |
+\description{ Generate random fitness landscapes under a House of Cards, |
|
8 |
+ Rough Mount Fuji, or additive model. } |
|
11 | 9 |
|
12 | 10 |
|
13 | 11 |
\usage{ |
14 | 12 |
|
15 | 13 |
rfitness(g, c = 0.5, sd = 1, reference = "random", scale = NULL, |
16 |
- wt_is_1 = TRUE, log = FALSE) |
|
14 |
+ wt_is_1 = TRUE, log = FALSE, min_accessible_genotypes = 0, |
|
15 |
+ accessible_th = 0) |
|
17 | 16 |
} |
18 | 17 |
|
19 | 18 |
|
... | ... |
@@ -48,7 +47,34 @@ rfitness(g, c = 0.5, sd = 1, reference = "random", scale = NULL, |
48 | 47 |
|
49 | 48 |
|
50 | 49 |
\item{log}{If TRUE, log-transform fitness.} |
50 |
+ |
|
51 |
+\item{min_accessible_genotypes}{If larger than 0, the minimum number of |
|
52 |
+ accessible genotypes in the fitness landscape. A genotype is |
|
53 |
+ considered accessible if you can reach if from the wildtype by going |
|
54 |
+ through at least one path where all changes in fitness are larger or |
|
55 |
+ equal to \code{accessible_th}. The changes in fitness are considered |
|
56 |
+ at each mutational step, i.e., at each addition of one mutation we |
|
57 |
+ compute the difference between the genotype with \code{k + 1} |
|
58 |
+ mutations minus the ancestor genotype with \code{k} mutations. Thus, a |
|
59 |
+ genotype is considered accessible if there is at least one path where |
|
60 |
+ fitness increases at each mutational step by at least |
|
61 |
+ \code{accessible_th}. |
|
62 |
+ |
|
63 |
+ If the condition is not satisfied, we continue generating random |
|
64 |
+ fitness landscapes with the specified parameters until the condition |
|
65 |
+ is satisfied. |
|
51 | 66 |
} |
67 |
+ |
|
68 |
+\item{accessible_th}{The threshold for the minimal change in fitness at |
|
69 |
+ each mutation step (i.e., between successive genotypes) that allows a |
|
70 |
+ genotype to be regarded as accessible. This only applies if |
|
71 |
+ \code{min_accessible_genotypes} is larger than 0. So if you want to |
|
72 |
+ allow small decreases in fitness in successive steps, use a small |
|
73 |
+ negative value for \code{accessible_th}. } |
|
74 |
+ |
|
75 |
+} |
|
76 |
+ |
|
77 |
+ |
|
52 | 78 |
\details{ |
53 | 79 |
|
54 | 80 |
The model used here follows the Rough Mount Fuji model in Szendro et |
55 | 81 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,33 @@ |
1 |
+test_that("exercise accessible genotypes code", { |
|
2 |
+ ## Because of compiler differences, this might only test the relevant |
|
3 |
+ ## code in Linux. But conditions will be true in all systems. |
|
4 |
+ |
|
5 |
+ RNGkind("Mersenne-Twister") |
|
6 |
+ |
|
7 |
+ set.seed(3) |
|
8 |
+ r1 <- rfitness(4, reference = rep(0, 4), c = 2.1, |
|
9 |
+ min_accessible_genotypes = 1) |
|
10 |
+ expect_true(max(r1[-1, "Fitness"]) >= 1.0) |
|
11 |
+ |
|
12 |
+ set.seed(3) |
|
13 |
+ r1 <- rfitness(4, reference = rep(0, 4), c = 2.1, |
|
14 |
+ min_accessible_genotypes = 1, |
|
15 |
+ accessible_th = 0.06) |
|
16 |
+ expect_true(max(r1[-1, "Fitness"]) >= 1.06) |
|
17 |
+ |
|
18 |
+ set.seed(3) |
|
19 |
+ r2 <- rfitness(4, reference = rep(0, 4), c = 2.1, |
|
20 |
+ min_accessible_genotypes = 2, |
|
21 |
+ accessible_th = 0.06) |
|
22 |
+ expect_true(max(r2[-1, "Fitness"]) >= 1.06) |
|
23 |
+ |
|
24 |
+ ## Exercising plotting code |
|
25 |
+ plot(r2) |
|
26 |
+ plot(r2, only_accessible = TRUE) |
|
27 |
+ plot(r2, only_accessible = TRUE, accessible_th = 0.06) |
|
28 |
+ ## this only show WT |
|
29 |
+ plot(r2, only_accessible = TRUE, accessible_th = 0.2) |
|
30 |
+ |
|
31 |
+}) |
|
32 |
+ |
|
33 |
+set.seed(NULL) |
... | ... |
@@ -4317,10 +4317,25 @@ samplePop(pancrPop) |
4317 | 4317 |
|
4318 | 4318 |
Now a simple multiple call to \texttt{oncoSimulIndiv} wrapped inside |
4319 | 4319 |
\Rfunction{mclapply}; this is basically the same we just did above. We set |
4320 |
-the class of the object to allow direct usage of \Rfunction{samplePop}. |
|
4320 |
+the class of the object to allow direct usage of |
|
4321 |
+\Rfunction{samplePop}. (Note: in Windows \texttt{mc.cores > 1} is not |
|
4322 |
+supported, so for the vignette to run in Windows, Linux, and Mac we |
|
4323 |
+explicitly set it here in the call to \Rfunction{mclapply}. For regular |
|
4324 |
+usage, you will not need to do this; just use whatever is appropriate for |
|
4325 |
+your operating system and number of cores. As well, we do not need any of |
|
4326 |
+this with \Rfunction{oncoSimulPop} because the code inside |
|
4327 |
+\Rfunction{oncoSimulPop} already takes care of setting \texttt{mc.cores} |
|
4328 |
+to 1 in Windows). |
|
4321 | 4329 |
|
4322 | 4330 |
<<>>= |
4323 | 4331 |
library(parallel) |
4332 |
+ |
|
4333 |
+if(.Platform$OS.type == "windows") { |
|
4334 |
+ mc.cores <- 1 |
|
4335 |
+} else { |
|
4336 |
+ mc.cores <- 2 |
|
4337 |
+} |
|
4338 |
+ |
|
4324 | 4339 |
p2 <- mclapply(1:4, function(x) oncoSimulIndiv(pancr, |
4325 | 4340 |
detectionSize = 1e7, |
4326 | 4341 |
keepEvery = 10), |