Browse code

v.2.3.9. accessible genotypes

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/OncoSimulR@119231 bc3139a8-67e5-0310-9ffc-ced21a209358

Ramon Diaz-Uriarte authored on 09/07/2016 13:43:10
Showing 8 changed files

... ...
@@ -1,8 +1,8 @@
1 1
 Package: OncoSimulR
2 2
 Type: Package
3 3
 Title: Forward Genetic Simulation of Cancer Progresion with Epistasis 
4
-Version: 2.3.8
5
-Date: 2016-07-08
4
+Version: 2.3.9
5
+Date: 2016-07-09
6 6
 Authors@R: c(person("Ramon", "Diaz-Uriarte", role = c("aut", "cre"),
7 7
 		     email = "rdiaz02@gmail.com"),
8 8
 	      person("Mark", "Taylor", role = "ctb", email = "ningkiling@gmail.com"))
... ...
@@ -17,12 +17,13 @@ Description: Functions for forward population genetic simulation in
17 17
     rates can differ between genes, and we can include mutator/antimutator
18 18
     genes (to model mutator phenotypes). Simulations
19 19
     use continuous-time models and can include driver and passenger genes
20
-    and modules.  Also included are functions for simulating random DAGs
20
+    and modules.  Also included are functions for: simulating random DAGs
21 21
     of the type found in Oncogenetic Tress, Conjunctive Bayesian Networks,
22
-    and other tumor progression models, and for plotting and sampling from
22
+    and other tumor progression models; plotting and sampling from
23 23
     single or multiple realizations of the simulations, including
24
-    single-cell sampling, as well as functions for plotting the parent-child
25
-    relationships of the clones.
24
+    single-cell sampling; plotting the parent-child relationships of the
25
+    clones; generating random fitness landscapes (Rough Mount Fuji, House
26
+    of Cards, and additive models) and plotting them.
26 27
 biocViews: BiologicalQuestion, SomaticMutation
27 28
 License: GPL (>= 3)
28 29
 URL: https://github.com/rdiaz02/OncoSimul, https://popmodels.cancercontrol.cancer.gov/gsr/packages/oncosimulr/
... ...
@@ -22,6 +22,8 @@ plotFitnessLandscape <- function(x, show_labels = TRUE,
22 22
                                  col = c("green4", "red", "yellow"),
23 23
                                  lty = c(1, 2, 3), use_ggrepel = FALSE,
24 24
                                  log = FALSE, max_num_genotypes = 2000,
25
+                                 only_accessible = FALSE,
26
+                                 accessible_th = 0,
25 27
                                  ...) {
26 28
 
27 29
     ## FIXME future:
... ...
@@ -37,7 +39,6 @@ plotFitnessLandscape <- function(x, show_labels = TRUE,
37 39
     ## adjacency matrix of genotype i go at row i and column i.  Follow back
38 40
     ## all entries in row i, and their previous, and forward all column i.
39 41
 
40
-
41 42
     ## gfm: genotype fitness matrix
42 43
     ## afe: all fitness effects
43 44
 
... ...
@@ -52,11 +53,17 @@ plotFitnessLandscape <- function(x, show_labels = TRUE,
52 53
     ## get the string representation, etc. And this is for use
53 54
     ## with OncoSimul.
54 55
 
56
+  
55 57
     tfm <- to_Fitness_Matrix(x, max_num_genotypes = max_num_genotypes)
56 58
 
57
-
58 59
     mutated <- rowSums(tfm$gfm[, -ncol(tfm$gfm)])
59 60
     gaj <- genot_to_adj_mat(tfm$gfm)
61
+    if(only_accessible) {
62
+        gaj <- filter_inaccessible(gaj, accessible_th)
63
+        remaining <- as.numeric(colnames(gaj))
64
+        mutated <- mutated[remaining]
65
+        tfm$afe <- tfm$afe[remaining, , drop = FALSE]
66
+    }
60 67
     vv <- which(!is.na(gaj), arr.ind = TRUE)
61 68
     
62 69
     ## plot(x = mutated, y = e1$Fitness, ylab = "Fitness",
... ...
@@ -171,6 +178,44 @@ plot.evalAllGenotypes <- plot.evalAllGenotypesMut <-
171 178
 ######################################################################
172 179
 
173 180
 
181
+## wrap_filter_inaccessible <- function(x, max_num_genotypes, accessible_th) {
182
+##     ## wrap it, for my consumption
183
+##     tfm <- to_Fitness_Matrix(x, max_num_genotypes = max_num_genotypes)
184
+##     mutated <- rowSums(tfm$gfm[, -ncol(tfm$gfm)])
185
+##     gaj <- genot_to_adj_mat(tfm$gfm)
186
+##     gaj <- filter_inaccessible(gaj, accessible_th)
187
+##     remaining <- as.numeric(colnames(gaj))
188
+##     mutated <- mutated[remaining]
189
+##     tfm$afe <- tfm$afe[remaining, , drop = FALSE]
190
+##     return(list(remaining = remaining,
191
+##                 mutated = mutated,
192
+##                 tfm = tfm))
193
+## }
194
+
195
+count_accessible_g <- function(gfm, accessible_th) {
196
+    gaj <- genot_to_adj_mat(gfm)
197
+    gaj <- filter_inaccessible(gaj, accessible_th)
198
+    return(ncol(gaj) - 1)
199
+}
200
+
201
+filter_inaccessible <- function(x, accessible_th) {
202
+    ## Return an adjacency matrix with only accessible paths. x is the gaj
203
+    ## matrix created in the plots. The difference between genotypes
204
+    ## separated by a hamming distance of 1
205
+    colnames(x) <- rownames(x) <- 1:ncol(x)
206
+    while(TRUE) {
207
+        ## remove first column
208
+        ## We use fact that all(na.omit(x) < u) is true if all are NA
209
+        ## so inaccessible rows are removed and thus destination columns
210
+        wrm <- which(apply(x[, -1, drop = FALSE], 2,
211
+                           function(y) {all(na.omit(y) < accessible_th)})) + 1
212
+        if(length(wrm) == 0) break;
213
+        x <- x[-wrm, -wrm, drop = FALSE]
214
+    }
215
+    x[x < 0] <- NA
216
+    return(x)
217
+}
218
+
174 219
 
175 220
 generate_matrix_genotypes <- function(g) {
176 221
     ## FIXME future: do this for order too? Only if rfitness for order.
... ...
@@ -6,10 +6,11 @@ rfitness <- function(g, c= 0.5,
6 6
                                      ## reference. If "max" this is rep(1, g)
7 7
                      scale = NULL, ## a two-element vector: min and max
8 8
                      wt_is_1 = TRUE, ## wt has fitness 1
9
-                     log = FALSE ## log only makes sense if all values >
9
+                     log = FALSE, ## log only makes sense if all values >
10 10
                                  ## 0. scale with min > 0, and/or set
11 11
                                  ## wt_is_1 = TRUE
12
-                     ) {
12
+                     min_accessible_genotypes = 0,
13
+                     accessible_th = 0) {
13 14
     ## Like Franke et al., 2011 and others of Krug. Very similar to Greene
14 15
     ## and Crona, 2014. And this allows moving from HoC to purely additive
15 16
     ## changing c and sd.
... ...
@@ -17,42 +18,57 @@ rfitness <- function(g, c= 0.5,
17 18
     ## FIXME future: do this with order too?
18 19
     ##    - do not generate a matrix of genotypes but a matrix of ordered genot.
19 20
     m <- generate_matrix_genotypes(g)
20
-    f_r <- rnorm(nrow(m), mean = 0, sd = sd)
21
-    if(inherits(reference, "character") && length(reference) == 1) {
22
-        if(reference == "random") {
23
-            reference <- m[sample(nrow(m), 1), ]
24
-        } else if(reference == "max") {
25
-            reference <- rep(1, g)
21
+    done <- FALSE
22
+    while(!done) {
23
+        f_r <- rnorm(nrow(m), mean = 0, sd = sd)
24
+        if(inherits(reference, "character") && length(reference) == 1) {
25
+            if(reference == "random") {
26
+                referenceI <- m[sample(nrow(m), 1), ]
27
+            } else if(reference == "max") {
28
+                referenceI <- rep(1, g)
29
+            } 
30
+        } else {
31
+            referenceI <- reference
32
+            }
33
+        d_reference <- apply(m, 1, function(x) sum(abs(x - referenceI)))
34
+        f_det <- -c * d_reference
35
+        ## f_det <- rowSums(m) * slope/nrow(m) ## this is Greene and Krona
36
+        fi <- f_r + f_det
37
+        
38
+        if(!is.null(scale)) {
39
+            fi <- (fi - min(fi))/(max(fi) - min(fi))
40
+            fi <- scale[1] + fi * (scale[2] - scale[1])
26 41
         }
27
-    }
28
-    d_reference <- apply(m, 1, function(x) sum(abs(x - reference)))
29
-    f_det <- -c * d_reference
30
-    ## f_det <- rowSums(m) * slope/nrow(m) ## this is Greene and Krona
31
-    fi <- f_r + f_det
32
-    
33
-    if(!is.null(scale)) {
34
-        fi <- (fi - min(fi))/(max(fi) - min(fi))
35
-        fi <- scale[1] + fi * (scale[2] - scale[1])
36
-    }
37
-    if(wt_is_1) {
38
-        ## We need to shift to avoid ratios involving negative numbers and
39
-        ## we need to avoid having any fitness at 0, specially the wt.  If
40
-        ## any negative value, add the min, and shift by the magnitude of
41
-        ## the min to avoid any at 0.
42
+        if(wt_is_1) {
43
+            ## We need to shift to avoid ratios involving negative numbers and
44
+            ## we need to avoid having any fitness at 0, specially the wt.  If
45
+            ## any negative value, add the min, and shift by the magnitude of
46
+            ## the min to avoid any at 0.
42 47
 
43
-        ## If you use scale and wt_is_1, this will move the scale. It is
44
-        ## not possible to obtain a linear transformation that will keep
45
-        ## the min and max of the scale, and wt at 1.
46
-        min_fi <- min(fi)
47
-        if(min_fi < 0)
48
-            fi <- fi + 2 * abs(min(fi))
49
-        fi <- fi/fi[1]
50
-    }
51
-    
52
-    if(log) {
53
-        fi <- log(fi/fi[1]) + 1
48
+            ## If you use scale and wt_is_1, this will move the scale. It is
49
+            ## not possible to obtain a linear transformation that will keep
50
+            ## the min and max of the scale, and wt at 1.
51
+            min_fi <- min(fi)
52
+            if(min_fi < 0)
53
+                fi <- fi + 2 * abs(min(fi))
54
+            fi <- fi/fi[1]
55
+        }
56
+        
57
+        if(log) {
58
+            fi <- log(fi/fi[1]) + 1
59
+        }
60
+        m <- cbind(m, Fitness = fi)
61
+        if(min_accessible_genotypes > 0) {
62
+            if(count_accessible_g(m, accessible_th) >= min_accessible_genotypes) {
63
+                done <- TRUE
64
+            } else {
65
+                ## Cannot start again with a fitness column
66
+                m <- m[, -ncol(m), drop = FALSE]
67
+            }
68
+        } else {
69
+            done <- TRUE
70
+        }
54 71
     }
55
-    m <- cbind(m, Fitness = fi)
56 72
     class(m) <- c(class(m), "genotype_fitness_matrix")
57 73
     return(m)
58 74
 }
... ...
@@ -1,3 +1,6 @@
1
+Changes in version 2.3.9 (2017-07-0):
2
+	- Accessible genotypes in rfitness and plotFitnessLandscape
3
+
1 4
 Changes in version 2.3.8 (2017-07-08):
2 5
 	- PDBasline default is now 1.2.
3 6
 	
... ...
@@ -16,28 +16,40 @@
16 16
 
17 17
 \usage{
18 18
 plotFitnessLandscape(x, show_labels = TRUE,
19
-                                   col = c("green4", "red", "yellow"),
20
-                                   lty = c(1, 2, 3),
21
-                                   use_ggrepel = FALSE,
22
-                                   log = FALSE, max_num_genotypes = 2000, ...)
19
+                     col = c("green4", "red", "yellow"),
20
+                     lty = c(1, 2, 3),
21
+                     use_ggrepel = FALSE,
22
+                     log = FALSE, max_num_genotypes = 2000,
23
+                     only_accessible = FALSE,
24
+                     accessible_th = 0,
25
+                     ...)
23 26
 
24 27
 \method{plot}{genotype_fitness_matrix}(x, show_labels = TRUE,
25 28
                                    col = c("green4", "red", "yellow"),
26 29
                                    lty = c(1, 2, 3),
27 30
                                    use_ggrepel = FALSE,
28
-                                   log = FALSE, max_num_genotypes = 2000, ...)
31
+                                   log = FALSE, max_num_genotypes = 2000,
32
+                                   only_accessible = FALSE,
33
+                                   accessible_th = 0,
34
+                                   ...)
29 35
 
30 36
 \method{plot}{evalAllGenotypes}(x, show_labels = TRUE,
31 37
                                    col = c("green4", "red", "yellow"),
32 38
                                    lty = c(1, 2, 3),
33 39
                                    use_ggrepel = FALSE,
34
-                                   log = FALSE, max_num_genotypes = 2000, ...)
40
+                                   log = FALSE, max_num_genotypes = 2000,
41
+                                   only_accessible = FALSE,
42
+                                   accessible_th = 0,
43
+                                   ...)
35 44
 
36 45
 \method{plot}{evalAllGenotypesMut}(x, show_labels = TRUE,
37 46
                                    col = c("green4", "red", "yellow"),
38 47
                                    lty = c(1, 2, 3),
39 48
                                    use_ggrepel = FALSE,
40
-                                   log = FALSE, max_num_genotypes = 2000, ...)
49
+                                   log = FALSE, max_num_genotypes = 2000,
50
+                                   only_accessible = FALSE,
51
+                                   accessible_th = 0,
52
+                                   ...)
41 53
 }
42 54
 
43 55
 \arguments{
... ...
@@ -89,6 +101,18 @@ plotFitnessLandscape(x, show_labels = TRUE,
89 101
   types of input, we make a call to \code{\link{evalAllGenotypes}}, and
90 102
   use this as the maximum.}
91 103
 
104
+\item{only_accessible}{If TRUE, show only accessible paths. A path is
105
+  considered accesible if, at each mutational step (i.e., with the
106
+  addition of each mutation) fitness increases by at least
107
+  \code{accessible_th}. If you set \code{only_accessible = TRUE}, the
108
+  number of genotypes displayed can be much smaller than the number of
109
+  existing genotypes if many of those genotypes are not accessible via
110
+  any path.}
111
+
112
+\item{accessible_th}{The threshold for the minimal change in fitness at
113
+  each mutation step (i.e., between successive genotypes) to be used if
114
+  \code{only_accessible = TRUE}.}
115
+
92 116
   \item{\dots}{
93 117
       Other arguments passed to \code{plot}. Not used for now.
94 118
   }
... ...
@@ -4,16 +4,15 @@
4 4
 
5 5
 \title{Generate random fitness.}
6 6
 
7
-\description{
8
-  Generate random fitness under a House of Cards, Rough Mount Fuji, or
9
-  additive model.
10
-}
7
+\description{ Generate random fitness landscapes under a House of Cards,
8
+  Rough Mount Fuji, or additive model.  }
11 9
 
12 10
 
13 11
 \usage{
14 12
 
15 13
 rfitness(g, c = 0.5, sd = 1, reference = "random", scale = NULL,
16
-         wt_is_1 = TRUE, log = FALSE)
14
+         wt_is_1 = TRUE, log = FALSE, min_accessible_genotypes = 0,
15
+         accessible_th = 0)
17 16
 }
18 17
 
19 18
 
... ...
@@ -48,7 +47,34 @@ rfitness(g, c = 0.5, sd = 1, reference = "random", scale = NULL,
48 47
 
49 48
 
50 49
 \item{log}{If TRUE, log-transform fitness.}
50
+
51
+\item{min_accessible_genotypes}{If larger than 0, the minimum number of
52
+  accessible genotypes in the fitness landscape. A genotype is
53
+  considered accessible if you can reach if from the wildtype by going
54
+  through at least one path where all changes in fitness are larger or
55
+  equal to \code{accessible_th}. The changes in fitness are considered
56
+  at each mutational step, i.e., at each addition of one mutation we
57
+  compute the difference between the genotype with \code{k + 1}
58
+  mutations minus the ancestor genotype with \code{k} mutations. Thus, a
59
+  genotype is considered accessible if there is at least one path where
60
+  fitness increases at each mutational step by at least
61
+  \code{accessible_th}.
62
+
63
+  If the condition is not satisfied, we continue generating random
64
+  fitness landscapes with the specified parameters until the condition
65
+  is satisfied.
51 66
 }
67
+
68
+\item{accessible_th}{The threshold for the minimal change in fitness at
69
+  each mutation step (i.e., between successive genotypes) that allows a
70
+  genotype to be regarded as accessible. This only applies if
71
+  \code{min_accessible_genotypes} is larger than 0.  So if you want to
72
+  allow small decreases in fitness in successive steps, use a small
73
+  negative value for \code{accessible_th}.  }
74
+
75
+} 
76
+
77
+
52 78
 \details{
53 79
 
54 80
   The model used here follows the Rough Mount Fuji model in Szendro et
55 81
new file mode 100644
... ...
@@ -0,0 +1,33 @@
1
+test_that("exercise accessible genotypes code", {
2
+    ## Because of compiler differences, this might only test the relevant
3
+    ## code in Linux. But conditions will be true in all systems.
4
+    
5
+    RNGkind("Mersenne-Twister")
6
+
7
+    set.seed(3)
8
+    r1 <- rfitness(4, reference = rep(0, 4), c = 2.1,
9
+                   min_accessible_genotypes = 1)
10
+    expect_true(max(r1[-1, "Fitness"]) >= 1.0)
11
+    
12
+    set.seed(3)
13
+    r1 <- rfitness(4, reference = rep(0, 4), c = 2.1,
14
+                   min_accessible_genotypes = 1,
15
+                   accessible_th = 0.06)
16
+    expect_true(max(r1[-1, "Fitness"]) >= 1.06)
17
+
18
+    set.seed(3)
19
+    r2 <- rfitness(4, reference = rep(0, 4), c = 2.1,
20
+                   min_accessible_genotypes = 2,
21
+                   accessible_th = 0.06)
22
+    expect_true(max(r2[-1, "Fitness"]) >= 1.06)
23
+
24
+    ## Exercising plotting code
25
+    plot(r2)
26
+    plot(r2, only_accessible = TRUE)
27
+    plot(r2, only_accessible = TRUE, accessible_th = 0.06)
28
+    ## this only show WT
29
+    plot(r2, only_accessible = TRUE, accessible_th = 0.2)
30
+   
31
+}) 
32
+
33
+set.seed(NULL)
... ...
@@ -4317,10 +4317,25 @@ samplePop(pancrPop)
4317 4317
 
4318 4318
 Now a simple multiple call to \texttt{oncoSimulIndiv} wrapped inside
4319 4319
 \Rfunction{mclapply}; this is basically the same we just did above. We set
4320
-the class of the object to allow direct usage of \Rfunction{samplePop}.
4320
+the class of the object to allow direct usage of
4321
+\Rfunction{samplePop}. (Note: in Windows \texttt{mc.cores > 1} is not
4322
+supported, so for the vignette to run in Windows, Linux, and Mac we
4323
+explicitly set it here in the call to \Rfunction{mclapply}. For regular
4324
+usage, you will not need to do this; just use whatever is appropriate for
4325
+your operating system and number of cores. As well, we do not need any of
4326
+this with \Rfunction{oncoSimulPop} because the code inside
4327
+\Rfunction{oncoSimulPop} already takes care of setting \texttt{mc.cores}
4328
+to 1 in Windows).
4321 4329
 
4322 4330
 <<>>=
4323 4331
 library(parallel)
4332
+
4333
+if(.Platform$OS.type == "windows") {
4334
+    mc.cores <- 1
4335
+} else {
4336
+    mc.cores <- 2
4337
+}
4338
+
4324 4339
 p2 <- mclapply(1:4, function(x) oncoSimulIndiv(pancr,
4325 4340
                                                detectionSize = 1e7,
4326 4341
                                                keepEvery = 10),