git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/Rtreemix@28790 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,468 @@ |
1 |
+################################################################################ |
|
2 |
+## FUNCTIONS FOR STABILITY ANALYSIS OF THE ONCOGENETIC TREES MIXTURE MODEL ## |
|
3 |
+################################################################################ |
|
4 |
+## Function for calculating the P value of a given similarity measure. |
|
5 |
+## What is the probability for obtaining the same or smaller |
|
6 |
+## value in a random vector of similarity values. |
|
7 |
+Pval.dist <- function(dist.val, random.vals) { |
|
8 |
+ return((sum(random.vals <= dist.val) + 1) /(length(random.vals) + 1)) |
|
9 |
+} |
|
10 |
+###################################################################### |
|
11 |
+## Function for calculating the Kullback-Leibler divergence between |
|
12 |
+## two discrete probability distributions p and q. |
|
13 |
+kullback.leibler <- function(p, q) { |
|
14 |
+ if(length(p) != length(q)) |
|
15 |
+ stop("Error: The distribution vectors have different lengths!") |
|
16 |
+ return(sum(p * log(p / q))) |
|
17 |
+} |
|
18 |
+###################################################################### |
|
19 |
+## Function for calculating the L1 distance between |
|
20 |
+## two discrete probability distributions p and q. |
|
21 |
+L1.dist <- function(p, q) { |
|
22 |
+ if(length(p) != length(q)) |
|
23 |
+ stop("Error: The distribution vectors have different lengths!") |
|
24 |
+ |
|
25 |
+ return(sum(abs(p - q))) |
|
26 |
+} |
|
27 |
+###################################################################### |
|
28 |
+## Function for calculating the cosine distance between |
|
29 |
+## two discrete probability distributions p and q. |
|
30 |
+cosin.dist <- function(p, q) { |
|
31 |
+ if(length(p) != length(q)) |
|
32 |
+ stop("Error: The distribution vectors have different lengths!") |
|
33 |
+ |
|
34 |
+ return(1 - (sum (p * q) / (L2.norm(p) * L2.norm(q)) )) |
|
35 |
+} |
|
36 |
+###################################################################### |
|
37 |
+## Function for calculating the L2 norm of a given vector x. |
|
38 |
+L2.norm <- function(x) { |
|
39 |
+ return(sqrt(sum(x * x))) |
|
40 |
+} |
|
41 |
+###################################################################### |
|
42 |
+## Function for calculating the Euclidian distance between |
|
43 |
+## two vectors x and y. |
|
44 |
+euclidian.dist <- function(x, y) { |
|
45 |
+ if(length(x) != length(y)) |
|
46 |
+ stop("Error: The vectors have different lengths!") |
|
47 |
+ |
|
48 |
+ return(sqrt(drop((x - y) %*% (x - y)))) |
|
49 |
+} |
|
50 |
+###################################################################### |
|
51 |
+## Function for calculating the rank-correlation distance between |
|
52 |
+## two vectors x and y. |
|
53 |
+rank.cor.dist <- function(x, y) { |
|
54 |
+ if(length(x) != length(y)) |
|
55 |
+ stop("Error: The vectors have different length!") |
|
56 |
+ |
|
57 |
+ return(1 - rcorr(x, y, type = "spearman")[[1]][1, 2]) |
|
58 |
+} |
|
59 |
+###################################################################### |
|
60 |
+## Function that implements a similarity measure for comparing the topologies |
|
61 |
+## of the trees of two mixture models mixture1 and mixture2. |
|
62 |
+## It returns a value that uses the number of different edges for caracterizing |
|
63 |
+## the difference between the models. |
|
64 |
+## For the comparisson it is necessary that the two models have the same |
|
65 |
+## number of tree components build on the same number of genetic events. |
|
66 |
+## It is assumed that the mixtures have at least two components (the first one is the noise). |
|
67 |
+comp.models <- function(mixture1, mixture2) { |
|
68 |
+ if((class(mixture1) != "RtreemixModel") || (class(mixture2) != "RtreemixModel")) |
|
69 |
+ stop("The specified mixture models should be of type RtreemixModel!") |
|
70 |
+ |
|
71 |
+ if(numTrees(mixture1) != numTrees(mixture2)) |
|
72 |
+ stop("The two specified mixture models don't have the same number of trees!") |
|
73 |
+ |
|
74 |
+ if(eventsNum(mixture1) != eventsNum(mixture2)) |
|
75 |
+ stop("The two specified mixture models don't have the same number of events!") |
|
76 |
+ no.trees <- numTrees(mixture1) |
|
77 |
+ if (no.trees == 1) |
|
78 |
+ stop("The mixture models should have at least two tree components, where the first one is the noise component!") |
|
79 |
+ |
|
80 |
+ no.events <- eventsNum(mixture1) |
|
81 |
+ |
|
82 |
+ true.order <- list() ## list specifying the edges in the components of mixture1 |
|
83 |
+ fit.order <- list() ## list specifying the edges in the components of mixture2 |
|
84 |
+ |
|
85 |
+ ## Build the vectors that characterize the edges of the components of the mixture models. |
|
86 |
+ ## The noise components are ignored since they always have the same (star) topology. |
|
87 |
+ for(k in 2:no.trees) { |
|
88 |
+ true.vec <- character(0) |
|
89 |
+ fit.vec <- character(0) |
|
90 |
+ for(l in 2:no.events) { |
|
91 |
+ ch <- names(which(sapply(edges(getTree(mixture1, k)), function(x, el) {el %in% x}, nodes(getTree(mixture1, k))[l]))) |
|
92 |
+ true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch)) |
|
93 |
+ ch <- names(which(sapply(edges(getTree(mixture2, k)), function(x, el) {el %in% x}, nodes(getTree(mixture2, k))[l]))) |
|
94 |
+ fit.vec <- c(fit.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch)) |
|
95 |
+ } |
|
96 |
+ names(true.vec) <- nodes(getTree(mixture1, k))[-1] |
|
97 |
+ true.order <- c(true.order, list(true.vec)) |
|
98 |
+ names(fit.vec) <- nodes(getTree(mixture2, k))[-1] |
|
99 |
+ fit.order <- c(fit.order, list(fit.vec)) |
|
100 |
+ } |
|
101 |
+ ## Build the comparisson matrix. |
|
102 |
+ ## The (i, j) element is the number of distinct edges of the |
|
103 |
+ ## (i + 1)-th and (j + 1)-th component of the models |
|
104 |
+ ## mixture1 and mixture2 respectively. |
|
105 |
+ comp.mat <- matrix(nrow = no.trees - 1, ncol = no.trees - 1) |
|
106 |
+ for(i in 1:(no.trees - 1)) |
|
107 |
+ for(j in 1:(no.trees - 1)) { |
|
108 |
+ comp.mat[i, j] <- sum(true.order[[i]] != fit.order[[j]]) |
|
109 |
+ } |
|
110 |
+ ## Form the similarity pairs and calculate the similarity of the models |
|
111 |
+ ## as a sum of the number of different edges of the trees in the similarity pairs: |
|
112 |
+ if(no.trees > 2) { |
|
113 |
+ rc <- which(comp.mat == min(comp.mat), arr.ind = TRUE) |
|
114 |
+ diff.sum <- min(comp.mat) |
|
115 |
+ row.index <- c(rc[1, 1]) ## get the row index of the minimum |
|
116 |
+ col.index <- c(rc[1, 2]) ## get the column index of the minimum |
|
117 |
+ |
|
118 |
+ for(m in 1:(nrow(comp.mat) - 1)) { |
|
119 |
+ rc <- which(comp.mat == min(comp.mat[-(row.index), -(col.index)]), arr.ind = TRUE) |
|
120 |
+ diff.sum <- diff.sum + min(comp.mat[-(row.index), -(col.index)]) |
|
121 |
+ row.index <- c(row.index, rc[1, 1]) |
|
122 |
+ col.index <- c(col.index, rc[1, 2]) |
|
123 |
+ } |
|
124 |
+ } else { |
|
125 |
+ diff.sum <- comp.mat[1, 1] |
|
126 |
+ } |
|
127 |
+ ## Return a result between 0 and 1. |
|
128 |
+ return(diff.sum/((no.trees - 1) * (no.events - 1))) |
|
129 |
+ |
|
130 |
+} |
|
131 |
+###################################################################### |
|
132 |
+## Function that assignes to each node the level at which that node is |
|
133 |
+## in a specific tree (tree.num) of the trees mixture model. |
|
134 |
+## The start.val is the maximum depth of the tree with which the tree |
|
135 |
+## specified with tree.num will be compared. |
|
136 |
+get.tree.levels <- function(mixture, tree.num, start.val) { |
|
137 |
+ root <- nodes(getTree(mixture, tree.num))[1] |
|
138 |
+ levels <- acc(getTree(mixture, tree.num), root) |
|
139 |
+ ## vec <- rep(max(levels[[1]]), eventsNum(mixture) - 1) |
|
140 |
+ vec <- rep(start.val, eventsNum(mixture) - 1) |
|
141 |
+ names(vec) <- nodes(getTree(mixture, tree.num))[-1] |
|
142 |
+ |
|
143 |
+ vec[names(levels[[1]])] <- levels[[1]] |
|
144 |
+ |
|
145 |
+ return(vec) |
|
146 |
+} |
|
147 |
+###################################################################### |
|
148 |
+## Function that implements a similarity measure for comparing the topologies |
|
149 |
+## of the trees of two mixture models mixture1 and mixture2. |
|
150 |
+## It returns a value that uses the number of different edges and the depth at |
|
151 |
+## which the events are, for caracterizing the difference between the models. |
|
152 |
+## This measure is more application specific, since the depth at which the |
|
153 |
+## events are in a tree is very important for disease progression. |
|
154 |
+## For the comparisson it is necessary that the two models have the same |
|
155 |
+## number of tree components build on the same number of genetic events. |
|
156 |
+## It is assumed that the mixtures have at least two components (the first one is the noise). |
|
157 |
+comp.models.levels <- function(mixture1, mixture2) { |
|
158 |
+ if((class(mixture1) != "RtreemixModel") || (class(mixture2) != "RtreemixModel")) |
|
159 |
+ stop("The specified mixture models should be of type RtreemixModel!") |
|
160 |
+ |
|
161 |
+ if(numTrees(mixture1) != numTrees(mixture2)) |
|
162 |
+ stop("The two specified mixture models don't have the same number of trees!") |
|
163 |
+ |
|
164 |
+ if(eventsNum(mixture1) != eventsNum(mixture2)) |
|
165 |
+ stop("The two specified mixture models don't have the same number of events!") |
|
166 |
+ no.trees <- numTrees(mixture1) |
|
167 |
+ if (no.trees == 1) |
|
168 |
+ stop("The mixture models should have at least two tree components, where the first one is the noise component!") |
|
169 |
+ |
|
170 |
+ no.events <- eventsNum(mixture1) |
|
171 |
+ |
|
172 |
+ true.order <- list() ## list specifying the edges in the components of mixture1 |
|
173 |
+ fit.order <- list() ## list specifying the edges in the components of mixture2 |
|
174 |
+ |
|
175 |
+ start.vals1 <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the tree components in mixture1 (without the noise component) |
|
176 |
+ start.vals2 <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the tree components in mixture2 (without the noise component) |
|
177 |
+ |
|
178 |
+ ## Build the vectors that characterize the edges of the components of the mixture models. |
|
179 |
+ ## The noise components are ignored since they always have the same (star) topology. |
|
180 |
+ for(k in 2:no.trees) { |
|
181 |
+ true.vec <- character(0) |
|
182 |
+ fit.vec <- character(0) |
|
183 |
+ for(l in 2:no.events) { |
|
184 |
+ ch <- names(which(sapply(edges(getTree(mixture1, k)), function(x, el) {el %in% x}, nodes(getTree(mixture1, k))[l]))) |
|
185 |
+ true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch)) |
|
186 |
+ ch <- names(which(sapply(edges(getTree(mixture2, k)), function(x, el) {el %in% x}, nodes(getTree(mixture2, k))[l]))) |
|
187 |
+ fit.vec <- c(fit.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch)) |
|
188 |
+ } |
|
189 |
+ names(true.vec) <- nodes(getTree(mixture1, k))[-1] |
|
190 |
+ true.order <- c(true.order, list(true.vec)) |
|
191 |
+ names(fit.vec) <- nodes(getTree(mixture2, k))[-1] |
|
192 |
+ fit.order <- c(fit.order, list(fit.vec)) |
|
193 |
+ |
|
194 |
+ ## Assign the (maximal depths + 1) for each tree in the mixtures |
|
195 |
+ start.vals1[k - 1] <- max(acc(getTree(mixture1, k), nodes(getTree(mixture1, k))[1])[[1]] + 1, na.remove = TRUE) |
|
196 |
+ start.vals2[k - 1] <- max(acc(getTree(mixture2, k), nodes(getTree(mixture2, k))[1])[[1]] + 1, na.remove = TRUE) |
|
197 |
+ } |
|
198 |
+ ## Build the comparisson matrix. |
|
199 |
+ ## The (i, j) element is the number of distinct edges of the |
|
200 |
+ ## (i + 1)-th and (j + 1)-th component of the models |
|
201 |
+ ## mixture1 and mixture2 respectively. |
|
202 |
+ comp.mat <- matrix(nrow = no.trees - 1, ncol = no.trees - 1) |
|
203 |
+ for(i in 1:(no.trees - 1)) |
|
204 |
+ for(j in 1:(no.trees - 1)) { |
|
205 |
+ comp.mat[i, j] <- sum(true.order[[i]] != fit.order[[j]]) |
|
206 |
+ } |
|
207 |
+ ## Form the similarity pairs and calculate the similarity of the models |
|
208 |
+ ## as a sum of the number of different edges of the trees in the similarity pairs |
|
209 |
+ ## and their corresponding L1-distance of the levels of the events: |
|
210 |
+ if(no.trees > 2) { |
|
211 |
+ rc <- which(comp.mat == min(comp.mat), arr.ind = TRUE) |
|
212 |
+ diff.sum <- min(comp.mat) + |
|
213 |
+ L1.dist(get.tree.levels(mixture1, rc[1, 1] + 1, start.vals2[rc[1, 2]]), |
|
214 |
+ get.tree.levels(mixture2, rc[1, 2] + 1, start.vals1[rc[1, 1]])) |
|
215 |
+ row.index <- c(rc[1, 1]) ## get the row index of the minimum |
|
216 |
+ col.index <- c(rc[1, 2]) ## get the column index of the minimum |
|
217 |
+ |
|
218 |
+ for(m in 1:(nrow(comp.mat) - 1)) { |
|
219 |
+ rc <- which(comp.mat == min(comp.mat[-(row.index), -(col.index)]), arr.ind = TRUE) |
|
220 |
+ diff.sum <- diff.sum + min(comp.mat[-(row.index), -(col.index)]) + |
|
221 |
+ L1.dist(get.tree.levels(mixture1, rc[1, 1] + 1, start.vals2[rc[1, 2]]), |
|
222 |
+ get.tree.levels(mixture2, rc[1, 2] + 1, start.vals1[rc[1, 1]])) |
|
223 |
+ row.index <- c(row.index, rc[1, 1]) |
|
224 |
+ col.index <- c(col.index, rc[1, 2]) |
|
225 |
+ } |
|
226 |
+ } else { |
|
227 |
+ if(no.trees == 2) { |
|
228 |
+ diff.sum <- comp.mat[1, 1] + |
|
229 |
+ L1.dist(get.tree.levels(mixture1, 2, start.vals2[1]), |
|
230 |
+ get.tree.levels(mixture2, 2, start.vals1[1])) |
|
231 |
+ } else { |
|
232 |
+ stop("The specified mixture models must have at least two tree components, where the first one is the noise component!") |
|
233 |
+ } |
|
234 |
+ } |
|
235 |
+ |
|
236 |
+ return(diff.sum) |
|
237 |
+ |
|
238 |
+} |
|
239 |
+###################################################################### |
|
240 |
+## Function that implements a similarity measure for comparing the topologies |
|
241 |
+## of the nontrivial tree components of a specified mixture model. |
|
242 |
+## It returns a value that uses the number of different edges for caracterizing |
|
243 |
+## the difference of the trees in the model. |
|
244 |
+## For the comparisson it is necessary that the model has at least two |
|
245 |
+## nontrivial components. |
|
246 |
+comp.trees <- function(model) { |
|
247 |
+ no.trees <- numTrees(model) |
|
248 |
+ if(no.trees <= 2) |
|
249 |
+ stop("The specified mixture model should have at least two nontrivial tree components.") |
|
250 |
+ |
|
251 |
+ no.events <- eventsNum(model) |
|
252 |
+ true.order <- list() ## list specifying the edges in the nontrivial components of the model |
|
253 |
+ ## Build the vectors that characterize the |
|
254 |
+ ## edges of the nontrivial components of the mixture model. |
|
255 |
+ for(k in 2:no.trees) { |
|
256 |
+ true.vec <- character(0) |
|
257 |
+ for(l in 2:no.events) { |
|
258 |
+ ch <- names(which(sapply(edges(getTree(model, k)), function(x, el) {el %in% x}, nodes(getTree(model, k))[l]))) |
|
259 |
+ true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch)) |
|
260 |
+ } |
|
261 |
+ names(true.vec) <- nodes(getTree(model, k))[-1] |
|
262 |
+ true.order <- c(true.order, list(true.vec)) |
|
263 |
+ } |
|
264 |
+ ## Calculate the similarity of the nontrivial components of the model |
|
265 |
+ ## as a sum of the number of different edges of all combinations of |
|
266 |
+ ## two different nontrivial trees: |
|
267 |
+ diff.sum <- 0 |
|
268 |
+ for(k1 in 2:(no.trees - 1)) |
|
269 |
+ for(k2 in (k1 + 1):no.trees) |
|
270 |
+ diff.sum <- diff.sum + sum(true.order[[k1 - 1]] != true.order[[k2 - 1]]) |
|
271 |
+ |
|
272 |
+ ## Return a result between 0 and 1. |
|
273 |
+ return(diff.sum / (choose((no.trees - 1), 2) * (no.events - 1))) |
|
274 |
+} |
|
275 |
+###################################################################### |
|
276 |
+## Function that implements a similarity measure for comparing the topologies |
|
277 |
+## of the nontrivial tree components of a specified mixture model. |
|
278 |
+## It returns a value that uses the number of different edges and the depth at |
|
279 |
+## which the events are, for caracterizing the difference of the trees in the model. |
|
280 |
+## For the comparisson it is necessary that the model has at least two |
|
281 |
+## nontrivial components. |
|
282 |
+comp.trees.levels <- function(model) { |
|
283 |
+ no.trees <- numTrees(model) |
|
284 |
+ if(no.trees <= 2) |
|
285 |
+ stop("The specified mixture model should have at least two nontrivial tree components.") |
|
286 |
+ |
|
287 |
+ no.events <- eventsNum(model) |
|
288 |
+ start.vals <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the nontrivial tree components in the model |
|
289 |
+ true.order <- list() ## list specifying the edges in the nontrivial components of the model |
|
290 |
+ |
|
291 |
+ ## Build the vectors that characterize the |
|
292 |
+ ## edges of the nontrivial components of the mixture model. |
|
293 |
+ for(k in 2:no.trees) { |
|
294 |
+ true.vec <- character(0) |
|
295 |
+ for(l in 2:no.events) { |
|
296 |
+ ch <- names(which(sapply(edges(getTree(model, k)), function(x, el) {el %in% x}, nodes(getTree(model, k))[l]))) |
|
297 |
+ true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch)) |
|
298 |
+ } |
|
299 |
+ names(true.vec) <- nodes(getTree(model, k))[-1] |
|
300 |
+ true.order <- c(true.order, list(true.vec)) |
|
301 |
+ |
|
302 |
+ ## Assign the maximal depths + 1 for each tree in the mixtures |
|
303 |
+ start.vals[k - 1] <- max(acc(getTree(model, k), nodes(getTree(model, k))[1])[[1]] + 1, na.remove = TRUE) |
|
304 |
+ } |
|
305 |
+ ## Calculate the similarity of the models as a sum of the number of |
|
306 |
+ ## different edges of all possible combinations of two different |
|
307 |
+ ## nontrivial trees in the model and their corresponding L1-distance |
|
308 |
+ ## of the levels of the events: |
|
309 |
+ diff.sum <- 0 |
|
310 |
+ for(k1 in 2:(no.trees - 1)) { |
|
311 |
+ for(k2 in (k1 + 1):no.trees) { |
|
312 |
+ diff.sum <- diff.sum + (sum(true.order[[k1 - 1]] != true.order[[k2 - 1]]) |
|
313 |
+ + L1.dist(get.tree.levels(model, k1, start.vals[k2 - 1]), |
|
314 |
+ get.tree.levels(model, k2, start.vals[k1 - 1]))) |
|
315 |
+ } |
|
316 |
+ } |
|
317 |
+ return(diff.sum) |
|
318 |
+} |
|
319 |
+###################################################################### |
|
320 |
+## Stability analysis of the oncogenetic trees mixture model. |
|
321 |
+## This includes stability analysis on different levels of the mixture |
|
322 |
+## model: GPS values, encoded probability distribution, tree topologies. |
|
323 |
+## The function outputs a list of analysis for the mentioned features. |
|
324 |
+## Each analysis contains the values of different similarity measures |
|
325 |
+## with their corresponding p-values. |
|
326 |
+## The models should have at least two components (the first one is the noise). |
|
327 |
+stability.sim <- function(no.trees = 3, ## number of tree components |
|
328 |
+ no.events = 9, ## number of genetic events |
|
329 |
+ prob = c(0.2, 0.8), ## interval for the edgeweights of the random mixture models |
|
330 |
+ no.draws = 300, ## number of samples drawn from the random models used for learning a mixture model |
|
331 |
+ no.rands = 100, ## number of rands for calculatin the p-values |
|
332 |
+ no.sim = 1 ## number of simulation iterations |
|
333 |
+ ) { |
|
334 |
+ ## Set the true types. |
|
335 |
+ no.trees <- as.integer(no.trees) |
|
336 |
+ no.events <- as.integer(no.events) |
|
337 |
+ no.draws <- as.integer(no.draws) |
|
338 |
+ no.rands <- as.integer(no.rands) |
|
339 |
+ no.sim <- as.integer(no.sim) |
|
340 |
+ ## Check if the necessary parameters are provided and have the correct form. |
|
341 |
+ if(no.trees < 2) |
|
342 |
+ stop("The specified mixture model should have at least two |
|
343 |
+tree components (where the first one is the noise).") |
|
344 |
+ if (no.events < 1) |
|
345 |
+ stop("The number of events must be an integer greater than zero!") |
|
346 |
+ if(no.draws < 1) |
|
347 |
+ stop("The number of draws (number of samples) must be an integer greater than zero!") |
|
348 |
+ if (no.rands < 1) |
|
349 |
+ stop("The number of random models for the p-values must be an integer greater than zero!") |
|
350 |
+ if (no.sim < 1) |
|
351 |
+ stop("The number of iterations for the waiting time simulation |
|
352 |
+must be an integer greater than zero!") |
|
353 |
+ if(!missing(prob)) { |
|
354 |
+ if(!is.numeric(prob) || (length(prob) != 2)) |
|
355 |
+ stop("Specify the boundaries of the conditional probabilities as a numeric vector |
|
356 |
+of length two = c(min, max)!") |
|
357 |
+ if(prob[2] < prob[1]) |
|
358 |
+ stop("In the probability vector the lower boundary must be smaller than the |
|
359 |
+upper boundary!") |
|
360 |
+ } |
|
361 |
+ |
|
362 |
+ simGPS <- numeric(0) ## results from the stability analysis of the GPS values |
|
363 |
+ topo.dif <- numeric(0) ## results from the stability analysis of the topologies of the tree components of mixture models based on comp.models |
|
364 |
+ topo.levels.dif <- numeric(0) ## results from the stability analysis of the topologies of the tree components of mixture models based on comp.models.levels |
|
365 |
+ result.distr <- numeric(0) ## results from the stability analysis of the probability distribution encoded by the model |
|
366 |
+ |
|
367 |
+ mat.true.gps <- numeric(0) ## matrix containing the true GPS values from each simulation iteration |
|
368 |
+ mat.fit.gps <- numeric(0) ## matrix containing the corresponding fitted GPS values from each simulation iteration |
|
369 |
+ |
|
370 |
+ list.true.models <- list() ## list containing the true models from each simulation iteration |
|
371 |
+ list.fit.models <- list() ## list containing the corresponding fitted models from each simulation iteration |
|
372 |
+ ## Simulation iterations: |
|
373 |
+ for(i in 1:as.integer(no.sim)) { |
|
374 |
+ print(i) ## simulation iteration |
|
375 |
+ ## Pick a true model from the space of random models and draw data from it. |
|
376 |
+ true.m <- generate(K = no.trees, no.events = no.events, prob = prob) |
|
377 |
+ Weights(true.m) <- c(0.05, rep(0.95/(no.trees - 1), (no.trees - 1))) |
|
378 |
+ sim.data <- sim(model = true.m, no.draws = no.draws) |
|
379 |
+ list.true.models <- c(list.true.models, true.m) |
|
380 |
+ ## Calculate the GPS and the distribution encoded by true.m |
|
381 |
+ true.gps <- GPS(gps(model = true.m, data = sim.data, no.sim = 10000)) |
|
382 |
+ mat.true.gps <- cbind(mat.true.gps, true.gps) |
|
383 |
+ true.distr <- distribution(model = true.m)$probability |
|
384 |
+ ## Fit a mixture model to the data sim.data |
|
385 |
+ fm <- fit(data = sim.data, K = no.trees, noise = TRUE, equal.edgeweights = TRUE, eps = 0.01) |
|
386 |
+ list.fit.models <- c(list.fit.models, fm) |
|
387 |
+ ## Calculate the GPS and the distribution encoded by fm |
|
388 |
+ fit.gps <- GPS(gps(model = fm, data = sim.data, no.sim = 10000)) |
|
389 |
+ mat.fit.gps <- cbind(mat.fit.gps, fit.gps) |
|
390 |
+ fit.distr <- distribution(model = fm)$probability |
|
391 |
+ ## Compute different distances between the distributions induced |
|
392 |
+ ## by the true and fitted models: |
|
393 |
+ true.cosin <- cosin.dist(true.distr, fit.distr) |
|
394 |
+ true.l1 <- L1.dist(true.distr, fit.distr) |
|
395 |
+ true.kull <- kullback.leibler(true.distr, fit.distr) |
|
396 |
+ ## Compute different distances between the GPS values |
|
397 |
+ ## for the true and fitted models: |
|
398 |
+ true.euclGPS <- euclidian.dist(true.gps, fit.gps) |
|
399 |
+ true.rank.dist <- rank.cor.dist(true.gps, fit.gps) |
|
400 |
+ ## Compute different similarity measures for comparing the |
|
401 |
+ ## topologies of the nontrivial components of the true and fitted models: |
|
402 |
+ true.topo.dif <- comp.models(true.m, fm) |
|
403 |
+ true.topo.levels.dif <- comp.models.levels(true.m, fm) |
|
404 |
+ ## Vectors for keeping the values for the different features |
|
405 |
+ ## of the random mixture models needed for the p-values calculation: |
|
406 |
+ rand.euclGPS <- numeric(0) |
|
407 |
+ rand.rankGPS <- numeric(0) |
|
408 |
+ |
|
409 |
+ rand.cos.distr <- numeric(0) |
|
410 |
+ rand.l1.distr <- numeric(0) |
|
411 |
+ rand.kull.distr <- numeric(0) |
|
412 |
+ |
|
413 |
+ rand.topo <- numeric(0) |
|
414 |
+ rand.topo.levels <- numeric(0) |
|
415 |
+ ## Create the vectors with random values for the p-values calculation: |
|
416 |
+ for(j in 1:(as.integer(no.rands) - 1)) { |
|
417 |
+ ## Generate random model and calculate the GPS and distribution: |
|
418 |
+ model <- generate(K = no.trees, no.events = no.events, prob = prob) |
|
419 |
+ Weights(model) <- c(0.05, rep(0.95/(no.trees - 1), (no.trees - 1))) |
|
420 |
+ random.gps <- GPS(gps(model = model, data = sim.data, no.sim = 10000)) |
|
421 |
+ random.distr <- distribution(model = model)$probability |
|
422 |
+ ## GPS: |
|
423 |
+ rand.euclGPS <- c(rand.euclGPS, euclidian.dist(true.gps, random.gps)) |
|
424 |
+ rand.rankGPS <- c(rand.rankGPS, rank.cor.dist(true.gps, random.gps)) |
|
425 |
+ ## Distribution: |
|
426 |
+ rand.cos.distr <- c(rand.cos.distr, cosin.dist(true.distr, random.distr)) |
|
427 |
+ rand.l1.distr <- c(rand.l1.distr, L1.dist(true.distr, random.distr)) |
|
428 |
+ rand.kull.distr <- c(rand.kull.distr, kullback.leibler(true.distr, random.distr)) |
|
429 |
+ ## Tree topologies: |
|
430 |
+ rand.topo <- c(rand.topo, comp.models(true.m, model)) |
|
431 |
+ rand.topo.levels <- c(rand.topo.levels, comp.models.levels(true.m, model)) |
|
432 |
+ } |
|
433 |
+ ## Stability analysis of GPS values |
|
434 |
+ ## (using Euclidian distance and rank correlation distance as similarity measures): |
|
435 |
+ simGPS <- rbind(simGPS, |
|
436 |
+ c(true.euclGPS, Pval.dist(true.euclGPS, rand.euclGPS), |
|
437 |
+ true.rank.dist, Pval.dist(true.rank.dist, rand.rankGPS))) |
|
438 |
+ ## Stability analysis of induced distributions |
|
439 |
+ ## (using cosine distance, L1 distance, Kullback-Leibler divergence as similarity measures): |
|
440 |
+ result.distr <- rbind(result.distr, |
|
441 |
+ c(true.cosin, Pval.dist(true.cosin, rand.cos.distr), |
|
442 |
+ true.l1, Pval.dist(true.l1, rand.l1.distr), |
|
443 |
+ true.kull, Pval.dist(true.kull, rand.kull.distr))) |
|
444 |
+ ## Stability analysis of tree topologies |
|
445 |
+ ## (using comp.models and comp.models.levels as similarity measures): |
|
446 |
+ topo.dif <- rbind(topo.dif, |
|
447 |
+ c(true.topo.dif, Pval.dist(true.topo.dif, rand.topo))) |
|
448 |
+ topo.levels.dif <- rbind(topo.levels.dif, |
|
449 |
+ c(true.topo.levels.dif, Pval.dist(true.topo.levels.dif, rand.topo.levels))) |
|
450 |
+ |
|
451 |
+ } |
|
452 |
+ ## Organize the output: |
|
453 |
+ colnames(simGPS) <- c("eucl.dist", "p.val.eucl", "rank.cor.dist", "p.val.rank") |
|
454 |
+ colnames(result.distr) <- c("cos", "p.val.cos", "L1", "p.val.L1", "KL", "p.val.KL") |
|
455 |
+ colnames(topo.dif) <- c("topo.dif", "p.value") |
|
456 |
+ colnames(topo.levels.dif) <- c("topo.levels.dif", "p.value") |
|
457 |
+ colnames(mat.true.gps) <- c(1:no.sim) |
|
458 |
+ colnames(mat.fit.gps) <- c(1:no.sim) |
|
459 |
+ output <- list(simGPS, result.distr, |
|
460 |
+ topo.dif, topo.levels.dif, |
|
461 |
+ mat.true.gps, mat.fit.gps, |
|
462 |
+ list.true.models, list.fit.models) |
|
463 |
+ names(output) <- c("GPS", "Distribution", "Tree topologies (distinct edges)", |
|
464 |
+ "Tree topologies (distinct edges + levelsL1dist)", "True GPS", |
|
465 |
+ "Fitted GPS", "True models", "Fitted models") |
|
466 |
+ |
|
467 |
+ return(output) |
|
468 |
+} |