```################################################################################
##  FUNCTIONS FOR STABILITY ANALYSIS OF THE ONCOGENETIC TREES MIXTURE MODEL   ##
################################################################################
## Function for calculating the P value of a given similarity measure.
## What is the probability for obtaining the same or smaller
## value in a random vector of similarity values.
Pval.dist <- function(dist.val, random.vals) {
return((sum(random.vals <= dist.val) + 1) /(length(random.vals) + 1))
}
######################################################################
## Function for calculating the Kullback-Leibler divergence between
## two discrete probability distributions p and q.
kullback.leibler <- function(p, q) {
if(length(p) != length(q))
stop("Error: The distribution vectors have different lengths!")
return(sum(p * log(p / q)))
}
######################################################################
## Function for calculating the L1 distance between
## two discrete probability distributions p and q.
L1.dist <- function(p, q) {
if(length(p) != length(q))
stop("Error: The distribution vectors have different lengths!")

return(sum(abs(p - q)))
}
######################################################################
## Function for calculating the cosine distance between
## two discrete probability distributions p and q.
cosin.dist <- function(p, q) {
if(length(p) != length(q))
stop("Error: The distribution vectors have different lengths!")

return(1 - (sum (p * q) / (L2.norm(p) * L2.norm(q)) ))
}
######################################################################
## Function for calculating the L2 norm of a given vector x.
L2.norm <- function(x) {
return(sqrt(sum(x * x)))
}
######################################################################
## Function for calculating the Euclidian distance between
## two vectors x and y.
euclidian.dist <- function(x, y) {
if(length(x) != length(y))
stop("Error: The vectors have different lengths!")

return(sqrt(drop((x - y) %*% (x - y))))
}
######################################################################
## Function for calculating the rank-correlation distance between
## two vectors x and y.
rank.cor.dist <- function(x, y) {
if(length(x) != length(y))
stop("Error: The vectors have different length!")

return(1 - rcorr(x, y, type = "spearman")[[1]][1, 2])
}
######################################################################
## Function that implements a similarity measure for comparing the topologies
## of the trees of two mixture models mixture1 and mixture2.
## It returns a value that uses the number of different edges for caracterizing
## the difference between the models.
## For the comparisson it is necessary that the two models have the same
## number of tree components build on the same number of genetic events.
## It is assumed that the mixtures have at least two components (the first one is the noise).
comp.models <- function(mixture1, mixture2) {
if((class(mixture1) != "RtreemixModel") || (class(mixture2) != "RtreemixModel"))
stop("The specified mixture models should be of type RtreemixModel!")

if(numTrees(mixture1) != numTrees(mixture2))
stop("The two specified mixture models don't have the same number of trees!")

if(eventsNum(mixture1) != eventsNum(mixture2))
stop("The two specified mixture models don't have the same number of events!")
no.trees <- numTrees(mixture1)
if (no.trees == 1)
stop("The mixture models should have at least two tree components, where the first one is the noise component!")

no.events <- eventsNum(mixture1)

true.order <- list() ## list specifying the edges in the components of mixture1
fit.order <- list() ## list specifying the edges in the components of mixture2

## Build the vectors that characterize the edges of the components of the mixture models.
## The noise components are ignored since they always have the same (star) topology.
for(k in 2:no.trees) {
true.vec <- character(0)
fit.vec <- character(0)
for(l in 2:no.events) {
ch <- names(which(sapply(edges(getTree(mixture1, k)), function(x, el) {el %in% x}, nodes(getTree(mixture1, k))[l])))
true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))
ch <- names(which(sapply(edges(getTree(mixture2, k)), function(x, el) {el %in% x}, nodes(getTree(mixture2, k))[l])))
fit.vec <- c(fit.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))
}
names(true.vec) <- nodes(getTree(mixture1, k))[-1]
true.order <- c(true.order, list(true.vec))
names(fit.vec) <- nodes(getTree(mixture2, k))[-1]
fit.order <- c(fit.order, list(fit.vec))
}
## Build the comparisson matrix.
## The (i, j) element is the number of distinct edges of the
## (i + 1)-th and (j + 1)-th component of the models
## mixture1 and mixture2 respectively.
comp.mat <- matrix(nrow = no.trees - 1, ncol = no.trees - 1)
for(i in 1:(no.trees - 1))
for(j in 1:(no.trees - 1)) {
comp.mat[i, j] <- sum(true.order[[i]] != fit.order[[j]])
}
## Form the similarity pairs and calculate the similarity of the models
## as a sum of the number of different edges of the trees in the similarity pairs:
if(no.trees > 2) {
rc <- which(comp.mat == min(comp.mat), arr.ind = TRUE)
diff.sum <- min(comp.mat)
row.index <- c(rc[1, 1]) ## get the row index of the minimum
col.index <- c(rc[1, 2]) ## get the column index of the minimum

for(m in 1:(nrow(comp.mat) - 1)) {
rc <- which(comp.mat == min(comp.mat[-(row.index), -(col.index)]), arr.ind = TRUE)
diff.sum <- diff.sum + min(comp.mat[-(row.index), -(col.index)])
row.index <- c(row.index, rc[1, 1])
col.index <- c(col.index, rc[1, 2])
}
} else {
diff.sum <- comp.mat[1, 1]
}
## Return a result between 0 and 1.
return(diff.sum/((no.trees - 1) * (no.events - 1)))

}
######################################################################
## Function that assignes to each node the level at which that node is
## in a specific tree (tree.num) of the trees mixture model.
## The start.val is the maximum depth of the tree with which the tree
## specified with tree.num will be compared.
get.tree.levels <- function(mixture, tree.num, start.val) {
root <- nodes(getTree(mixture, tree.num))[1]
levels <- acc(getTree(mixture, tree.num), root)
## vec <- rep(max(levels[[1]]), eventsNum(mixture) - 1)
vec <- rep(start.val, eventsNum(mixture) - 1)
names(vec) <- nodes(getTree(mixture, tree.num))[-1]

vec[names(levels[[1]])] <- levels[[1]]

return(vec)
}
######################################################################
## Function that implements a similarity measure for comparing the topologies
## of the trees of two mixture models mixture1 and mixture2.
## It returns a value that uses the number of different edges and the depth at
## which the events are, for caracterizing the difference between the models.
## This measure is more application specific, since the depth at which the
## events are in a tree is very important for disease progression.
## For the comparisson it is necessary that the two models have the same
## number of tree components build on the same number of genetic events.
## It is assumed that the mixtures have at least two components (the first one is the noise).
comp.models.levels <- function(mixture1, mixture2) {
if((class(mixture1) != "RtreemixModel") || (class(mixture2) != "RtreemixModel"))
stop("The specified mixture models should be of type RtreemixModel!")

if(numTrees(mixture1) != numTrees(mixture2))
stop("The two specified mixture models don't have the same number of trees!")

if(eventsNum(mixture1) != eventsNum(mixture2))
stop("The two specified mixture models don't have the same number of events!")
no.trees <- numTrees(mixture1)
if (no.trees == 1)
stop("The mixture models should have at least two tree components, where the first one is the noise component!")

no.events <- eventsNum(mixture1)

true.order <- list() ## list specifying the edges in the components of mixture1
fit.order <- list() ## list specifying the edges in the components of mixture2

start.vals1 <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the tree components in mixture1 (without the noise component)
start.vals2 <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the tree components in mixture2 (without the noise component)

## Build the vectors that characterize the edges of the components of the mixture models.
## The noise components are ignored since they always have the same (star) topology.
for(k in 2:no.trees) {
true.vec <- character(0)
fit.vec <- character(0)
for(l in 2:no.events) {
ch <- names(which(sapply(edges(getTree(mixture1, k)), function(x, el) {el %in% x}, nodes(getTree(mixture1, k))[l])))
true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))
ch <- names(which(sapply(edges(getTree(mixture2, k)), function(x, el) {el %in% x}, nodes(getTree(mixture2, k))[l])))
fit.vec <- c(fit.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))
}
names(true.vec) <- nodes(getTree(mixture1, k))[-1]
true.order <- c(true.order, list(true.vec))
names(fit.vec) <- nodes(getTree(mixture2, k))[-1]
fit.order <- c(fit.order, list(fit.vec))

## Assign the (maximal depths + 1) for each tree in the mixtures
start.vals1[k - 1] <- max(acc(getTree(mixture1, k), nodes(getTree(mixture1, k))[1])[[1]] + 1, na.remove = TRUE)
start.vals2[k - 1] <- max(acc(getTree(mixture2, k), nodes(getTree(mixture2, k))[1])[[1]] + 1, na.remove = TRUE)
}
## Build the comparisson matrix.
## The (i, j) element is the number of distinct edges of the
## (i + 1)-th and (j + 1)-th component of the models
## mixture1 and mixture2 respectively.
comp.mat <- matrix(nrow = no.trees - 1, ncol = no.trees - 1)
for(i in 1:(no.trees - 1))
for(j in 1:(no.trees - 1)) {
comp.mat[i, j] <- sum(true.order[[i]] != fit.order[[j]])
}
## Form the similarity pairs and calculate the similarity of the models
## as a sum of the number of different edges of the trees in the similarity pairs
## and their corresponding L1-distance of the levels of the events:
if(no.trees > 2) {
rc <- which(comp.mat == min(comp.mat), arr.ind = TRUE)
diff.sum <- min(comp.mat) +
L1.dist(get.tree.levels(mixture1, rc[1, 1] + 1, start.vals2[rc[1, 2]]),
get.tree.levels(mixture2, rc[1, 2] + 1, start.vals1[rc[1, 1]]))
row.index <- c(rc[1, 1]) ## get the row index of the minimum
col.index <- c(rc[1, 2]) ## get the column index of the minimum

for(m in 1:(nrow(comp.mat) - 1)) {
rc <- which(comp.mat == min(comp.mat[-(row.index), -(col.index)]), arr.ind = TRUE)
diff.sum <- diff.sum + min(comp.mat[-(row.index), -(col.index)]) +
L1.dist(get.tree.levels(mixture1, rc[1, 1] + 1, start.vals2[rc[1, 2]]),
get.tree.levels(mixture2, rc[1, 2] + 1, start.vals1[rc[1, 1]]))
row.index <- c(row.index, rc[1, 1])
col.index <- c(col.index, rc[1, 2])
}
} else {
if(no.trees == 2) {
diff.sum <- comp.mat[1, 1] +
L1.dist(get.tree.levels(mixture1, 2, start.vals2[1]),
get.tree.levels(mixture2, 2, start.vals1[1]))
} else {
stop("The specified mixture models must have at least two tree components, where the first one is the noise component!")
}
}

return(diff.sum)

}
######################################################################
## Function that implements a similarity measure for comparing the topologies
## of the nontrivial tree components of a specified mixture model.
## It returns a value that uses the number of different edges for caracterizing
## the difference of the trees in the model.
## For the comparisson it is necessary that the model has at least two
## nontrivial components.
comp.trees <- function(model) {
no.trees <- numTrees(model)
if(no.trees <= 2)
stop("The specified mixture model should have at least two nontrivial tree components.")

no.events <- eventsNum(model)
true.order <- list() ## list specifying the edges in the nontrivial components of the model
## Build the vectors that characterize the
## edges of the nontrivial components of the mixture model.
for(k in 2:no.trees) {
true.vec <- character(0)
for(l in 2:no.events) {
ch <- names(which(sapply(edges(getTree(model, k)), function(x, el) {el %in% x}, nodes(getTree(model, k))[l])))
true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))
}
names(true.vec) <- nodes(getTree(model, k))[-1]
true.order <- c(true.order, list(true.vec))
}
## Calculate the similarity of the nontrivial components of the model
## as a sum of the number of different edges of all combinations of
## two different nontrivial trees:
diff.sum <- 0
for(k1 in 2:(no.trees - 1))
for(k2 in (k1 + 1):no.trees)
diff.sum <- diff.sum + sum(true.order[[k1 - 1]] != true.order[[k2 - 1]])

## Return a result between 0 and 1.
return(diff.sum / (choose((no.trees - 1), 2) * (no.events - 1)))
}
######################################################################
## Function that implements a similarity measure for comparing the topologies
## of the nontrivial tree components of a specified mixture model.
## It returns a value that uses the  number of different edges and the depth at
## which the events are, for caracterizing the difference of the trees in the model.
## For the comparisson it is necessary that the model has at least two
## nontrivial components.
comp.trees.levels <- function(model) {
no.trees <- numTrees(model)
if(no.trees <= 2)
stop("The specified mixture model should have at least two nontrivial tree components.")

no.events <- eventsNum(model)
start.vals <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the nontrivial tree components in the model
true.order <- list() ## list specifying the edges in the nontrivial components of the model

## Build the vectors that characterize the
## edges of the nontrivial components of the mixture model.
for(k in 2:no.trees) {
true.vec <- character(0)
for(l in 2:no.events) {
ch <- names(which(sapply(edges(getTree(model, k)), function(x, el) {el %in% x}, nodes(getTree(model, k))[l])))
true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))
}
names(true.vec) <- nodes(getTree(model, k))[-1]
true.order <- c(true.order, list(true.vec))

## Assign the maximal depths + 1 for each tree in the mixtures
start.vals[k - 1] <- max(acc(getTree(model, k), nodes(getTree(model, k))[1])[[1]] + 1, na.remove = TRUE)
}
## Calculate the similarity of the models as a sum of the number of
## different edges of all possible combinations of two different
## nontrivial trees in the model and their corresponding L1-distance
## of the levels of the events:
diff.sum <- 0
for(k1 in 2:(no.trees - 1)) {
for(k2 in (k1 + 1):no.trees) {
diff.sum <- diff.sum + (sum(true.order[[k1 - 1]] != true.order[[k2 - 1]])
+  L1.dist(get.tree.levels(model, k1, start.vals[k2 - 1]),
get.tree.levels(model, k2, start.vals[k1 - 1])))
}
}
return(diff.sum)
}
######################################################################
## Stability analysis of the oncogenetic trees mixture model.
## This includes stability analysis on different levels of the mixture
## model: GPS values, encoded probability distribution, tree topologies.
## The function outputs a list of analysis for the mentioned features.
## Each analysis contains the values of different similarity measures
## with their corresponding p-values.
## The models should have at least two components (the first one is the noise).
stability.sim <- function(no.trees = 3, ## number of tree components
no.events = 9, ## number of genetic events
prob = c(0.2, 0.8), ## interval for the edgeweights of the random mixture models
no.draws = 300, ## number of samples drawn from the random models used for learning a mixture model
no.rands = 100, ## number of rands for calculatin the p-values
no.sim = 1 ## number of simulation iterations
) {
## Set the true types.
no.trees <- as.integer(no.trees)
no.events <- as.integer(no.events)
no.draws <- as.integer(no.draws)
no.rands <- as.integer(no.rands)
no.sim <- as.integer(no.sim)
## Check if the necessary parameters are provided and have the correct form.
if(no.trees < 2)
stop("The specified mixture model should have at least two
tree components (where the first one is the noise).")
if (no.events < 1)
stop("The number of events must be an integer greater than zero!")
if(no.draws < 1)
stop("The number of draws (number of samples) must be an integer greater than zero!")
if (no.rands < 1)
stop("The number of random models for the p-values  must be an integer greater than zero!")
if (no.sim < 1)
stop("The number of iterations for the waiting time simulation
must be an integer greater than zero!")
if(!missing(prob)) {
if(!is.numeric(prob) || (length(prob) != 2))
stop("Specify the boundaries of the conditional probabilities as a numeric vector
of length two = c(min, max)!")
if(prob[2] < prob[1])
stop("In the probability vector the lower boundary must be smaller than the
upper boundary!")
}

simGPS <- numeric(0) ## results from the stability analysis of the GPS values
topo.dif <- numeric(0) ## results from the stability analysis of the topologies of the tree components of mixture models based on comp.models
topo.levels.dif <- numeric(0) ## results from the stability analysis of the topologies of the tree components of mixture models based on comp.models.levels
result.distr <- numeric(0) ## results from the stability analysis of the probability distribution encoded by the model

mat.true.gps <- numeric(0) ## matrix containing the true GPS values from each simulation iteration
mat.fit.gps <- numeric(0) ## matrix containing the corresponding fitted GPS values from each simulation iteration

list.true.models <- list() ## list containing the true models from each simulation iteration
list.fit.models <- list() ## list containing the corresponding fitted models from each simulation iteration
## Simulation iterations:
for(i in 1:as.integer(no.sim)) {
print(i) ## simulation iteration
## Pick a true model from the space of random models and draw data from it.
true.m <- generate(K = no.trees, no.events = no.events, prob = prob)
Weights(true.m) <- c(0.05, rep(0.95/(no.trees - 1), (no.trees - 1)))
sim.data <- sim(model = true.m, no.draws = no.draws)
list.true.models <- c(list.true.models, true.m)
## Calculate the GPS and the distribution encoded by true.m
true.gps <- GPS(gps(model = true.m, data = sim.data, no.sim = 10000))
mat.true.gps <- cbind(mat.true.gps, true.gps)
true.distr <- distribution(model = true.m)\$probability
## Fit a mixture model to the data sim.data
fm <- fit(data = sim.data, K = no.trees, noise = TRUE, equal.edgeweights = TRUE, eps = 0.01)
list.fit.models <- c(list.fit.models, fm)
## Calculate the GPS and the distribution encoded by fm
fit.gps <- GPS(gps(model = fm, data = sim.data, no.sim = 10000))
mat.fit.gps <- cbind(mat.fit.gps, fit.gps)
fit.distr <- distribution(model = fm)\$probability
## Compute different distances between the distributions induced
## by the true and fitted models:
true.cosin <- cosin.dist(true.distr, fit.distr)
true.l1 <- L1.dist(true.distr, fit.distr)
true.kull <- kullback.leibler(true.distr, fit.distr)
## Compute different distances between the GPS values
## for the true and fitted models:
true.euclGPS <- euclidian.dist(true.gps, fit.gps)
true.rank.dist <-  rank.cor.dist(true.gps, fit.gps)
## Compute different similarity measures for comparing the
## topologies of the nontrivial components of the true and fitted models:
true.topo.dif <- comp.models(true.m, fm)
true.topo.levels.dif <- comp.models.levels(true.m, fm)
## Vectors for keeping the values for the different features
## of the random mixture models needed for the p-values calculation:
rand.euclGPS <- numeric(0)
rand.rankGPS <- numeric(0)

rand.cos.distr <- numeric(0)
rand.l1.distr <- numeric(0)
rand.kull.distr <- numeric(0)

rand.topo <- numeric(0)
rand.topo.levels <- numeric(0)
## Create the vectors with random values for the p-values calculation:
for(j in 1:(as.integer(no.rands) - 1)) {
## Generate random model and calculate the GPS and distribution:
model <- generate(K = no.trees, no.events = no.events, prob = prob)
Weights(model) <- c(0.05, rep(0.95/(no.trees - 1), (no.trees - 1)))
random.gps <- GPS(gps(model = model, data = sim.data, no.sim = 10000))
random.distr <- distribution(model = model)\$probability
## GPS:
rand.euclGPS <- c(rand.euclGPS, euclidian.dist(true.gps, random.gps))
rand.rankGPS <- c(rand.rankGPS, rank.cor.dist(true.gps, random.gps))
## Distribution:
rand.cos.distr <- c(rand.cos.distr, cosin.dist(true.distr, random.distr))
rand.l1.distr <- c(rand.l1.distr, L1.dist(true.distr, random.distr))
rand.kull.distr <- c(rand.kull.distr, kullback.leibler(true.distr, random.distr))
## Tree topologies:
rand.topo <- c(rand.topo, comp.models(true.m, model))
rand.topo.levels <- c(rand.topo.levels, comp.models.levels(true.m, model))
}
## Stability analysis of GPS values
## (using Euclidian distance and rank correlation distance as similarity measures):
simGPS <- rbind(simGPS,
c(true.euclGPS, Pval.dist(true.euclGPS, rand.euclGPS),
true.rank.dist, Pval.dist(true.rank.dist, rand.rankGPS)))
## Stability analysis of induced distributions
## (using cosine distance, L1 distance, Kullback-Leibler divergence as similarity measures):
result.distr <- rbind(result.distr,
c(true.cosin, Pval.dist(true.cosin, rand.cos.distr),
true.l1, Pval.dist(true.l1, rand.l1.distr),
true.kull, Pval.dist(true.kull, rand.kull.distr)))
## Stability analysis of tree topologies
## (using comp.models and comp.models.levels as similarity measures):
topo.dif <- rbind(topo.dif,
c(true.topo.dif, Pval.dist(true.topo.dif, rand.topo)))
topo.levels.dif <- rbind(topo.levels.dif,
c(true.topo.levels.dif, Pval.dist(true.topo.levels.dif, rand.topo.levels)))

}
## Organize the output:
colnames(simGPS) <- c("eucl.dist", "p.val.eucl", "rank.cor.dist", "p.val.rank")
colnames(result.distr) <- c("cos", "p.val.cos", "L1", "p.val.L1", "KL", "p.val.KL")
colnames(topo.dif) <- c("topo.dif", "p.value")
colnames(topo.levels.dif) <- c("topo.levels.dif", "p.value")
colnames(mat.true.gps) <- c(1:no.sim)
colnames(mat.fit.gps) <- c(1:no.sim)
output <- list(simGPS, result.distr,
topo.dif, topo.levels.dif,
mat.true.gps, mat.fit.gps,
list.true.models, list.fit.models)
names(output) <- c("GPS", "Distribution", "Tree topologies (distinct edges)",
"Tree topologies (distinct edges + levelsL1dist)", "True GPS",
"Fitted GPS", "True models", "Fitted models")

return(output)
}
```