R/StabilityFunctions.R
6fe809cf
 ################################################################################
 ##  FUNCTIONS FOR STABILITY ANALYSIS OF THE ONCOGENETIC TREES MIXTURE MODEL   ##
 ################################################################################
 ## Function for calculating the P value of a given similarity measure.
 ## What is the probability for obtaining the same or smaller
 ## value in a random vector of similarity values.
 Pval.dist <- function(dist.val, random.vals) {
   return((sum(random.vals <= dist.val) + 1) /(length(random.vals) + 1))
 }
 ######################################################################
 ## Function for calculating the Kullback-Leibler divergence between
 ## two discrete probability distributions p and q.
 kullback.leibler <- function(p, q) {
   if(length(p) != length(q))
     stop("Error: The distribution vectors have different lengths!")  
   return(sum(p * log(p / q)))
 }
 ######################################################################
 ## Function for calculating the L1 distance between
 ## two discrete probability distributions p and q.
 L1.dist <- function(p, q) {
   if(length(p) != length(q))
     stop("Error: The distribution vectors have different lengths!")
 
   return(sum(abs(p - q)))
 }
 ######################################################################
 ## Function for calculating the cosine distance between
 ## two discrete probability distributions p and q.
 cosin.dist <- function(p, q) {
   if(length(p) != length(q))
     stop("Error: The distribution vectors have different lengths!")
 
   return(1 - (sum (p * q) / (L2.norm(p) * L2.norm(q)) ))
 }
 ######################################################################
 ## Function for calculating the L2 norm of a given vector x.
 L2.norm <- function(x) {
   return(sqrt(sum(x * x)))
 }
 ######################################################################
 ## Function for calculating the Euclidian distance between
 ## two vectors x and y.
 euclidian.dist <- function(x, y) {
   if(length(x) != length(y))
     stop("Error: The vectors have different lengths!")
 
   return(sqrt(drop((x - y) %*% (x - y))))
 }
 ######################################################################
 ## Function for calculating the rank-correlation distance between
 ## two vectors x and y.
 rank.cor.dist <- function(x, y) {
   if(length(x) != length(y))
     stop("Error: The vectors have different length!")
 
   return(1 - rcorr(x, y, type = "spearman")[[1]][1, 2])
 }
 ######################################################################
 ## Function that implements a similarity measure for comparing the topologies
 ## of the trees of two mixture models mixture1 and mixture2.
 ## It returns a value that uses the number of different edges for caracterizing
 ## the difference between the models.
 ## For the comparisson it is necessary that the two models have the same
 ## number of tree components build on the same number of genetic events.
 ## It is assumed that the mixtures have at least two components (the first one is the noise).
 comp.models <- function(mixture1, mixture2) {
   if((class(mixture1) != "RtreemixModel") || (class(mixture2) != "RtreemixModel"))
     stop("The specified mixture models should be of type RtreemixModel!")
   
   if(numTrees(mixture1) != numTrees(mixture2))
     stop("The two specified mixture models don't have the same number of trees!")
   
   if(eventsNum(mixture1) != eventsNum(mixture2))
     stop("The two specified mixture models don't have the same number of events!")
   no.trees <- numTrees(mixture1)
   if (no.trees == 1)
     stop("The mixture models should have at least two tree components, where the first one is the noise component!")
   
   no.events <- eventsNum(mixture1)
   
   true.order <- list() ## list specifying the edges in the components of mixture1
   fit.order <- list() ## list specifying the edges in the components of mixture2
 
   ## Build the vectors that characterize the edges of the components of the mixture models.
   ## The noise components are ignored since they always have the same (star) topology.
   for(k in 2:no.trees) {
     true.vec <- character(0)
     fit.vec <- character(0)
     for(l in 2:no.events) {
       ch <- names(which(sapply(edges(getTree(mixture1, k)), function(x, el) {el %in% x}, nodes(getTree(mixture1, k))[l])))
       true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))       
       ch <- names(which(sapply(edges(getTree(mixture2, k)), function(x, el) {el %in% x}, nodes(getTree(mixture2, k))[l])))
       fit.vec <- c(fit.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))        
     }
     names(true.vec) <- nodes(getTree(mixture1, k))[-1]
     true.order <- c(true.order, list(true.vec))
     names(fit.vec) <- nodes(getTree(mixture2, k))[-1]
     fit.order <- c(fit.order, list(fit.vec))
   }
   ## Build the comparisson matrix.
   ## The (i, j) element is the number of distinct edges of the
   ## (i + 1)-th and (j + 1)-th component of the models
   ## mixture1 and mixture2 respectively.  
   comp.mat <- matrix(nrow = no.trees - 1, ncol = no.trees - 1)
   for(i in 1:(no.trees - 1))
     for(j in 1:(no.trees - 1)) {
       comp.mat[i, j] <- sum(true.order[[i]] != fit.order[[j]])
     }
   ## Form the similarity pairs and calculate the similarity of the models
   ## as a sum of the number of different edges of the trees in the similarity pairs:
   if(no.trees > 2) {
     rc <- which(comp.mat == min(comp.mat), arr.ind = TRUE)
     diff.sum <- min(comp.mat)
     row.index <- c(rc[1, 1]) ## get the row index of the minimum
     col.index <- c(rc[1, 2]) ## get the column index of the minimum
 
     for(m in 1:(nrow(comp.mat) - 1)) {
       rc <- which(comp.mat == min(comp.mat[-(row.index), -(col.index)]), arr.ind = TRUE)
       diff.sum <- diff.sum + min(comp.mat[-(row.index), -(col.index)])
       row.index <- c(row.index, rc[1, 1])
       col.index <- c(col.index, rc[1, 2])    
     }
   } else {
     diff.sum <- comp.mat[1, 1]
   }
   ## Return a result between 0 and 1.    
   return(diff.sum/((no.trees - 1) * (no.events - 1)))  
 
 }
 ######################################################################
 ## Function that assignes to each node the level at which that node is
 ## in a specific tree (tree.num) of the trees mixture model.
 ## The start.val is the maximum depth of the tree with which the tree
 ## specified with tree.num will be compared.
 get.tree.levels <- function(mixture, tree.num, start.val) {
   root <- nodes(getTree(mixture, tree.num))[1]
   levels <- acc(getTree(mixture, tree.num), root)
   ## vec <- rep(max(levels[[1]]), eventsNum(mixture) - 1)
   vec <- rep(start.val, eventsNum(mixture) - 1)
   names(vec) <- nodes(getTree(mixture, tree.num))[-1]
   
   vec[names(levels[[1]])] <- levels[[1]]
 
   return(vec)
 }
 ######################################################################
 ## Function that implements a similarity measure for comparing the topologies
 ## of the trees of two mixture models mixture1 and mixture2.
 ## It returns a value that uses the number of different edges and the depth at
 ## which the events are, for caracterizing the difference between the models.
 ## This measure is more application specific, since the depth at which the
 ## events are in a tree is very important for disease progression.
 ## For the comparisson it is necessary that the two models have the same
 ## number of tree components build on the same number of genetic events.
 ## It is assumed that the mixtures have at least two components (the first one is the noise).
 comp.models.levels <- function(mixture1, mixture2) {
   if((class(mixture1) != "RtreemixModel") || (class(mixture2) != "RtreemixModel"))
     stop("The specified mixture models should be of type RtreemixModel!")
   
   if(numTrees(mixture1) != numTrees(mixture2))
     stop("The two specified mixture models don't have the same number of trees!")
 
   if(eventsNum(mixture1) != eventsNum(mixture2))
     stop("The two specified mixture models don't have the same number of events!")
   no.trees <- numTrees(mixture1)
   if (no.trees == 1)
     stop("The mixture models should have at least two tree components, where the first one is the noise component!")  
   
   no.events <- eventsNum(mixture1)
   
   true.order <- list() ## list specifying the edges in the components of mixture1
   fit.order <- list() ## list specifying the edges in the components of mixture2
 
   start.vals1 <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the tree components in mixture1 (without the noise component) 
   start.vals2 <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the tree components in mixture2 (without the noise component)
 
   ## Build the vectors that characterize the edges of the components of the mixture models.
   ## The noise components are ignored since they always have the same (star) topology.  
   for(k in 2:no.trees) {
     true.vec <- character(0)
     fit.vec <- character(0)
     for(l in 2:no.events) {
       ch <- names(which(sapply(edges(getTree(mixture1, k)), function(x, el) {el %in% x}, nodes(getTree(mixture1, k))[l])))
       true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))       
       ch <- names(which(sapply(edges(getTree(mixture2, k)), function(x, el) {el %in% x}, nodes(getTree(mixture2, k))[l])))
       fit.vec <- c(fit.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))        
     }
     names(true.vec) <- nodes(getTree(mixture1, k))[-1]
     true.order <- c(true.order, list(true.vec))
     names(fit.vec) <- nodes(getTree(mixture2, k))[-1]
     fit.order <- c(fit.order, list(fit.vec))
 
     ## Assign the (maximal depths + 1) for each tree in the mixtures
     start.vals1[k - 1] <- max(acc(getTree(mixture1, k), nodes(getTree(mixture1, k))[1])[[1]] + 1, na.remove = TRUE)
     start.vals2[k - 1] <- max(acc(getTree(mixture2, k), nodes(getTree(mixture2, k))[1])[[1]] + 1, na.remove = TRUE)                          
   }
   ## Build the comparisson matrix.
   ## The (i, j) element is the number of distinct edges of the
   ## (i + 1)-th and (j + 1)-th component of the models
   ## mixture1 and mixture2 respectively.    
   comp.mat <- matrix(nrow = no.trees - 1, ncol = no.trees - 1)
   for(i in 1:(no.trees - 1))
     for(j in 1:(no.trees - 1)) {
       comp.mat[i, j] <- sum(true.order[[i]] != fit.order[[j]])
     }
   ## Form the similarity pairs and calculate the similarity of the models
   ## as a sum of the number of different edges of the trees in the similarity pairs
   ## and their corresponding L1-distance of the levels of the events:
   if(no.trees > 2) {
     rc <- which(comp.mat == min(comp.mat), arr.ind = TRUE)
     diff.sum <- min(comp.mat) +
       L1.dist(get.tree.levels(mixture1, rc[1, 1] + 1, start.vals2[rc[1, 2]]),
               get.tree.levels(mixture2, rc[1, 2] + 1, start.vals1[rc[1, 1]]))
     row.index <- c(rc[1, 1]) ## get the row index of the minimum
     col.index <- c(rc[1, 2]) ## get the column index of the minimum
 
     for(m in 1:(nrow(comp.mat) - 1)) {
       rc <- which(comp.mat == min(comp.mat[-(row.index), -(col.index)]), arr.ind = TRUE)
       diff.sum <- diff.sum + min(comp.mat[-(row.index), -(col.index)]) +
         L1.dist(get.tree.levels(mixture1, rc[1, 1] + 1, start.vals2[rc[1, 2]]),
                 get.tree.levels(mixture2, rc[1, 2] + 1, start.vals1[rc[1, 1]]))
       row.index <- c(row.index, rc[1, 1])
       col.index <- c(col.index, rc[1, 2])   
     }
   } else {
     if(no.trees == 2) {
       diff.sum <- comp.mat[1, 1] +
         L1.dist(get.tree.levels(mixture1, 2, start.vals2[1]),
                 get.tree.levels(mixture2, 2, start.vals1[1]))
     } else {
       stop("The specified mixture models must have at least two tree components, where the first one is the noise component!")
     }
   }
     
   return(diff.sum)  
 
 }
 ######################################################################
 ## Function that implements a similarity measure for comparing the topologies
 ## of the nontrivial tree components of a specified mixture model.
 ## It returns a value that uses the number of different edges for caracterizing
 ## the difference of the trees in the model.
 ## For the comparisson it is necessary that the model has at least two
 ## nontrivial components.
 comp.trees <- function(model) {
   no.trees <- numTrees(model)
   if(no.trees <= 2)
     stop("The specified mixture model should have at least two nontrivial tree components.")
   
   no.events <- eventsNum(model)
   true.order <- list() ## list specifying the edges in the nontrivial components of the model  
   ## Build the vectors that characterize the
   ## edges of the nontrivial components of the mixture model.
   for(k in 2:no.trees) {
     true.vec <- character(0)
     for(l in 2:no.events) {
       ch <- names(which(sapply(edges(getTree(model, k)), function(x, el) {el %in% x}, nodes(getTree(model, k))[l])))
       true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))             
     }
     names(true.vec) <- nodes(getTree(model, k))[-1]
     true.order <- c(true.order, list(true.vec))    
   }
   ## Calculate the similarity of the nontrivial components of the model 
   ## as a sum of the number of different edges of all combinations of
   ## two different nontrivial trees:
   diff.sum <- 0
   for(k1 in 2:(no.trees - 1)) 
     for(k2 in (k1 + 1):no.trees) 
       diff.sum <- diff.sum + sum(true.order[[k1 - 1]] != true.order[[k2 - 1]])
     
   ## Return a result between 0 and 1.
   return(diff.sum / (choose((no.trees - 1), 2) * (no.events - 1)))
 }
 ######################################################################
 ## Function that implements a similarity measure for comparing the topologies
 ## of the nontrivial tree components of a specified mixture model.
 ## It returns a value that uses the  number of different edges and the depth at
 ## which the events are, for caracterizing the difference of the trees in the model.
 ## For the comparisson it is necessary that the model has at least two
 ## nontrivial components.
 comp.trees.levels <- function(model) {
   no.trees <- numTrees(model)
   if(no.trees <= 2)
     stop("The specified mixture model should have at least two nontrivial tree components.")
   
   no.events <- eventsNum(model)
   start.vals <- vector(mode = "numeric", length = (no.trees - 1)) ## the depth of the nontrivial tree components in the model 
   true.order <- list() ## list specifying the edges in the nontrivial components of the model 
   
   ## Build the vectors that characterize the
   ## edges of the nontrivial components of the mixture model.  
   for(k in 2:no.trees) {
     true.vec <- character(0)
     for(l in 2:no.events) {
       ch <- names(which(sapply(edges(getTree(model, k)), function(x, el) {el %in% x}, nodes(getTree(model, k))[l])))
       true.vec <- c(true.vec, ifelse((identical(ch, character(0)) || is.null(ch)), "e", ch))             
     }
     names(true.vec) <- nodes(getTree(model, k))[-1]
     true.order <- c(true.order, list(true.vec))
     
     ## Assign the maximal depths + 1 for each tree in the mixtures
     start.vals[k - 1] <- max(acc(getTree(model, k), nodes(getTree(model, k))[1])[[1]] + 1, na.remove = TRUE)    
   }
   ## Calculate the similarity of the models as a sum of the number of
   ## different edges of all possible combinations of two different
   ## nontrivial trees in the model and their corresponding L1-distance
   ## of the levels of the events:
   diff.sum <- 0
   for(k1 in 2:(no.trees - 1)) {
     for(k2 in (k1 + 1):no.trees) {
       diff.sum <- diff.sum + (sum(true.order[[k1 - 1]] != true.order[[k2 - 1]])
                               +  L1.dist(get.tree.levels(model, k1, start.vals[k2 - 1]),
                                          get.tree.levels(model, k2, start.vals[k1 - 1])))
     }
   }
   return(diff.sum)
 }
 ######################################################################
 ## Stability analysis of the oncogenetic trees mixture model.
 ## This includes stability analysis on different levels of the mixture
 ## model: GPS values, encoded probability distribution, tree topologies.
 ## The function outputs a list of analysis for the mentioned features.
 ## Each analysis contains the values of different similarity measures
 ## with their corresponding p-values.
 ## The models should have at least two components (the first one is the noise).
 stability.sim <- function(no.trees = 3, ## number of tree components
                           no.events = 9, ## number of genetic events
                           prob = c(0.2, 0.8), ## interval for the edgeweights of the random mixture models 
                           no.draws = 300, ## number of samples drawn from the random models used for learning a mixture model
                           no.rands = 100, ## number of rands for calculatin the p-values
                           no.sim = 1 ## number of simulation iterations
                           ) {
   ## Set the true types.
   no.trees <- as.integer(no.trees)
   no.events <- as.integer(no.events)
   no.draws <- as.integer(no.draws)
   no.rands <- as.integer(no.rands)
   no.sim <- as.integer(no.sim)
   ## Check if the necessary parameters are provided and have the correct form.
   if(no.trees < 2)
     stop("The specified mixture model should have at least two
 tree components (where the first one is the noise).")
   if (no.events < 1)
     stop("The number of events must be an integer greater than zero!")
   if(no.draws < 1)
     stop("The number of draws (number of samples) must be an integer greater than zero!")  
   if (no.rands < 1)
     stop("The number of random models for the p-values  must be an integer greater than zero!")   
   if (no.sim < 1)
     stop("The number of iterations for the waiting time simulation
 must be an integer greater than zero!")
   if(!missing(prob)) {
     if(!is.numeric(prob) || (length(prob) != 2))
       stop("Specify the boundaries of the conditional probabilities as a numeric vector
 of length two = c(min, max)!")  
     if(prob[2] < prob[1])
       stop("In the probability vector the lower boundary must be smaller than the
 upper boundary!")
   }
   
   simGPS <- numeric(0) ## results from the stability analysis of the GPS values
   topo.dif <- numeric(0) ## results from the stability analysis of the topologies of the tree components of mixture models based on comp.models
   topo.levels.dif <- numeric(0) ## results from the stability analysis of the topologies of the tree components of mixture models based on comp.models.levels
   result.distr <- numeric(0) ## results from the stability analysis of the probability distribution encoded by the model
   
   mat.true.gps <- numeric(0) ## matrix containing the true GPS values from each simulation iteration
   mat.fit.gps <- numeric(0) ## matrix containing the corresponding fitted GPS values from each simulation iteration
 
   list.true.models <- list() ## list containing the true models from each simulation iteration
   list.fit.models <- list() ## list containing the corresponding fitted models from each simulation iteration
   ## Simulation iterations:
   for(i in 1:as.integer(no.sim)) {
     print(i) ## simulation iteration
     ## Pick a true model from the space of random models and draw data from it.
     true.m <- generate(K = no.trees, no.events = no.events, prob = prob)
     Weights(true.m) <- c(0.05, rep(0.95/(no.trees - 1), (no.trees - 1)))
     sim.data <- sim(model = true.m, no.draws = no.draws)    
     list.true.models <- c(list.true.models, true.m)
     ## Calculate the GPS and the distribution encoded by true.m
     true.gps <- GPS(gps(model = true.m, data = sim.data, no.sim = 10000))
     mat.true.gps <- cbind(mat.true.gps, true.gps)
     true.distr <- distribution(model = true.m)$probability
     ## Fit a mixture model to the data sim.data    
     fm <- fit(data = sim.data, K = no.trees, noise = TRUE, equal.edgeweights = TRUE, eps = 0.01)
     list.fit.models <- c(list.fit.models, fm)
     ## Calculate the GPS and the distribution encoded by fm
     fit.gps <- GPS(gps(model = fm, data = sim.data, no.sim = 10000))
     mat.fit.gps <- cbind(mat.fit.gps, fit.gps)
     fit.distr <- distribution(model = fm)$probability
     ## Compute different distances between the distributions induced
     ## by the true and fitted models:
     true.cosin <- cosin.dist(true.distr, fit.distr)
     true.l1 <- L1.dist(true.distr, fit.distr)
     true.kull <- kullback.leibler(true.distr, fit.distr)
     ## Compute different distances between the GPS values
     ## for the true and fitted models:
     true.euclGPS <- euclidian.dist(true.gps, fit.gps)
     true.rank.dist <-  rank.cor.dist(true.gps, fit.gps)
     ## Compute different similarity measures for comparing the
     ## topologies of the nontrivial components of the true and fitted models:
     true.topo.dif <- comp.models(true.m, fm)
     true.topo.levels.dif <- comp.models.levels(true.m, fm)
     ## Vectors for keeping the values for the different features
     ## of the random mixture models needed for the p-values calculation:
     rand.euclGPS <- numeric(0)
     rand.rankGPS <- numeric(0)
 
     rand.cos.distr <- numeric(0)
     rand.l1.distr <- numeric(0)
     rand.kull.distr <- numeric(0)
 
     rand.topo <- numeric(0)
     rand.topo.levels <- numeric(0)
     ## Create the vectors with random values for the p-values calculation:
     for(j in 1:(as.integer(no.rands) - 1)) {
       ## Generate random model and calculate the GPS and distribution:
       model <- generate(K = no.trees, no.events = no.events, prob = prob)
       Weights(model) <- c(0.05, rep(0.95/(no.trees - 1), (no.trees - 1)))      
       random.gps <- GPS(gps(model = model, data = sim.data, no.sim = 10000))
       random.distr <- distribution(model = model)$probability
       ## GPS:
       rand.euclGPS <- c(rand.euclGPS, euclidian.dist(true.gps, random.gps))      
       rand.rankGPS <- c(rand.rankGPS, rank.cor.dist(true.gps, random.gps))
       ## Distribution:
       rand.cos.distr <- c(rand.cos.distr, cosin.dist(true.distr, random.distr))
       rand.l1.distr <- c(rand.l1.distr, L1.dist(true.distr, random.distr))      
       rand.kull.distr <- c(rand.kull.distr, kullback.leibler(true.distr, random.distr))
       ## Tree topologies:
       rand.topo <- c(rand.topo, comp.models(true.m, model))
       rand.topo.levels <- c(rand.topo.levels, comp.models.levels(true.m, model))
     }
     ## Stability analysis of GPS values
     ## (using Euclidian distance and rank correlation distance as similarity measures):
     simGPS <- rbind(simGPS,
                     c(true.euclGPS, Pval.dist(true.euclGPS, rand.euclGPS),   
                       true.rank.dist, Pval.dist(true.rank.dist, rand.rankGPS)))
     ## Stability analysis of induced distributions
     ## (using cosine distance, L1 distance, Kullback-Leibler divergence as similarity measures):
     result.distr <- rbind(result.distr,
                           c(true.cosin, Pval.dist(true.cosin, rand.cos.distr),
                             true.l1, Pval.dist(true.l1, rand.l1.distr),
                             true.kull, Pval.dist(true.kull, rand.kull.distr)))
     ## Stability analysis of tree topologies
     ## (using comp.models and comp.models.levels as similarity measures):
     topo.dif <- rbind(topo.dif,
                       c(true.topo.dif, Pval.dist(true.topo.dif, rand.topo)))
     topo.levels.dif <- rbind(topo.levels.dif,
                              c(true.topo.levels.dif, Pval.dist(true.topo.levels.dif, rand.topo.levels)))
     
   }
   ## Organize the output:
   colnames(simGPS) <- c("eucl.dist", "p.val.eucl", "rank.cor.dist", "p.val.rank")
   colnames(result.distr) <- c("cos", "p.val.cos", "L1", "p.val.L1", "KL", "p.val.KL")
   colnames(topo.dif) <- c("topo.dif", "p.value")
   colnames(topo.levels.dif) <- c("topo.levels.dif", "p.value")
   colnames(mat.true.gps) <- c(1:no.sim)
   colnames(mat.fit.gps) <- c(1:no.sim)
   output <- list(simGPS, result.distr,
                  topo.dif, topo.levels.dif,
                  mat.true.gps, mat.fit.gps,
                  list.true.models, list.fit.models)
   names(output) <- c("GPS", "Distribution", "Tree topologies (distinct edges)",
                      "Tree topologies (distinct edges + levelsL1dist)", "True GPS",
                      "Fitted GPS", "True models", "Fitted models")
     
   return(output)  
 }