# #' Likelihood Function of the Logistic Model #' #' @keywords internal #' #' @param Z data matrix #' @param X sample-level values #' @param Beta gene-level values #' .likfn = function(Z,X,Beta){ return((1-Z) + (2*Z - 1)/(1+exp(-X %*% Beta))) } # #' Posterior probability of detection #' #' @keywords internal #' #' @param Y detection matrix. #' @param W sample-level drop-out coefficients. #' @param Alpha gene-level drop-out features. #' @param X sample-level expression features. #' @param Beta gene-level sample coefficients. #' .pzfn = function(Y,W,Alpha,X,Beta){ matA = .likfn(1,X,Beta) # Expression matB = matA*.likfn(Y,W,Alpha) # Failure o = matB/(matB + (Y == 0)*(1-matA)) o[Y > 0] = 1 return(o) } #' Parameter estimation of zero-inflated bernoulli model #' #' This function implements an expectation-maximization algorithm for a #' zero-inflated bernoulli model of transcript detection, modeling gene #' expression state (off of on) as a bernoulli draw on a gene-specific #' expression rate (Z in {0,1}). Detection conditioned on expression is a #' logistic function of gene-level features. The bernoulli model is modeled #' numerically by a logistic model with an intercept. #' #' @param x matrix. An expression data matrix (genes in rows, cells in columns) #' @param fp_tresh numeric. Threshold for calling a positive detection (D = 1). #' Default 0. #' @param gfeatM matrix. Numeric gene level determinants of drop-out (genes in #' rows, features in columns) #' @param bulk_model logical. Use median log-expression of gene in detected #' fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is #' specified. #' @param pos_controls logical. TRUE for all genes that are known to be #' expressed in all cells. #' @param em_tol numeric. Convergence treshold on log-likelihood. #' @param maxiter numeric. The maximum number of iterations. Default 100. #' @param verbose logical. Whether or not to print the value of the likelihood #' at each iteration. #' #' @export #' #' @return a list with the following elements: \itemize{ \item{W}{ coefficients #' of sample-specific logistic drop-out model } \item{Alpha}{ intercept and #' gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{ #' coefficient of gene-specific logistic expression model } #' \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)} #' \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in #' weighted PCA} \item{expected_state}{ the expected #' value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood} #' \item{convergence}{ 0 if the algorithm converged and 1 if maxiter was #' reached} } #' #' @examples #' mat <- matrix(rpois(1000, lambda = 3), ncol=10) #' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10) #' ziber_out = suppressWarnings(estimate_ziber(mat, #' bulk_model = TRUE, #' pos_controls = 1:10)) #' estimate_ziber = function(x, fp_tresh = 0, gfeatM = NULL, bulk_model = FALSE, pos_controls = NULL, em_tol = 0.01, maxiter = 100, verbose = FALSE){ Y = t(matrix(as.numeric( x > fp_tresh ),nrow = dim(x)[1])) if(is.null(gfeatM) & bulk_model){ x2 = x x2[x2 <= fp_tresh] = NA Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE)))) }else{ if(is.null(gfeatM)){ stop("No gene-level features specified") } Alpha = t(gfeatM) } # Prepare Gene-Intrinsic Intercept X = matrix(1,dim(x)[2],1) Beta = matrix(0,1,dim(x)[1]) # Prepare Unwanted Variation and Sample-Intrinsic Intercept Alpha = rbind(Alpha,rep(1,dim(x)[1])) K = dim(Alpha)[1] W = matrix(0,dim(x)[2],K) # Initialize Z Z = .pzfn(Y,W,Alpha,X,Beta) if(is.null(pos_controls)){ stop("Must supply positive controls genes to fit FNR characteristic") } # Fit W once using control genes cY = Y[,pos_controls] cAlpha = Alpha[,pos_controls] glm_pval = rep(NA,nrow(Z)) for (i in 1:dim(Z)[1]){ fit = suppressWarnings(glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),family=binomial(logit))) glm_pval[i] = summary(fit)$coef[,4][1] if((glm_pval[i] < .01) & fit$converged){ W[i,] = fit$coefficients } else { if(!fit$converged){ warning(paste0("Sample ",i," failed GLM fit, applying", " expression independent model.")) }else{ warning(paste0("Sample ",i," exhibits expression dependence", " consistent with null, applying expression ", "independent model.")) } # Only intercept with pseudocounts W[i,] = c(0,log((sum(cY[i,]) + 0.5)/(ncol(cY) + 1)) - log((sum(1-cY[i,]) + 0.5)/(ncol(cY) + 1))) } } Beta[,pos_controls] = NA Z[,pos_controls] = 1 matA = .likfn(1,X,Beta[,!pos_controls]) # Expression matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) # Detection is_perfect = (matC == 0) | (matA == 0) | (matA == 1) EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) + sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) + sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE) if(verbose){print(EL2)} EL1 = 2*EL2 niter = 1 while( is.infinite(EL1) | (EL2 - EL1) > em_tol ) { EL1 = EL2 # Update Beta cmz = colMeans(Z[,!pos_controls]) Beta[,!pos_controls] = t(matrix(log(cmz) - log(1-cmz))) # Update Z Z[,!pos_controls] = .pzfn(Y[,!pos_controls], W,Alpha[,!pos_controls], X,Beta[,!pos_controls]) matA = .likfn(1,X,Beta[,!pos_controls]) matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) is_perfect = (matC == 0) | (matA == 0) | (matA == 1) EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) + sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) + sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE) if(verbose){print(EL2)} niter = niter + 1 if(niter >= maxiter){ return(list(W = W, X = X, Alpha = Alpha, Beta = Beta, fnr_character = t(.likfn(1,W,Alpha)), p_nodrop = 1 - t((Y <= fp_tresh)*Z), expected_state = t(Z), loglik = EL2, convergence = 1,glm_pval = glm_pval)) } } return(list(W = W, X = X, Alpha = Alpha, Beta = Beta, fnr_character = t(.likfn(1,W,Alpha)), p_nodrop = 1 - t((Y <= fp_tresh)*Z), expected_state = t(Z), loglik = EL2, convergence = 0, glm_pval = glm_pval)) } #' Imputation of zero abundance based on general zero-inflated model #' #' This function is used to impute the data, weighted by probability of data #' coming from the zero-inflation part of the distribution. #' #' @details The imputation is carried out with the following formula: #' y_{ij}* = y_{ij} * Pr( No Drop | y_{ij}) + mu_{i} * Pr( Drop | y_{ij}). #' #' impute_args must contain 2 elements: 1) p_nodrop = posterior probability #' of data not having resulted from drop-out (genes in rows, cells in #' columns) 2) mu = expected expression of dropped data (genes in rows, cells #' in columns) #' #' @export #' #' @param expression the data matrix (genes in rows, cells in columns) #' @param impute_args arguments for imputation (see details) #' #' @return the imputed expression matrix. #' #' @examples #' mat <- matrix(rpois(1000, lambda = 3), ncol=10) #' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10) #' #' mu = matrix(rep(3/ppois(0,lambda = 3,lower.tail = FALSE),1000),ncol = 10) #' #' p_false = 1 / ( 1 + ppois(0, lambda = 3, lower.tail = TRUE ) / #' (0.01 * ppois(0, lambda = 3, lower.tail = FALSE) ) ) #' #' p_nodrop = matrix(rep(1-p_false,1000),ncol = 10) #' p_nodrop[mat > 0] = 1 #' #' impute_args = list() #' impute_args = list(mu = mu, p_nodrop = p_nodrop) #' #' imat = impute_expectation(mat,impute_args = impute_args) #' impute_expectation <- function(expression,impute_args) { imputed <- expression * impute_args$p_nodrop + impute_args$mu * (1 - impute_args$p_nodrop) return(imputed) } #' Null or no-op imputation #' #' @export #' #' @param expression the data matrix (genes in rows, cells in columns) #' @param impute_args arguments for imputation (not used) #' #' @return the imputed expression matrix. #' #' @examples #' mat <- matrix(rpois(1000, lambda = 5), ncol=10) #' imat = impute_null(mat) #' impute_null <- function(expression,impute_args) { return(expression) } #' Fast parameter estimation of zero-inflated bernoulli model #' #' This function implements Newton's method for solving zero of #' Expectation-Maximization equation at the limit of parameter convergence: a #' zero-inflated bernoulli model of transcript detection, modeling gene #' expression state (off of on) as a bernoulli draw on a gene-specific #' expression rate (Z in {0,1}). Detection conditioned on expression is a #' logistic function of gene-level features. The bernoulli model is modeled #' numerically by a logistic model with an intercept. #' #' @param x matrix. An expression data matrix (genes in rows, cells in columns) #' @param fp_tresh numeric. Threshold for calling a positive detection (D = 1). #' Default 0. #' @param gfeatM matrix. Numeric gene level determinants of drop-out (genes in #' rows, features in columns) #' @param bulk_model logical. Use median log-expression of gene in detected #' fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is #' specified. #' @param pos_controls logical. TRUE for all genes that are known to be #' expressed in all cells. #' @param rate_tol numeric. Convergence treshold on expression rates (0-1). #' @param maxiter numeric. The maximum number of steps per gene. Default #' 100. #' @param verbose logical. Whether or not to print the value of the likelihood #' at each iteration. #' #' @export #' #' @return a list with the following elements: \itemize{ \item{W}{ coefficients #' of sample-specific logistic drop-out model } \item{Alpha}{ intercept and #' gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{ #' coefficient of gene-specific logistic expression model } #' \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)} #' \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in #' weighted PCA} \item{expected_state}{ the expected #' value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood} #' \item{convergence}{for all genes, 0 if the algorithm converged and #' 1 if maxiter was reached} } #' #' @examples #' mat <- matrix(rpois(1000, lambda = 3), ncol=10) #' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10) #' ziber_out = suppressWarnings(fast_estimate_ziber(mat, #' bulk_model = TRUE, #' pos_controls = 1:10)) #' fast_estimate_ziber = function(x, fp_tresh = 0, gfeatM = NULL, bulk_model = FALSE, pos_controls = NULL, rate_tol = 0.01, maxiter = 100, verbose = FALSE){ Y = t(matrix(as.numeric( x > fp_tresh ),nrow = dim(x)[1])) if(is.null(gfeatM) & bulk_model){ x2 = x x2[x2 <= fp_tresh] = NA Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE)))) }else{ if(is.null(gfeatM)){ stop("No gene-level features specified") } Alpha = t(gfeatM) } # Prepare Unwanted Variation and Sample-Intrinsic Intercept Alpha = rbind(Alpha,rep(1,dim(x)[1])) K = dim(Alpha)[1] W = matrix(0,dim(x)[2],K) if(is.null(pos_controls)){ stop("Must supply positive controls genes to fit FNR characteristic") } # Fit W once using control genes cY = Y[,pos_controls] cAlpha = Alpha[,pos_controls] glm_pval = rep(NA,nrow(Y)) for (i in 1:nrow(Y)){ fit = suppressWarnings(glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),family=binomial(logit))) glm_pval[i] = summary(fit)$coef[,4][1] if((glm_pval[i] < .01) & fit$converged){ W[i,] = fit$coefficients } else { if(!fit$converged){ warning(paste0("Sample ",i," failed GLM fit, applying", " expression independent model.")) }else{ warning(paste0("Sample ",i," exhibits expression dependence", " consistent with null, applying expression ", "independent model.")) } # Only intercept with pseudocounts W[i,] = c(0,log((sum(cY[i,]) + 0.5)/(ncol(cY) + 1)) - log((sum(1-cY[i,]) + 0.5)/(ncol(cY) + 1))) } } fnr_character = .likfn(1,W,Alpha) f_fun = function(x,y,theta){ x - mean(1/(1+(1-y)*((1/x - 1)/theta))) } df_fun = function(x,y,theta){ 1 - mean((theta - y*theta)/(1 + y*(-1 + x) + (-1 + theta)*x)^2) } rates = rep(0.5,ncol(fnr_character)) rates[pos_controls] = 1 convergences = rep(0,ncol(fnr_character)) convergences[pos_controls] = NA for(i in which(!pos_controls)){ rate_est = rates[i] y = Y[,i] theta = 1 - fnr_character[,i] dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta) niter = 0 while(abs(dr) > rate_tol){ niter = niter + 1 rate_est = rate_est + dr dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta) if(niter >= maxiter){ convergences[i] = 1 break } } rates[i] = rate_est } # Prepare Gene-Intrinsic Intercept X = matrix(1,dim(x)[2],1) Beta = matrix(boot::logit(rates),1,dim(x)[1]) # Expression States Z = Y*0 Z[,pos_controls] = 1 Z[,!pos_controls] = .pzfn(Y[,!pos_controls], W,Alpha[,!pos_controls], X,Beta[,!pos_controls]) # Likelihood matA = .likfn(1,X,Beta[,!pos_controls]) # Expression matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) # Detection is_perfect = (matC == 0) | (matA == 0) | (matA == 1) EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) + sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) + sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE) return(list(W = W, X = X, Alpha = Alpha, Beta = Beta, fnr_character = t(fnr_character), p_nodrop = 1 - t((Y <= fp_tresh)*Z), expected_state = t(Z), loglik = EL2, convergence = convergences, glm_pval = glm_pval)) }