#
#' Likelihood Function of the Logistic Model
#'
#' @keywords internal
#'
#' @param Z data matrix
#' @param X sample-level values
#' @param Beta gene-level values
#'
.likfn = function(Z,X,Beta){
return((1-Z) + (2*Z - 1)/(1+exp(-X %*% Beta)))
}

#
#' Posterior probability of detection
#'
#' @keywords internal
#'
#' @param Y detection matrix.
#' @param W sample-level drop-out coefficients.
#' @param Alpha gene-level drop-out features.
#' @param X sample-level expression features.
#' @param Beta gene-level sample coefficients.
#'
.pzfn = function(Y,W,Alpha,X,Beta){

matA = .likfn(1,X,Beta) # Expression
matB = matA*.likfn(Y,W,Alpha) # Failure

o = matB/(matB + (Y == 0)*(1-matA))
o[Y > 0] = 1

return(o)
}

#' Parameter estimation of zero-inflated bernoulli model
#'
#' This function implements an expectation-maximization algorithm for a
#' zero-inflated bernoulli model of transcript detection, modeling gene
#' expression state (off of on) as a bernoulli draw on a gene-specific
#' expression rate (Z in {0,1}). Detection conditioned on expression is a
#' logistic function of gene-level features. The bernoulli model is modeled
#' numerically by a logistic model with an intercept.
#'
#' @param x matrix. An expression data matrix (genes in rows, cells in columns)
#' @param fp_tresh numeric. Threshold for calling a positive detection (D = 1).
#'   Default 0.
#' @param gfeatM matrix. Numeric gene level determinants of drop-out (genes in
#'   rows, features in columns)
#' @param bulk_model logical. Use median log-expression of gene in detected
#'   fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is
#'   specified.
#' @param pos_controls logical. TRUE for all genes that are known to be
#'   expressed in all cells.
#' @param em_tol numeric. Convergence treshold on log-likelihood.
#' @param maxiter numeric. The maximum number of iterations. Default 100.
#' @param verbose logical. Whether or not to print the value of the likelihood
#'   at each iteration.
#'
#' @export
#'
#' @return a list with the following elements: \itemize{ \item{W}{ coefficients
#'   of sample-specific logistic drop-out model } \item{Alpha}{ intercept and
#'   gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{
#'   coefficient of gene-specific logistic expression model }
#'   \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)}
#'   \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in
#'   weighted PCA} \item{expected_state}{ the expected
#'   value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood}
#'   \item{convergence}{ 0 if the algorithm converged and 1 if maxiter was
#'   reached} }
#'
#' @examples
#' mat <- matrix(rpois(1000, lambda = 3), ncol=10)
#' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10)
#' ziber_out = suppressWarnings(estimate_ziber(mat,
#'    bulk_model = TRUE,
#'    pos_controls = 1:10))
#'

estimate_ziber = function(x, fp_tresh = 0,
gfeatM = NULL, bulk_model = FALSE,
pos_controls = NULL,
em_tol = 0.01, maxiter = 100,
verbose = FALSE){

Y = t(matrix(as.numeric( x > fp_tresh ),nrow = dim(x)))

if(is.null(gfeatM) & bulk_model){
x2 = x
x2[x2 <= fp_tresh] = NA
Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE))))
}else{
if(is.null(gfeatM)){
stop("No gene-level features specified")
}
Alpha = t(gfeatM)
}

# Prepare Gene-Intrinsic Intercept
X = matrix(1,dim(x),1)
Beta = matrix(0,1,dim(x))

# Prepare Unwanted Variation and Sample-Intrinsic Intercept
Alpha = rbind(Alpha,rep(1,dim(x)))
K = dim(Alpha)
W = matrix(0,dim(x),K)

# Initialize Z
Z = .pzfn(Y,W,Alpha,X,Beta)

if(is.null(pos_controls)){
stop("Must supply positive controls genes to fit FNR characteristic")
}

# Fit W once using control genes
cY = Y[,pos_controls]
cAlpha = Alpha[,pos_controls]
glm_pval = rep(NA,nrow(Z))

for (i in 1:dim(Z)){
fit = suppressWarnings(glm(cbind(cY[i,],1-cY[i,]) ~ 0 +
t(cAlpha),family=binomial(logit)))
glm_pval[i] = summary(fit)$coef[,4] if((glm_pval[i] < .01) & fit$converged){
W[i,] = fit$coefficients } else { if(!fit$converged){
warning(paste0("Sample ",i," failed GLM fit, applying",
" expression independent model."))
}else{
warning(paste0("Sample ",i," exhibits expression dependence",
" consistent with null, applying expression ",
"independent model."))
}

# Only intercept with pseudocounts
W[i,] = c(0,log((sum(cY[i,]) + 0.5)/(ncol(cY) + 1)) -
log((sum(1-cY[i,]) +
0.5)/(ncol(cY) + 1)))
}
}
Beta[,pos_controls] = NA
Z[,pos_controls] = 1

matA = .likfn(1,X,Beta[,!pos_controls]) # Expression
matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) # Detection

is_perfect = (matC == 0) | (matA == 0) | (matA == 1)

EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) +
sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) +
sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE)

if(verbose){print(EL2)}

EL1 = 2*EL2

niter = 1
while( is.infinite(EL1) | (EL2 - EL1) > em_tol ) {

EL1 = EL2

# Update Beta
cmz = colMeans(Z[,!pos_controls])
Beta[,!pos_controls] = t(matrix(log(cmz) - log(1-cmz)))

# Update Z
Z[,!pos_controls] = .pzfn(Y[,!pos_controls],
W,Alpha[,!pos_controls],
X,Beta[,!pos_controls])

matA = .likfn(1,X,Beta[,!pos_controls])
matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls])

is_perfect = (matC == 0) | (matA == 0) | (matA == 1)

EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) +
sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) +
sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE)

if(verbose){print(EL2)}
niter = niter + 1
if(niter >= maxiter){
return(list(W = W, X = X, Alpha = Alpha, Beta = Beta,
fnr_character = t(.likfn(1,W,Alpha)),
p_nodrop = 1 - t((Y <= fp_tresh)*Z),
expected_state = t(Z), loglik = EL2,
convergence = 1,glm_pval = glm_pval))    }
}

return(list(W = W, X = X, Alpha = Alpha, Beta = Beta,
fnr_character = t(.likfn(1,W,Alpha)),
p_nodrop = 1 - t((Y <= fp_tresh)*Z),
expected_state = t(Z), loglik = EL2,
convergence = 0, glm_pval = glm_pval))
}

#' Imputation of zero abundance based on general zero-inflated model
#'
#' This function is used to impute the data, weighted by probability of data
#' coming from the zero-inflation part of the distribution.
#'
#' @details The imputation is carried out with the following formula:
#'   y_{ij}* = y_{ij} * Pr( No Drop | y_{ij}) + mu_{i} * Pr( Drop | y_{ij}).
#'
#'   impute_args must contain 2 elements: 1) p_nodrop = posterior probability
#'   of data not having resulted from drop-out  (genes in rows, cells in
#'   columns) 2) mu = expected expression of dropped data (genes in rows, cells
#'   in columns)
#'
#' @export
#'
#' @param expression the data matrix (genes in rows, cells in columns)
#' @param impute_args arguments for imputation (see details)
#'
#' @return the imputed expression matrix.
#'
#' @examples
#' mat <- matrix(rpois(1000, lambda = 3), ncol=10)
#' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10)
#'
#' mu = matrix(rep(3/ppois(0,lambda = 3,lower.tail = FALSE),1000),ncol = 10)
#'
#' p_false = 1 / ( 1 + ppois(0, lambda = 3, lower.tail = TRUE ) /
#'     (0.01 * ppois(0, lambda = 3, lower.tail = FALSE) ) )
#'
#' p_nodrop = matrix(rep(1-p_false,1000),ncol = 10)
#' p_nodrop[mat > 0] = 1
#'
#' impute_args = list()
#' impute_args = list(mu = mu, p_nodrop = p_nodrop)
#'
#' imat = impute_expectation(mat,impute_args = impute_args)
#'

impute_expectation <- function(expression,impute_args) {
imputed <- expression * impute_args$p_nodrop + impute_args$mu * (1 - impute_args$p_nodrop) return(imputed) } #' Null or no-op imputation #' #' @export #' #' @param expression the data matrix (genes in rows, cells in columns) #' @param impute_args arguments for imputation (not used) #' #' @return the imputed expression matrix. #' #' @examples #' mat <- matrix(rpois(1000, lambda = 5), ncol=10) #' imat = impute_null(mat) #' impute_null <- function(expression,impute_args) { return(expression) } #' Fast parameter estimation of zero-inflated bernoulli model #' #' This function implements Newton's method for solving zero of #' Expectation-Maximization equation at the limit of parameter convergence: a #' zero-inflated bernoulli model of transcript detection, modeling gene #' expression state (off of on) as a bernoulli draw on a gene-specific #' expression rate (Z in {0,1}). Detection conditioned on expression is a #' logistic function of gene-level features. The bernoulli model is modeled #' numerically by a logistic model with an intercept. #' #' @param x matrix. An expression data matrix (genes in rows, cells in columns) #' @param fp_tresh numeric. Threshold for calling a positive detection (D = 1). #' Default 0. #' @param gfeatM matrix. Numeric gene level determinants of drop-out (genes in #' rows, features in columns) #' @param bulk_model logical. Use median log-expression of gene in detected #' fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is #' specified. #' @param pos_controls logical. TRUE for all genes that are known to be #' expressed in all cells. #' @param rate_tol numeric. Convergence treshold on expression rates (0-1). #' @param maxiter numeric. The maximum number of steps per gene. Default #' 100. #' @param verbose logical. Whether or not to print the value of the likelihood #' at each iteration. #' #' @export #' #' @return a list with the following elements: \itemize{ \item{W}{ coefficients #' of sample-specific logistic drop-out model } \item{Alpha}{ intercept and #' gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{ #' coefficient of gene-specific logistic expression model } #' \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)} #' \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in #' weighted PCA} \item{expected_state}{ the expected #' value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood} #' \item{convergence}{for all genes, 0 if the algorithm converged and #' 1 if maxiter was reached} } #' #' @examples #' mat <- matrix(rpois(1000, lambda = 3), ncol=10) #' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10) #' ziber_out = suppressWarnings(fast_estimate_ziber(mat, #' bulk_model = TRUE, #' pos_controls = 1:10)) #' fast_estimate_ziber = function(x, fp_tresh = 0, gfeatM = NULL, bulk_model = FALSE, pos_controls = NULL, rate_tol = 0.01, maxiter = 100, verbose = FALSE){ Y = t(matrix(as.numeric( x > fp_tresh ),nrow = dim(x))) if(is.null(gfeatM) & bulk_model){ x2 = x x2[x2 <= fp_tresh] = NA Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE)))) }else{ if(is.null(gfeatM)){ stop("No gene-level features specified") } Alpha = t(gfeatM) } # Prepare Unwanted Variation and Sample-Intrinsic Intercept Alpha = rbind(Alpha,rep(1,dim(x))) K = dim(Alpha) W = matrix(0,dim(x),K) if(is.null(pos_controls)){ stop("Must supply positive controls genes to fit FNR characteristic") } # Fit W once using control genes cY = Y[,pos_controls] cAlpha = Alpha[,pos_controls] glm_pval = rep(NA,nrow(Y)) for (i in 1:nrow(Y)){ fit = suppressWarnings(glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),family=binomial(logit))) glm_pval[i] = summary(fit)$coef[,4]

if((glm_pval[i] < .01) & fit$converged){ W[i,] = fit$coefficients
} else {
if(!fit\$converged){
warning(paste0("Sample ",i," failed GLM fit, applying",
" expression independent model."))
}else{
warning(paste0("Sample ",i," exhibits expression dependence",
" consistent with null, applying expression ",
"independent model."))
}

# Only intercept with pseudocounts
W[i,] = c(0,log((sum(cY[i,]) + 0.5)/(ncol(cY) + 1)) -
log((sum(1-cY[i,]) +
0.5)/(ncol(cY) + 1)))
}
}

fnr_character = .likfn(1,W,Alpha)

f_fun = function(x,y,theta){
x - mean(1/(1+(1-y)*((1/x - 1)/theta)))
}
df_fun = function(x,y,theta){
1 - mean((theta - y*theta)/(1 + y*(-1 + x) + (-1 + theta)*x)^2)
}

rates = rep(0.5,ncol(fnr_character))
rates[pos_controls] = 1
convergences = rep(0,ncol(fnr_character))
convergences[pos_controls] = NA
for(i in which(!pos_controls)){
rate_est = rates[i]
y = Y[,i]
theta = 1 - fnr_character[,i]
dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta)
niter = 0
while(abs(dr) > rate_tol){
niter = niter + 1
rate_est = rate_est + dr
dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta)
if(niter >= maxiter){
convergences[i] = 1
break
}
}
rates[i] = rate_est
}

# Prepare Gene-Intrinsic Intercept
X = matrix(1,dim(x),1)
Beta = matrix(boot::logit(rates),1,dim(x))

# Expression States
Z = Y*0
Z[,pos_controls] = 1
Z[,!pos_controls] = .pzfn(Y[,!pos_controls],
W,Alpha[,!pos_controls],
X,Beta[,!pos_controls])

# Likelihood
matA = .likfn(1,X,Beta[,!pos_controls]) # Expression
matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) # Detection
is_perfect = (matC == 0) | (matA == 0) | (matA == 1)
EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) +
sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) +
sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE)

return(list(W = W, X = X, Alpha = Alpha, Beta = Beta,
fnr_character = t(fnr_character),
p_nodrop = 1 - t((Y <= fp_tresh)*Z),
expected_state = t(Z), loglik = EL2,
convergence = convergences, glm_pval = glm_pval))
}