#### Merge branch 'develop'

Davide Risso authored on 30/06/2017 20:14:32
Showing 4 changed files

 ... ... @@ -12,7 +12,7 @@ Author: Michael Cole [aut, cre, cph], Davide Risso [aut, cph] 12 12  Maintainer: Michael Cole  13 13  License: Artistic-2.0 14 14  Depends: 15 - R (>= 3.4), 15 + R (>= 3.3), 16 16  methods, 17 17  SummarizedExperiment 18 18  Imports:
 ... ... @@ -13,6 +13,7 @@ export(biplot_color) 13 13  export(biplot_interactive) 14 14  export(estimate_ziber) 15 15  export(factor_sample_filter) 16 +export(fast_estimate_ziber) 16 17  export(impute_expectation) 17 18  export(impute_null) 18 19  export(lm_adjust)
 ... ... @@ -259,3 +259,165 @@ impute_expectation <- function(expression,impute_args) { 259 259  impute_null <- function(expression,impute_args) { 260 260  return(expression) 261 261  } 262 + 263 +#' Fast parameter estimation of zero-inflated bernoulli model 264 +#'  265 +#' This function implements Newton's method for solving zero of  266 +#' Expectation-Maximization equation at the limit of parameter convergence: a  267 +#' zero-inflated bernoulli model of transcript detection, modeling gene  268 +#' expression state (off of on) as a bernoulli draw on a gene-specific  269 +#' expression rate (Z in {0,1}). Detection conditioned on expression is a  270 +#' logistic function of gene-level features. The bernoulli model is modeled  271 +#' numerically by a logistic model with an intercept. 272 +#'  273 +#' @param x matrix. An expression data matrix (genes in rows, cells in columns) 274 +#' @param fp_tresh numeric. Threshold for calling a positive detection (D = 1). 275 +#' Default 0. 276 +#' @param gfeatM matrix. Numeric gene level determinants of drop-out (genes in  277 +#' rows, features in columns) 278 +#' @param bulk_model logical. Use median log-expression of gene in detected  279 +#' fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is  280 +#' specified. 281 +#' @param pos_controls logical. TRUE for all genes that are known to be  282 +#' expressed in all cells. 283 +#' @param rate_tol numeric. Convergence treshold on expression rates (0-1). 284 +#' @param maxiter numeric. The maximum number of steps per gene. Default 285 +#' 100. 286 +#' @param verbose logical. Whether or not to print the value of the likelihood  287 +#' at each iteration. 288 +#'  289 +#' @export 290 +#'  291 +#' @return a list with the following elements: \itemize{ \item{W}{ coefficients 292 +#' of sample-specific logistic drop-out model } \item{Alpha}{ intercept and  293 +#' gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{  294 +#' coefficient of gene-specific logistic expression model }  295 +#' \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)}  296 +#' \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in  297 +#' weighted PCA} \item{expected_state}{ the expected 298 +#' value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood}  299 +#' \item{convergence}{for all genes, 0 if the algorithm converged and 1 if maxiter was  300 +#' reached} } 301 +#'  302 +#' @examples 303 +#' mat <- matrix(rpois(1000, lambda = 3), ncol=10) 304 +#' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10) 305 +#' ziber_out = suppressWarnings(fast_estimate_ziber(mat, 306 +#' bulk_model = TRUE, 307 +#' pos_controls = 1:10)) 308 +#'  309 + 310 +fast_estimate_ziber = function(x, fp_tresh = 0,  311 + gfeatM = NULL, bulk_model = FALSE,  312 + pos_controls = NULL, 313 + rate_tol = 0.01, maxiter = 100, 314 + verbose = FALSE){ 315 +  316 + Y = t(matrix(as.numeric( x > fp_tresh ),nrow = dim(x)[1])) 317 +  318 + if(is.null(gfeatM) & bulk_model){ 319 + x2 = x 320 + x2[x2 <= fp_tresh] = NA 321 + Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE)))) 322 + }else{ 323 + if(is.null(gfeatM)){ 324 + stop("No gene-level features specified") 325 + } 326 + Alpha = t(gfeatM) 327 + } 328 +  329 + # Prepare Unwanted Variation and Sample-Intrinsic Intercept 330 + Alpha = rbind(Alpha,rep(1,dim(x)[1])) 331 + K = dim(Alpha)[1] 332 + W = matrix(0,dim(x)[2],K) 333 +  334 + if(is.null(pos_controls)){ 335 + stop("Must supply positive controls genes to fit FNR characteristic") 336 + } 337 +  338 + # Fit W once using control genes 339 + cY = Y[,pos_controls] 340 + cAlpha = Alpha[,pos_controls] 341 + glm_pval = rep(NA,nrow(Y)) 342 +  343 + for (i in 1:nrow(Y)){ 344 +  345 + fit = suppressWarnings(glm(cbind(cY[i,],1-cY[i,]) ~ 0 + 346 + t(cAlpha),family=binomial(logit))) 347 + glm_pval[i] = summary(fit)$coef[,4][1] 348 +  349 + if((glm_pval[i] < .01) & fit$converged){ 350 + W[i,] = fit$coefficients 351 + } else { 352 + if(!fit$converged){ 353 + warning(paste0("Sample ",i," failed GLM fit, applying", 354 + " expression independent model.")) 355 + }else{ 356 + warning(paste0("Sample ",i," exhibits expression dependence", 357 + " consistent with null, applying expression ", 358 + "independent model.")) 359 + } 360 +  361 + # Only intercept with pseudocounts 362 + W[i,] = c(0,log((sum(cY[i,]) + 0.5)/(ncol(cY) + 1)) - 363 + log((sum(1-cY[i,]) +  364 + 0.5)/(ncol(cY) + 1)))  365 + } 366 + } 367 +  368 + fnr_character = .likfn(1,W,Alpha) 369 +  370 + f_fun = function(x,y,theta){ 371 + x - mean(1/(1+(1-y)*((1/x - 1)/theta))) 372 + } 373 + df_fun = function(x,y,theta){ 374 + 1 - mean((theta - y*theta)/(1 + y*(-1 + x) + (-1 + theta)*x)^2) 375 + } 376 +  377 + rates = rep(0.5,ncol(fnr_character)) 378 + rates[pos_controls] = 1 379 + convergences = rep(0,ncol(fnr_character)) 380 + convergences[pos_controls] = NA 381 + for(i in which(!pos_controls)){ 382 + rate_est = rates[i] 383 + y = Y[,i] 384 + theta = 1 - fnr_character[,i] 385 + dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta) 386 + niter = 0 387 + while(abs(dr) > rate_tol){ 388 + niter = niter + 1 389 + rate_est = rate_est + dr 390 + dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta) 391 + if(niter >= maxiter){ 392 + convergences[i] = 1 393 + break 394 + } 395 + } 396 + rates[i] = rate_est 397 + } 398 +  399 + # Prepare Gene-Intrinsic Intercept 400 + X = matrix(1,dim(x)[2],1) 401 + Beta = matrix(boot::logit(rates),1,dim(x)[1]) 402 +  403 + # Expression States 404 + Z = Y*0 405 + Z[,pos_controls] = 1 406 + Z[,!pos_controls] = .pzfn(Y[,!pos_controls], 407 + W,Alpha[,!pos_controls], 408 + X,Beta[,!pos_controls]) 409 +  410 + # Likelihood 411 + matA = .likfn(1,X,Beta[,!pos_controls]) # Expression 412 + matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) # Detection 413 + is_perfect = (matC == 0) | (matA == 0) | (matA == 1) 414 + EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) +  415 + sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) +  416 + sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE) 417 +  418 + return(list(W = W, X = X, Alpha = Alpha, Beta = Beta, 419 + fnr_character = t(fnr_character), 420 + p_nodrop = 1 - t((Y <= fp_tresh)*Z),  421 + expected_state = t(Z), loglik = EL2, 422 + convergence = convergences, glm_pval = glm_pval)) 423 +} 262 424 \ No newline at end of file
 263 425 new file mode 100644 ... ... @@ -0,0 +1,62 @@ 1 +% Generated by roxygen2: do not edit by hand 2 +% Please edit documentation in R/ziber.R 3 +\name{fast_estimate_ziber} 4 +\alias{fast_estimate_ziber} 5 +\title{Fast parameter estimation of zero-inflated bernoulli model} 6 +\usage{ 7 +fast_estimate_ziber(x, fp_tresh = 0, gfeatM = NULL, bulk_model = FALSE, 8 + pos_controls = NULL, rate_tol = 0.01, maxiter = 100, verbose = FALSE) 9 +} 10 +\arguments{ 11 +\item{x}{matrix. An expression data matrix (genes in rows, cells in columns)} 12 + 13 +\item{fp_tresh}{numeric. Threshold for calling a positive detection (D = 1). 14 +Default 0.} 15 + 16 +\item{gfeatM}{matrix. Numeric gene level determinants of drop-out (genes in  17 +rows, features in columns)} 18 + 19 +\item{bulk_model}{logical. Use median log-expression of gene in detected  20 +fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is  21 +specified.} 22 + 23 +\item{pos_controls}{logical. TRUE for all genes that are known to be  24 +expressed in all cells.} 25 + 26 +\item{rate_tol}{numeric. Convergence treshold on expression rates (0-1).} 27 + 28 +\item{maxiter}{numeric. The maximum number of steps per gene. Default 29 +100.} 30 + 31 +\item{verbose}{logical. Whether or not to print the value of the likelihood  32 +at each iteration.} 33 +} 34 +\value{ 35 +a list with the following elements: \itemize{ \item{W}{ coefficients 36 + of sample-specific logistic drop-out model } \item{Alpha}{ intercept and  37 + gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{  38 + coefficient of gene-specific logistic expression model }  39 + \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)}  40 + \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in  41 + weighted PCA} \item{expected_state}{ the expected 42 + value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood}  43 + \item{convergence}{for all genes, 0 if the algorithm converged and 1 if maxiter was  44 + reached} } 45 +} 46 +\description{ 47 +This function implements Newton's method for solving zero of  48 +Expectation-Maximization equation at the limit of parameter convergence: a  49 +zero-inflated bernoulli model of transcript detection, modeling gene  50 +expression state (off of on) as a bernoulli draw on a gene-specific  51 +expression rate (Z in {0,1}). Detection conditioned on expression is a  52 +logistic function of gene-level features. The bernoulli model is modeled  53 +numerically by a logistic model with an intercept. 54 +} 55 +\examples{ 56 +mat <- matrix(rpois(1000, lambda = 3), ncol=10) 57 +mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10) 58 +ziber_out = suppressWarnings(fast_estimate_ziber(mat, 59 + bulk_model = TRUE, 60 + pos_controls = 1:10)) 61 + 62 +}