Browse code

Added fast ziber

mbcole authored on 12/05/2017 07:08:59
Showing 1 changed files

... ...
@@ -259,3 +259,165 @@ impute_expectation <- function(expression,impute_args) {
259 259
 impute_null <- function(expression,impute_args) {
260 260
   return(expression)
261 261
 }
262
+
263
+#' Fast parameter estimation of zero-inflated bernoulli model
264
+#' 
265
+#' This function implements Newton's method for solving zero of 
266
+#' Expectation-Maximization equation at the limit of parameter convergence: a 
267
+#' zero-inflated bernoulli model of transcript detection, modeling gene 
268
+#' expression state (off of on) as a bernoulli draw on a gene-specific 
269
+#' expression rate (Z in {0,1}). Detection conditioned on expression is a 
270
+#' logistic function of gene-level features. The bernoulli model is modeled 
271
+#' numerically by a logistic model with an intercept.
272
+#' 
273
+#' @param x matrix. An expression data matrix (genes in rows, cells in columns)
274
+#' @param fp_tresh numeric. Threshold for calling a positive detection (D = 1).
275
+#'   Default 0.
276
+#' @param gfeatM matrix. Numeric gene level determinants of drop-out (genes in 
277
+#'   rows, features in columns)
278
+#' @param bulk_model logical. Use median log-expression of gene in detected 
279
+#'   fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is 
280
+#'   specified.
281
+#' @param pos_controls logical. TRUE for all genes that are known to be 
282
+#'   expressed in all cells.
283
+#' @param rate_tol numeric. Convergence treshold on expression rates (0-1).
284
+#' @param maxiter numeric. The maximum number of steps per gene. Default
285
+#'   100.
286
+#' @param verbose logical. Whether or not to print the value of the likelihood 
287
+#'   at each iteration.
288
+#'   
289
+#' @export
290
+#' 
291
+#' @return a list with the following elements: \itemize{ \item{W}{ coefficients
292
+#'   of sample-specific logistic drop-out model } \item{Alpha}{ intercept and 
293
+#'   gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{ 
294
+#'   coefficient of gene-specific logistic expression model } 
295
+#'   \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)} 
296
+#'   \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in 
297
+#'   weighted PCA} \item{expected_state}{ the expected
298
+#'   value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood} 
299
+#'   \item{convergence}{for all genes, 0 if the algorithm converged and 1 if maxiter was 
300
+#'   reached} }
301
+#'   
302
+#' @examples
303
+#' mat <- matrix(rpois(1000, lambda = 3), ncol=10)
304
+#' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10)
305
+#' ziber_out = suppressWarnings(fast_estimate_ziber(mat,
306
+#'    bulk_model = TRUE,
307
+#'    pos_controls = 1:10))
308
+#' 
309
+
310
+fast_estimate_ziber = function(x, fp_tresh = 0, 
311
+                               gfeatM = NULL, bulk_model = FALSE, 
312
+                               pos_controls = NULL,
313
+                               rate_tol = 0.01, maxiter = 100,
314
+                               verbose = FALSE){
315
+  
316
+  Y = t(matrix(as.numeric( x > fp_tresh ),nrow = dim(x)[1]))
317
+  
318
+  if(is.null(gfeatM) & bulk_model){
319
+    x2 = x
320
+    x2[x2 <= fp_tresh] = NA
321
+    Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE))))
322
+  }else{
323
+    if(is.null(gfeatM)){
324
+      stop("No gene-level features specified")
325
+    }
326
+    Alpha = t(gfeatM)
327
+  }
328
+  
329
+  # Prepare Unwanted Variation and Sample-Intrinsic Intercept
330
+  Alpha = rbind(Alpha,rep(1,dim(x)[1]))
331
+  K = dim(Alpha)[1]
332
+  W = matrix(0,dim(x)[2],K)
333
+  
334
+  if(is.null(pos_controls)){
335
+    stop("Must supply positive controls genes to fit FNR characteristic")
336
+  }
337
+  
338
+  # Fit W once using control genes
339
+  cY = Y[,pos_controls]
340
+  cAlpha = Alpha[,pos_controls]
341
+  glm_pval = rep(NA,nrow(Y))
342
+  
343
+  for (i in 1:nrow(Y)){
344
+    
345
+    fit = suppressWarnings(glm(cbind(cY[i,],1-cY[i,]) ~ 0 +
346
+                                 t(cAlpha),family=binomial(logit)))
347
+    glm_pval[i] = summary(fit)$coef[,4][1]
348
+    
349
+    if((glm_pval[i] < .01) & fit$converged){
350
+      W[i,] = fit$coefficients
351
+    } else {
352
+      if(!fit$converged){
353
+        warning(paste0("Sample ",i," failed GLM fit, applying",
354
+                       " expression independent model."))
355
+      }else{
356
+        warning(paste0("Sample ",i," exhibits expression dependence",
357
+                       " consistent with null, applying expression ",
358
+                       "independent model."))
359
+      }
360
+      
361
+      # Only intercept with pseudocounts
362
+      W[i,] = c(0,log((sum(cY[i,]) + 0.5)/(ncol(cY) + 1)) -
363
+                  log((sum(1-cY[i,]) + 
364
+                         0.5)/(ncol(cY) + 1))) 
365
+    }
366
+  }
367
+  
368
+  fnr_character = .likfn(1,W,Alpha)
369
+  
370
+  f_fun = function(x,y,theta){
371
+    x - mean(1/(1+(1-y)*((1/x - 1)/theta)))
372
+  }
373
+  df_fun = function(x,y,theta){
374
+    1 - mean((theta - y*theta)/(1 + y*(-1 + x) + (-1 + theta)*x)^2)
375
+  }
376
+  
377
+  rates = rep(0.5,ncol(fnr_character))
378
+  rates[pos_controls] = 1
379
+  convergences = rep(0,ncol(fnr_character))
380
+  convergences[pos_controls] = NA
381
+  for(i in which(!pos_controls)){
382
+    rate_est = rates[i]
383
+    y = Y[,i]
384
+    theta = 1 - fnr_character[,i]
385
+    dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta)
386
+    niter = 0
387
+    while(abs(dr) > rate_tol){
388
+      niter = niter + 1
389
+      rate_est = rate_est + dr
390
+      dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta)
391
+      if(niter >= maxiter){
392
+        convergences[i] = 1
393
+        break
394
+      }
395
+    }
396
+    rates[i] = rate_est
397
+  }
398
+  
399
+  # Prepare Gene-Intrinsic Intercept
400
+  X = matrix(1,dim(x)[2],1)
401
+  Beta = matrix(boot::logit(rates),1,dim(x)[1])
402
+  
403
+  # Expression States
404
+  Z = Y*0
405
+  Z[,pos_controls] = 1
406
+  Z[,!pos_controls] = .pzfn(Y[,!pos_controls],
407
+                            W,Alpha[,!pos_controls],
408
+                            X,Beta[,!pos_controls])
409
+  
410
+  # Likelihood
411
+  matA = .likfn(1,X,Beta[,!pos_controls]) # Expression
412
+  matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) # Detection
413
+  is_perfect = (matC == 0) | (matA == 0) | (matA == 1)
414
+  EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) + 
415
+    sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) + 
416
+    sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE)
417
+  
418
+  return(list(W = W, X = X, Alpha = Alpha, Beta = Beta,
419
+              fnr_character = t(fnr_character),
420
+              p_nodrop = 1 - t((Y <= fp_tresh)*Z), 
421
+              expected_state = t(Z), loglik = EL2,
422
+              convergence = convergences, glm_pval = glm_pval))
423
+}
262 424
\ No newline at end of file