Browse code

Merge branch 'develop'

Davide Risso authored on 30/06/2017 20:14:32
Showing 4 changed files

... ...
@@ -12,7 +12,7 @@ Author: Michael Cole [aut, cre, cph], Davide Risso [aut, cph]
12 12
 Maintainer: Michael Cole <mbeloc@gmail.com>
13 13
 License: Artistic-2.0
14 14
 Depends:
15
-  R (>= 3.4),
15
+  R (>= 3.3),
16 16
   methods,
17 17
   SummarizedExperiment
18 18
 Imports:
... ...
@@ -13,6 +13,7 @@ export(biplot_color)
13 13
 export(biplot_interactive)
14 14
 export(estimate_ziber)
15 15
 export(factor_sample_filter)
16
+export(fast_estimate_ziber)
16 17
 export(impute_expectation)
17 18
 export(impute_null)
18 19
 export(lm_adjust)
... ...
@@ -259,3 +259,165 @@ impute_expectation <- function(expression,impute_args) {
259 259
 impute_null <- function(expression,impute_args) {
260 260
   return(expression)
261 261
 }
262
+
263
+#' Fast parameter estimation of zero-inflated bernoulli model
264
+#' 
265
+#' This function implements Newton's method for solving zero of 
266
+#' Expectation-Maximization equation at the limit of parameter convergence: a 
267
+#' zero-inflated bernoulli model of transcript detection, modeling gene 
268
+#' expression state (off of on) as a bernoulli draw on a gene-specific 
269
+#' expression rate (Z in {0,1}). Detection conditioned on expression is a 
270
+#' logistic function of gene-level features. The bernoulli model is modeled 
271
+#' numerically by a logistic model with an intercept.
272
+#' 
273
+#' @param x matrix. An expression data matrix (genes in rows, cells in columns)
274
+#' @param fp_tresh numeric. Threshold for calling a positive detection (D = 1).
275
+#'   Default 0.
276
+#' @param gfeatM matrix. Numeric gene level determinants of drop-out (genes in 
277
+#'   rows, features in columns)
278
+#' @param bulk_model logical. Use median log-expression of gene in detected 
279
+#'   fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is 
280
+#'   specified.
281
+#' @param pos_controls logical. TRUE for all genes that are known to be 
282
+#'   expressed in all cells.
283
+#' @param rate_tol numeric. Convergence treshold on expression rates (0-1).
284
+#' @param maxiter numeric. The maximum number of steps per gene. Default
285
+#'   100.
286
+#' @param verbose logical. Whether or not to print the value of the likelihood 
287
+#'   at each iteration.
288
+#'   
289
+#' @export
290
+#' 
291
+#' @return a list with the following elements: \itemize{ \item{W}{ coefficients
292
+#'   of sample-specific logistic drop-out model } \item{Alpha}{ intercept and 
293
+#'   gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{ 
294
+#'   coefficient of gene-specific logistic expression model } 
295
+#'   \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)} 
296
+#'   \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in 
297
+#'   weighted PCA} \item{expected_state}{ the expected
298
+#'   value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood} 
299
+#'   \item{convergence}{for all genes, 0 if the algorithm converged and 1 if maxiter was 
300
+#'   reached} }
301
+#'   
302
+#' @examples
303
+#' mat <- matrix(rpois(1000, lambda = 3), ncol=10)
304
+#' mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10)
305
+#' ziber_out = suppressWarnings(fast_estimate_ziber(mat,
306
+#'    bulk_model = TRUE,
307
+#'    pos_controls = 1:10))
308
+#' 
309
+
310
+fast_estimate_ziber = function(x, fp_tresh = 0, 
311
+                               gfeatM = NULL, bulk_model = FALSE, 
312
+                               pos_controls = NULL,
313
+                               rate_tol = 0.01, maxiter = 100,
314
+                               verbose = FALSE){
315
+  
316
+  Y = t(matrix(as.numeric( x > fp_tresh ),nrow = dim(x)[1]))
317
+  
318
+  if(is.null(gfeatM) & bulk_model){
319
+    x2 = x
320
+    x2[x2 <= fp_tresh] = NA
321
+    Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE))))
322
+  }else{
323
+    if(is.null(gfeatM)){
324
+      stop("No gene-level features specified")
325
+    }
326
+    Alpha = t(gfeatM)
327
+  }
328
+  
329
+  # Prepare Unwanted Variation and Sample-Intrinsic Intercept
330
+  Alpha = rbind(Alpha,rep(1,dim(x)[1]))
331
+  K = dim(Alpha)[1]
332
+  W = matrix(0,dim(x)[2],K)
333
+  
334
+  if(is.null(pos_controls)){
335
+    stop("Must supply positive controls genes to fit FNR characteristic")
336
+  }
337
+  
338
+  # Fit W once using control genes
339
+  cY = Y[,pos_controls]
340
+  cAlpha = Alpha[,pos_controls]
341
+  glm_pval = rep(NA,nrow(Y))
342
+  
343
+  for (i in 1:nrow(Y)){
344
+    
345
+    fit = suppressWarnings(glm(cbind(cY[i,],1-cY[i,]) ~ 0 +
346
+                                 t(cAlpha),family=binomial(logit)))
347
+    glm_pval[i] = summary(fit)$coef[,4][1]
348
+    
349
+    if((glm_pval[i] < .01) & fit$converged){
350
+      W[i,] = fit$coefficients
351
+    } else {
352
+      if(!fit$converged){
353
+        warning(paste0("Sample ",i," failed GLM fit, applying",
354
+                       " expression independent model."))
355
+      }else{
356
+        warning(paste0("Sample ",i," exhibits expression dependence",
357
+                       " consistent with null, applying expression ",
358
+                       "independent model."))
359
+      }
360
+      
361
+      # Only intercept with pseudocounts
362
+      W[i,] = c(0,log((sum(cY[i,]) + 0.5)/(ncol(cY) + 1)) -
363
+                  log((sum(1-cY[i,]) + 
364
+                         0.5)/(ncol(cY) + 1))) 
365
+    }
366
+  }
367
+  
368
+  fnr_character = .likfn(1,W,Alpha)
369
+  
370
+  f_fun = function(x,y,theta){
371
+    x - mean(1/(1+(1-y)*((1/x - 1)/theta)))
372
+  }
373
+  df_fun = function(x,y,theta){
374
+    1 - mean((theta - y*theta)/(1 + y*(-1 + x) + (-1 + theta)*x)^2)
375
+  }
376
+  
377
+  rates = rep(0.5,ncol(fnr_character))
378
+  rates[pos_controls] = 1
379
+  convergences = rep(0,ncol(fnr_character))
380
+  convergences[pos_controls] = NA
381
+  for(i in which(!pos_controls)){
382
+    rate_est = rates[i]
383
+    y = Y[,i]
384
+    theta = 1 - fnr_character[,i]
385
+    dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta)
386
+    niter = 0
387
+    while(abs(dr) > rate_tol){
388
+      niter = niter + 1
389
+      rate_est = rate_est + dr
390
+      dr = -f_fun(rate_est,y,theta)/df_fun(rate_est,y,theta)
391
+      if(niter >= maxiter){
392
+        convergences[i] = 1
393
+        break
394
+      }
395
+    }
396
+    rates[i] = rate_est
397
+  }
398
+  
399
+  # Prepare Gene-Intrinsic Intercept
400
+  X = matrix(1,dim(x)[2],1)
401
+  Beta = matrix(boot::logit(rates),1,dim(x)[1])
402
+  
403
+  # Expression States
404
+  Z = Y*0
405
+  Z[,pos_controls] = 1
406
+  Z[,!pos_controls] = .pzfn(Y[,!pos_controls],
407
+                            W,Alpha[,!pos_controls],
408
+                            X,Beta[,!pos_controls])
409
+  
410
+  # Likelihood
411
+  matA = .likfn(1,X,Beta[,!pos_controls]) # Expression
412
+  matC = .likfn(Y[,!pos_controls],W,Alpha[,!pos_controls]) # Detection
413
+  is_perfect = (matC == 0) | (matA == 0) | (matA == 1)
414
+  EL2 = sum((Z[,!pos_controls]*log( matC ))[!is_perfect],na.rm = FALSE) + 
415
+    sum((Z[,!pos_controls]*log( matA ))[!is_perfect],na.rm = FALSE) + 
416
+    sum(((1-Z[,!pos_controls])*log( 1 - matA ))[!is_perfect],na.rm = FALSE)
417
+  
418
+  return(list(W = W, X = X, Alpha = Alpha, Beta = Beta,
419
+              fnr_character = t(fnr_character),
420
+              p_nodrop = 1 - t((Y <= fp_tresh)*Z), 
421
+              expected_state = t(Z), loglik = EL2,
422
+              convergence = convergences, glm_pval = glm_pval))
423
+}
262 424
\ No newline at end of file
263 425
new file mode 100644
... ...
@@ -0,0 +1,62 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/ziber.R
3
+\name{fast_estimate_ziber}
4
+\alias{fast_estimate_ziber}
5
+\title{Fast parameter estimation of zero-inflated bernoulli model}
6
+\usage{
7
+fast_estimate_ziber(x, fp_tresh = 0, gfeatM = NULL, bulk_model = FALSE,
8
+  pos_controls = NULL, rate_tol = 0.01, maxiter = 100, verbose = FALSE)
9
+}
10
+\arguments{
11
+\item{x}{matrix. An expression data matrix (genes in rows, cells in columns)}
12
+
13
+\item{fp_tresh}{numeric. Threshold for calling a positive detection (D = 1).
14
+Default 0.}
15
+
16
+\item{gfeatM}{matrix. Numeric gene level determinants of drop-out (genes in 
17
+rows, features in columns)}
18
+
19
+\item{bulk_model}{logical. Use median log-expression of gene in detected 
20
+fraction as sole gene-level feature. Default FALSE. Ignored if gfeatM is 
21
+specified.}
22
+
23
+\item{pos_controls}{logical. TRUE for all genes that are known to be 
24
+expressed in all cells.}
25
+
26
+\item{rate_tol}{numeric. Convergence treshold on expression rates (0-1).}
27
+
28
+\item{maxiter}{numeric. The maximum number of steps per gene. Default
29
+100.}
30
+
31
+\item{verbose}{logical. Whether or not to print the value of the likelihood 
32
+at each iteration.}
33
+}
34
+\value{
35
+a list with the following elements: \itemize{ \item{W}{ coefficients
36
+  of sample-specific logistic drop-out model } \item{Alpha}{ intercept and 
37
+  gene-level parameter matrix } \item{X}{ intercept } \item{Beta}{ 
38
+  coefficient of gene-specific logistic expression model } 
39
+  \item{fnr_character}{ the probability, per gene, of P(D=0|E=1)} 
40
+  \item{p_nodrop}{ 1 - the probability P(drop|Y), useful as weights in 
41
+  weighted PCA} \item{expected_state}{ the expected
42
+  value E[Z] (1 = "on")} \item{loglik}{ the log-likelihood} 
43
+  \item{convergence}{for all genes, 0 if the algorithm converged and 1 if maxiter was 
44
+  reached} }
45
+}
46
+\description{
47
+This function implements Newton's method for solving zero of 
48
+Expectation-Maximization equation at the limit of parameter convergence: a 
49
+zero-inflated bernoulli model of transcript detection, modeling gene 
50
+expression state (off of on) as a bernoulli draw on a gene-specific 
51
+expression rate (Z in {0,1}). Detection conditioned on expression is a 
52
+logistic function of gene-level features. The bernoulli model is modeled 
53
+numerically by a logistic model with an intercept.
54
+}
55
+\examples{
56
+mat <- matrix(rpois(1000, lambda = 3), ncol=10)
57
+mat = mat * matrix(1-rbinom(1000, size = 1, prob = .01), ncol=10)
58
+ziber_out = suppressWarnings(fast_estimate_ziber(mat,
59
+   bulk_model = TRUE,
60
+   pos_controls = 1:10))
61
+
62
+}