Browse code

Remove ... param in model functions

The ... param was allowing users to misspell key arguments to the
model parameters, essentially causing silent failures (see
to the celda() wrapper function, removed ... from all of the relevant
signatures, and updated the docs to reflect the new landscape.

Also updated Josh's copyright while I was at it.

Fixes #212 Fixes #211


Former-commit-id: 4bba23567a4672520f3dcea16781929b0bcd1ddd

Sean Corbett authored on 28/03/2018 16:45:13
Showing 10 changed files

... ...
@@ -12,7 +12,6 @@ Imports:
12 12
     plyr,
13 13
     dplyr,
14 14
     foreach,
15
-    gtools,
16 15
     ggplot2,
17 16
     plotly,
18 17
     RColorBrewer,
... ...
@@ -1,6 +1,6 @@
1 1
 MIT License
2 2
 
3
-Copyright (c) 2017 Joshua D Campbell
3
+Copyright (c) 2018 Joshua D Campbell
4 4
 
5 5
 Permission is hereby granted, free of charge, to any person obtaining a copy
6 6
 of this software and associated documentation files (the "Software"), to deal
... ...
@@ -10,27 +10,41 @@ available_models = c("celda_C", "celda_G", "celda_CG")
10 10
 #' @param counts A count matrix
11 11
 #' @param model Which celda sub-model to run. Options include "celda_C" (cell clustering), "celda_G" (gene clustering), "celda_CG" (gene and cell clustering)
12 12
 #' @param sample.label A numeric vector, character vector, or factor indicating the originating sample for each cell (column) in the count matrix. By default, every cell will be assumed to be from an independent sample.
13
-#' @param nchains The number of chains of Gibbs sampling to run for every combination of K/L parameters. Defaults to 1
13
+#' @param K Number of desired cell subpopulation clusters. Required for celda_C and celda_CG models.
14
+#' @param L Number of desired gene clusters. Required for celda_G and celda_CG models.
15
+#' @param alpha Non-zero concentration parameter for sample Dirichlet distribution (celda_C / celda_CG only)
16
+#' @param beta Non-zero concentration parameter for gene Dirichlet distribution
17
+#' @param gamma The Dirichlet distribution parameter for Psi; adds a pseudocount to each gene within each transcriptional state (celda_G / celda_CG only)
18
+#' @param delta The Dirichlet distribution parameter for Eta; adds a gene pseudocount to the numbers of genes each state (celda_G / celda_CG only)
19
+#' @param nchains The number of chains of Gibbs sampling to run for every combination of K/L parameters. Defaults to 1 (celda_G / celda_CG only)
14 20
 #' @param bestChainsOnly Return only the best chain (by final log-likelihood) per K/L combination.
15 21
 #' @param cores The number of cores to use for parallell Gibbs sampling. Defaults to 1.
16 22
 #' @param seed The base seed for random number generation
17 23
 #' @param verbose Print log messages during celda chain execution
18 24
 #' @param logfile_prefix Prefix for log files from worker threads and main process. 
19
-#' @param ... Model specific parameters
20 25
 #' @return Object of class "celda_list", which contains results for all model parameter combinations and summaries of the run parameters
21 26
 #' @import foreach
22 27
 #' @export
23
-celda = function(counts, model, sample.label=NULL, nchains=1, bestChainsOnly=TRUE, cores=1, seed=12345, verbose=FALSE, logfile_prefix="Celda", ...) {
28
+celda = function(counts, model, sample.label=NULL, K=NULL, L=NULL, alpha=1, beta=1, 
29
+                 delta=1, gamma=1, max.iter=200, z.init=NULL, y.init=NULL,
30
+                 stop.iter=10, split.on.iter=10, process.counts=FALSE, 
31
+                 nchains=1, bestChainsOnly=TRUE, cores=1, 
32
+                 seed=12345, verbose=FALSE, logfile_prefix="Celda") {
33
+ 
24 34
   message(paste(Sys.time(), "Starting celda."))
25
-  validateArgs(counts, model, sample.label, nchains, cores, seed, ...)
35
+  params.list = buildParamList(counts, model, sample.label, K, L, alpha, beta, delta,
36
+                               gamma, max.iter, z.init, y.init, stop.iter, split.on.iter,
37
+                               process.counts, nchains, cores, seed)
38
+  
26 39
   
27 40
   # Redirect stderr from the worker threads if user asks for verbose
28 41
   logfile = paste0(logfile_prefix, "_main_log.txt")
42
+  params.list$logfile = logfile
29 43
   cl = if (verbose) parallel::makeCluster(cores, outfile=logfile) else parallel::makeCluster(cores)
30 44
   doParallel::registerDoParallel(cl)
31 45
   
32 46
   # Details for each model parameter / chain combination 
33
-  runs = expand.grid(chain=1:nchains, ...)
47
+  runs = expand.grid(plyr::compact(list(chain=1:nchains, K=K, L=L)))
34 48
   runs$index = as.numeric(rownames(runs))
35 49
   if (verbose) print(runs)
36 50
   
... ...
@@ -42,20 +56,22 @@ celda = function(counts, model, sample.label=NULL, nchains=1, bestChainsOnly=TRU
42 56
   # count matrix was used.
43 57
   counts = processCounts(counts)
44 58
   count.checksum = digest::digest(counts, algo="md5")
59
+  params.list$count.checksum = count.checksum
60
+   
61
+   res.list = foreach(i = 1:nrow(runs), .export=model, .combine = c, .multicombine=TRUE) %dopar% {
62
+    chain.params = params.list
63
+    chain.params$seed = all.seeds[ifelse(i %% nchains == 0, nchains, i %% nchains)]
45 64
     
46
-  
47
-  res.list = foreach(i = 1:nrow(runs), .export=model, .combine = c, .multicombine=TRUE) %dopar% {
48
-    chain.seed = all.seeds[ifelse(i %% nchains == 0, nchains, i %% nchains)]
49
-    
50
-    if (verbose) {
65
+    if (isTRUE(verbose)) {
51 66
       ## Generate a unique log file name based on given prefix and parameters
52
-      logfile = paste0(logfile_prefix, "_", paste(paste(colnames(runs), runs[i,], sep="-"), collapse="_"), "_Seed-", chain.seed, "_log.txt")
53
-      res = do.call(model, c(list(counts=counts, sample.label=sample.label, count.checksum=count.checksum, seed=chain.seed, logfile=logfile, process.counts=F), c(runs[i,-1])))
67
+      chain.params$logfile = paste0(logfile_prefix, "_",  paste(paste(colnames(runs), runs[i,], sep="-"), collapse="_"),  "_Seed-", chain.params$seed, "_log.txt")
68
+      res = do.call(model, chain.params)
54 69
     } else {
55
-      res = suppressMessages(do.call(model, c(list(counts=counts, sample.label=sample.label, count.checksum=count.checksum, seed=chain.seed, logfile=NULL, process.counts=F), c(runs[i,-1]))))
70
+      chain.params$logfile = NULL
71
+      res = suppressMessages(do.call(model, chain.params))
56 72
     }
57 73
     return(list(res))
58
-  }  
74
+  }
59 75
   parallel::stopCluster(cl)
60 76
   celda.res = list(run.params=runs, res.list=res.list, 
61 77
                    content.type=model, count.checksum=count.checksum)
... ...
@@ -80,10 +96,43 @@ celda = function(counts, model, sample.label=NULL, nchains=1, bestChainsOnly=TRU
80 96
 }
81 97
 
82 98
 
99
+# Build a list of parameters tailored to the specific celda model being run,
100
+# validating the provided parameters along the way
101
+buildParamList = function(counts, model, sample.label, K, L, alpha, beta, delta,
102
+                          gamma, max.iter, z.init, y.init, stop.iter, split.on.iter,
103
+                          process.counts, nchains, cores, seed) {
104
+  
105
+  validateArgs(counts, model, sample.label, nchains, cores, seed, K=K, L=L)
106
+  
107
+  params.list = list(counts=counts,
108
+                     sample.label=sample.label,
109
+                     max.iter=max.iter,
110
+                     stop.iter=stop.iter,
111
+                     split.on.iter=split.on.iter,
112
+                     process.counts=process.counts)
113
+  
114
+  if (model %in% c("celda_C", "celda_CG")) {
115
+    params.list$alpha = alpha
116
+    params.list$beta = beta
117
+    params.list$K = K
118
+    z.init=z.init
119
+  } 
120
+  if (model %in% c("celda_G", "celda_CG")) {
121
+    params.list$beta = beta
122
+    params.list$delta = delta
123
+    params.list$gamma = gamma
124
+    params.list$L = L
125
+    y.init=y.init
126
+  }
127
+  
128
+  return(params.list)
129
+}
130
+
131
+
83 132
 # Sanity check arguments to celda() to ensure a smooth run.
84 133
 # See parameter descriptions from celda() documentation.
85 134
 validateArgs = function(counts, model, sample.label, 
86
-                         nchains, cores, seed, K=NULL, L=NULL, ...) {
135
+                         nchains, cores, seed, K=NULL, L=NULL) { #, ...) {
87 136
   model_args = names(formals(model))
88 137
   if ("K" %in% model_args && is.null(K)) {
89 138
     stop("Must provide a K parameter when running a celda_C or celda_CG model")
... ...
@@ -84,15 +84,13 @@ simulateCells.celda_C = function(model, S=10, C.Range=c(10, 100), N.Range=c(100,
84 84
 #' @param z.init Initial values of z. If NULL, z will be randomly sampled. Default NULL.
85 85
 #' @param process.counts Whether to cast the counts matrix to integer and round(). Defaults to TRUE.
86 86
 #' @param logfile If NULL, messages will be displayed as normal. If set to a file name, messages will be redirected messages to the file. Default NULL.
87
-#' @param ... additonal parameters
88 87
 #' @return An object of class celda_C with clustering results and Gibbs sampling statistics
89 88
 #' @export
90
-celda_C = function(counts, sample.label=NULL, K,
91
-					alpha=1, beta=1, 
92
-                   	stop.iter = 10, split.on.iter=10, max.iter=200, 
93
-                   	count.checksum=NULL, seed=12345,
94
-                   	z.init = NULL, process.counts=TRUE, logfile=NULL, ...) {
95
-
89
+celda_C = function(counts, sample.label=NULL, K, alpha=1, beta=1, 
90
+                 	 stop.iter = 10, split.on.iter=10, max.iter=200, 
91
+                 	 count.checksum=NULL, seed=12345,
92
+                 	 z.init = c(), process.counts=TRUE, logfile=NULL) {
93
+  
96 94
   ## Error checking and variable processing
97 95
   if (isTRUE(process.counts)) {
98 96
     counts = processCounts(counts)  
... ...
@@ -130,6 +128,7 @@ celda_C = function(counts, sample.label=NULL, K,
130 128
   iter = 1L
131 129
   num.iter.without.improvement = 0L
132 130
   do.cell.split = TRUE
131
+  logMessages(date(), "Max iter:", max.iter, logfile=logfile, append=TRUE)
133 132
   while(iter <= max.iter & num.iter.without.improvement <= stop.iter) {
134 133
     
135 134
     next.z = cC.calcGibbsProbZ(counts=counts, m.CP.by.S=m.CP.by.S, n.G.by.CP=n.G.by.CP, n.by.C=n.by.C, n.CP=n.CP, z=z, s=s, K=K, nG=nG, nM=nM, alpha=alpha, beta=beta)
... ...
@@ -323,13 +323,12 @@ simulateCells.celda_CG = function(model, S=10, C.Range=c(50,100), N.Range=c(500,
323 323
 #' @param y.init Initial values of y. If NULL, y will be randomly sampled. Default NULL.
324 324
 #' @param logfile The name of the logfile to redirect messages to.
325 325
 #' @param count.checksum An MD5 checksum for the provided counts matrix
326
-#' @param ... Additional parameters
327 326
 #' @export
328 327
 celda_CG = function(counts, sample.label=NULL, K, L,
329
-					alpha=1, beta=1, delta=1, gamma=1, 
328
+                    alpha=1, beta=1, delta=1, gamma=1, 
330 329
                     max.iter=200, stop.iter = 10, split.on.iter=10,
331 330
                     seed=12345, count.checksum=NULL,
332
-			        z.init = NULL, y.init = NULL, logfile=NULL, ...) {
331
+                    z.init = NULL, y.init = NULL, logfile=NULL) {
333 332
   
334 333
   ## Error checking and variable processing
335 334
   if (isTRUE(processCounts)) {
... ...
@@ -198,14 +198,12 @@ cG.calcGibbsProbY = function(counts.t, n.C.by.TS, n.by.TS, nG.by.TS, n.by.G, y,
198 198
 #' @param y.init Initial values of y. If NULL, y will be randomly sampled. Default NULL.
199 199
 #' @param process.counts Whether to cast the counts matrix to integer and round(). Defaults to TRUE.
200 200
 #' @param logfile The name of the logfile to redirect messages to.
201
-#' @param ...  Additional parameters
202 201
 #' @keywords LDA gene clustering gibbs
203 202
 #' @export
204
-celda_G = function(counts, L,
205
-					beta=1, delta=1, gamma=1,
203
+celda_G = function(counts, L, beta=1, delta=1, gamma=1,
206 204
 					stop.iter=10, split.on.iter=10, max.iter=200,
207
-                    count.checksum=NULL, seed=12345, 
208
-                    y.init=NULL, process.counts=TRUE, logfile=NULL, ...) {
205
+          count.checksum=NULL, seed=12345, 
206
+          y.init=NULL, process.counts=TRUE, logfile=NULL) {
209 207
 
210 208
   ## Error checking and variable processing
211 209
   if (isTRUE(process.counts)) {
... ...
@@ -4,9 +4,11 @@
4 4
 \alias{celda}
5 5
 \title{Run the celda Bayesian hierarchical model on a matrix of counts.}
6 6
 \usage{
7
-celda(counts, model, sample.label = NULL, nchains = 1,
8
-  bestChainsOnly = TRUE, cores = 1, seed = 12345, verbose = FALSE,
9
-  logfile_prefix = "Celda", ...)
7
+celda(counts, model, sample.label = NULL, K = NULL, L = NULL, alpha = 1,
8
+  beta = 1, delta = 1, gamma = 1, max.iter = 200, z.init = NULL,
9
+  y.init = NULL, stop.iter = 10, split.on.iter = 10,
10
+  process.counts = FALSE, nchains = 1, bestChainsOnly = TRUE, cores = 1,
11
+  seed = 12345, verbose = FALSE, logfile_prefix = "Celda")
10 12
 }
11 13
 \arguments{
12 14
 \item{counts}{A count matrix}
... ...
@@ -15,7 +17,19 @@ celda(counts, model, sample.label = NULL, nchains = 1,
15 17
 
16 18
 \item{sample.label}{A numeric vector, character vector, or factor indicating the originating sample for each cell (column) in the count matrix. By default, every cell will be assumed to be from an independent sample.}
17 19
 
18
-\item{nchains}{The number of chains of Gibbs sampling to run for every combination of K/L parameters. Defaults to 1}
20
+\item{K}{Number of desired cell subpopulation clusters. Required for celda_C and celda_CG models.}
21
+
22
+\item{L}{Number of desired gene clusters. Required for celda_G and celda_CG models.}
23
+
24
+\item{alpha}{Non-zero concentration parameter for sample Dirichlet distribution (celda_C / celda_CG only)}
25
+
26
+\item{beta}{Non-zero concentration parameter for gene Dirichlet distribution}
27
+
28
+\item{delta}{The Dirichlet distribution parameter for Eta; adds a gene pseudocount to the numbers of genes each state (celda_G / celda_CG only)}
29
+
30
+\item{gamma}{The Dirichlet distribution parameter for Psi; adds a pseudocount to each gene within each transcriptional state (celda_G / celda_CG only)}
31
+
32
+\item{nchains}{The number of chains of Gibbs sampling to run for every combination of K/L parameters. Defaults to 1 (celda_G / celda_CG only)}
19 33
 
20 34
 \item{bestChainsOnly}{Return only the best chain (by final log-likelihood) per K/L combination.}
21 35
 
... ...
@@ -26,8 +40,6 @@ celda(counts, model, sample.label = NULL, nchains = 1,
26 40
 \item{verbose}{Print log messages during celda chain execution}
27 41
 
28 42
 \item{logfile_prefix}{Prefix for log files from worker threads and main process.}
29
-
30
-\item{...}{Model specific parameters}
31 43
 }
32 44
 \value{
33 45
 Object of class "celda_list", which contains results for all model parameter combinations and summaries of the run parameters
... ...
@@ -6,8 +6,8 @@
6 6
 \usage{
7 7
 celda_C(counts, sample.label = NULL, K, alpha = 1, beta = 1,
8 8
   stop.iter = 10, split.on.iter = 10, max.iter = 200,
9
-  count.checksum = NULL, seed = 12345, z.init = NULL,
10
-  process.counts = TRUE, logfile = NULL, ...)
9
+  count.checksum = NULL, seed = 12345, z.init = c(),
10
+  process.counts = TRUE, logfile = NULL)
11 11
 }
12 12
 \arguments{
13 13
 \item{counts}{A numeric count matrix}
... ...
@@ -35,8 +35,6 @@ celda_C(counts, sample.label = NULL, K, alpha = 1, beta = 1,
35 35
 \item{process.counts}{Whether to cast the counts matrix to integer and round(). Defaults to TRUE.}
36 36
 
37 37
 \item{logfile}{If NULL, messages will be displayed as normal. If set to a file name, messages will be redirected messages to the file. Default NULL.}
38
-
39
-\item{...}{additonal parameters}
40 38
 }
41 39
 \value{
42 40
 An object of class celda_C with clustering results and Gibbs sampling statistics
... ...
@@ -7,7 +7,7 @@
7 7
 celda_CG(counts, sample.label = NULL, K, L, alpha = 1, beta = 1,
8 8
   delta = 1, gamma = 1, max.iter = 200, stop.iter = 10,
9 9
   split.on.iter = 10, seed = 12345, count.checksum = NULL,
10
-  z.init = NULL, y.init = NULL, logfile = NULL, ...)
10
+  z.init = NULL, y.init = NULL, logfile = NULL)
11 11
 }
12 12
 \arguments{
13 13
 \item{counts}{A numeric count matrix.}
... ...
@@ -41,8 +41,6 @@ celda_CG(counts, sample.label = NULL, K, L, alpha = 1, beta = 1,
41 41
 \item{y.init}{Initial values of y. If NULL, y will be randomly sampled. Default NULL.}
42 42
 
43 43
 \item{logfile}{The name of the logfile to redirect messages to.}
44
-
45
-\item{...}{Additional parameters}
46 44
 }
47 45
 \description{
48 46
 celda Cell and Gene Clustering Model
... ...
@@ -6,8 +6,7 @@
6 6
 \usage{
7 7
 celda_G(counts, L, beta = 1, delta = 1, gamma = 1, stop.iter = 10,
8 8
   split.on.iter = 10, max.iter = 200, count.checksum = NULL,
9
-  seed = 12345, y.init = NULL, process.counts = TRUE, logfile = NULL,
10
-  ...)
9
+  seed = 12345, y.init = NULL, process.counts = TRUE, logfile = NULL)
11 10
 }
12 11
 \arguments{
13 12
 \item{counts}{A numeric count matrix}
... ...
@@ -35,8 +34,6 @@ celda_G(counts, L, beta = 1, delta = 1, gamma = 1, stop.iter = 10,
35 34
 \item{process.counts}{Whether to cast the counts matrix to integer and round(). Defaults to TRUE.}
36 35
 
37 36
 \item{logfile}{The name of the logfile to redirect messages to.}
38
-
39
-\item{...}{Additional parameters}
40 37
 }
41 38
 \description{
42 39
 Provides cluster assignments for all genes in a provided single-cell