Browse code

Scone Wrapper Phase I

mbcole authored on 27/07/2016 16:40:13
Showing 5 changed files

... ...
@@ -24,11 +24,13 @@ Imports:
24 24
   fpc,
25 25
   gplots,
26 26
   grDevices,
27
+  hexbin,
27 28
   limma,
28 29
   MASS,
29 30
   matrixStats,
30 31
   mixtools,
31 32
   grDevices,
33
+  RColorBrewer,
32 34
   boot,
33 35
   shiny,
34 36
   miniUI,
... ...
@@ -21,12 +21,14 @@ export(lm_adjust)
21 21
 export(make_design)
22 22
 export(metric_sample_filter)
23 23
 export(scone)
24
+export(scone_wrapper)
24 25
 export(score_matrix)
25 26
 import(BiocParallel)
26 27
 import(gplots)
27 28
 importFrom(DESeq,estimateSizeFactorsForMatrix)
28 29
 importFrom(EDASeq,betweenLaneNormalization)
29 30
 importFrom(MASS,glm.nb)
31
+importFrom(RColorBrewer,brewer.pal)
30 32
 importFrom(RUVSeq,RUVg)
31 33
 importFrom(aroma.light,normalizeQuantileRank.matrix)
32 34
 importFrom(boot,inv.logit)
... ...
@@ -44,6 +46,8 @@ importFrom(graphics,hist)
44 46
 importFrom(graphics,par)
45 47
 importFrom(graphics,plot)
46 48
 importFrom(graphics,text)
49
+importFrom(hexbin,hexbin)
50
+importFrom(hexbin,plot)
47 51
 importFrom(limma,lmFit)
48 52
 importFrom(matrixStats,colIQRs)
49 53
 importFrom(matrixStats,colMedians)
50 54
deleted file mode 100644
... ...
@@ -1,420 +0,0 @@
1
-library(Biobase) #for AnnotatedDataFrame if scone is not loaded
2
-library(scone)
3
-library(RColorBrewer)
4
-library(boot) #for logit
5
-
6
-#A wrapper for filtering and running scone
7
-#currently supports only one-variable bio and batch
8
-scone_something = function(selected_eSet, housekeeping_list,
9
-                           batch, bio = NULL, de = NULL,
10
-                           adjust_bio = "no", adjust_batch = "yes", rezero = TRUE,
11
-                           out_dir = getwd(), stratified_pam = TRUE,
12
-                           selected_expression_in_eSet = "counts_table",
13
-                           results_image_filename = "sconeResults.Rda",
14
-                           plots_filename = "scone_plots.pdf") {
15
-
16
-printf <- function(...) cat(sprintf(...))
17
-set.seed(112233) ## reproducibility
18
-if (!file.exists(out_dir)) {
19
-  dir.create(out_dir, recursive=TRUE)
20
-}
21
-pdf(file.path(out_dir, plots_filename), onefile=TRUE)
22
-
23
-#keep only protein coding genes
24
-keepGenes = featureData(selected_eSet)$Transcript_Type == "protein_coding"
25
-selected_eSet = selected_eSet[keepGenes,]
26
-printf("Removed %d non-protein coding genes, %d protein-coding genes left\n", sum(!keepGenes), sum(keepGenes))
27
-
28
-
29
-#don't use exprs because we have several units (e.g., counts, tpm, fpkm) for each eSet
30
-#instead, dynamically access the assayData with the user's argument
31
-selected_unit_exprs = get(selected_expression_in_eSet, envir = assayData(selected_eSet))
32
-selected_unit_exprs[is.na(selected_unit_exprs)] = 0
33
-E = aggregate(selected_unit_exprs, by=list(featureData(selected_eSet)$Gene_Symbol),
34
-              FUN=max, na.rm=TRUE)
35
-rownames(E) = E[,1]
36
-E = E[, -1]
37
-E = data.matrix(E)
38
-E[E == -Inf] = 0 #cases in which the max had no non-NA option - should not happen because we turned NAs into 0's already
39
-stopifnot(all(is.finite(E)))
40
-printf("collapsed %d transcripts into %d genes by gene symbols\n", nrow(selected_unit_exprs), nrow(E))
41
-
42
-#now to remove duplicated features from the featuresData (we know that the same gene
43
-#symbol will have the same type, so no harm in just removing duplicate rows) and then
44
-#reordering it according to the new expression matrix
45
-featureDat = featureData(selected_eSet)
46
-featureDat = featureDat[!duplicated(featureDat$Gene_Symbol)]
47
-featureDat = featureDat[rownames(E), ]
48
-
49
-# Expression set object to make filtering easier
50
-chosen_units_eSet = new("ExpressionSet", exprs = E, featureData = featureDat, 
51
-                        protocolData = protocolData(selected_eSet), phenoData = phenoData(selected_eSet))
52
-
53
-# Remove failed samples:
54
-is.failed = apply(is.na(exprs(chosen_units_eSet)) | (exprs(chosen_units_eSet) == 0), 2, all) |
55
-  apply(is.na(pData(protocolData(chosen_units_eSet))), 1, all)
56
-chosen_units_eSet = chosen_units_eSet[,!is.failed ]
57
-printf("Removed %d failed samples\n", sum(is.failed))
58
-
59
-
60
-## ----- Pre-Filtering of Transcripts: ----
61
-# Select only coding transcripts and TCR segments
62
-is.expressed.sc = rowMeans(exprs(chosen_units_eSet)) > 0
63
-chosen_units_eSet = chosen_units_eSet[is.expressed.sc, ]
64
-printf("Removed %d undetected genes, %d left\n",sum(!is.expressed.sc), sum(is.expressed.sc))
65
-
66
-
67
-# Params for gene filtering
68
-# turns out that the filtering of the Th17 dataset was pretty robust to the choice of the quantile
69
-thresh_fail = quantile(exprs(chosen_units_eSet)[exprs(chosen_units_eSet) > 0], 0.2) #10
70
-num_fail = 10
71
-
72
-# Initial Gene Filtering
73
-init.gf.vec = rowSums(exprs(chosen_units_eSet) > thresh_fail) > num_fail
74
-chosen_units_eSet = chosen_units_eSet[init.gf.vec, ]
75
-printf("Kept only %d genes expressed in more than %.2f units in more than %d cells , excluded %d genes\n", sum(init.gf.vec), thresh_fail, num_fail, sum(!init.gf.vec))
76
-
77
-
78
-
79
-
80
-
81
-
82
-#The counts data frame contains feature-level read counts from tophat alignments of 96 single-cell libraries to the mm10 reference genome [@trapnell2009]. qc contains library alignment metrics obtained from Picard.
83
-#de and hk are positive and negative control gene sets derived from population studies. batch and bio are factors labeling batch and time point respectively. Consider the joint distribution of these factors:
84
-counts = exprs(chosen_units_eSet)
85
-qc = pData(protocolData(chosen_units_eSet))
86
-
87
-#prevent casing problems
88
-rownames(counts) = sapply(rownames(counts), toupper)
89
-housekeeping_genes = sapply(as.vector(as.matrix(read.table(housekeeping_list, stringsAsFactors = FALSE))), toupper)
90
-hk = intersect(housekeeping_genes, rownames(counts))
91
-
92
-
93
-
94
-#####################################
95
-#start to do real work
96
-#####################################
97
-unnecessaryQC_metrics = apply(is.na(qc), 2, any) | apply(qc, 2, sd) < 1e-5
98
-printf("remove %d QC metrics (%s) that either contained NAs or were constant across cells\n", sum(unnecessaryQC_metrics), paste(colnames(qc)[unnecessaryQC_metrics], collapse = "; "))
99
-qc = qc[, !unnecessaryQC_metrics]
100
-
101
-colnames(qc)
102
-
103
-## Joint distribution of batches and biological conditions (time after induction)
104
-if(!is.null(bio)) {
105
-  table(batch,bio)
106
-}
107
-
108
-# Color scheme
109
-cc <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"),brewer.pal(9,"Set3"))
110
-
111
-# Barplot of read proportion mapping to the genome
112
-barplot(qc$RALIGN[order(batch)], col=cc[batch][order(batch)], border=cc[batch][order(batch)], main="Percentage of mapped reads, colored by batch")
113
-legend("bottomleft", legend=levels(batch), fill=cc,cex=0.4)
114
-
115
-# Barplot of total read number
116
-barplot(qc$NREADS[order(batch)], col=cc[batch][order(batch)], border=cc[batch][order(batch)], main="Total number of reads, colored by batch")
117
-legend("topright", legend=levels(batch), fill=cc, cex=0.4)
118
-
119
-qpc = prcomp(qc, center = TRUE, scale = TRUE)
120
-barplot(cumsum(qpc$sdev^2)/sum(qpc$sdev^2), border="gray", xlab="PC", ylab="proportion of variance", main="Quality PCA")
121
-
122
-barplot(qpc$x[,1][order(batch)], col=cc[batch][order(batch)], border=cc[batch][order(batch)], main="Quality PC1, colored by batch")
123
-legend("bottomleft", legend=levels(batch), fill=cc, cex=0.8)
124
-
125
-
126
-# Mean log10(x+1) expression
127
-mu_obs = rowMeans(log10(counts[hk,]+1)) #0 if hk == NULL
128
-
129
-# Drop-outs
130
-drop_outs = counts[hk,] == 0
131
-
132
-# Logistic Regression Model of Failure
133
-logistic_coef = matrix(0,ncol(drop_outs),2)
134
-for (si in seq_len(ncol(drop_outs))){
135
-  fit = suppressWarnings(glm(cbind(drop_outs[,si],1 - drop_outs[,si]) ~ mu_obs,family=binomial(logit)))
136
-  if(fit$converged){
137
-    logistic_coef[si,] = fit$coefficients
138
-  } else {
139
-    logistic_coef[si,1] = logit(drop_rate[si])
140
-  }
141
-}
142
-
143
-
144
-par(mfrow=c(1,2))
145
-
146
-# Plot Failure Curves and Calculate AUC
147
-plot(NULL, main = "False Negative Rate Curves",ylim = c(0,1),xlim = c(0,6), ylab = "Failure Probability", xlab = "Mean log10 Expression")
148
-x = (0:60)/10
149
-AUC = NULL
150
-for(si in 1:ncol(counts)){
151
-  y = 1/(exp(-logistic_coef[si,1] - logistic_coef[si,2] * x) + 1)
152
-  AUC[si] = sum(y)/10
153
-  lines(x, 1/(exp(-logistic_coef[si,1] - logistic_coef[si,2] * x) + 1), type = 'l', lwd = 2, col = cc[batch][si])
154
-}
155
-
156
-# Barplot of FNR AUC
157
-barplot(AUC[order(batch)], col=cc[batch][order(batch)], border=cc[batch][order(batch)], main="FNR AUC, colored by batch")
158
-legend("topleft", legend=levels(batch), fill=cc, cex=0.4)
159
-
160
-mfilt_report <- metric_sample_filter(expr = counts,
161
-                                     nreads = qc$NREADS,ralign = qc$RALIGN,
162
-                                     suff_nreads = 10^5,
163
-                                     suff_ralign = 90,
164
-                                     pos_controls = rownames(counts) %in% hk,
165
-                                     zcut = 3,mixture = FALSE, plot = TRUE)
166
-
167
-
168
-hist(qc$RALIGN, breaks = 0:100)
169
-# Hard threshold
170
-abline(v = 15, col = "yellow", lwd = 2) 
171
-# 3 (zcut) standard deviations below the mean ralign value
172
-abline(v = mean(qc$RALIGN) - 3*sd(qc$RALIGN), col = "green", lwd = 2) 
173
-# 3 (zcut) MADs below the median ralign value
174
-abline(v = median(qc$RALIGN) - 3*mad(qc$RALIGN), col = "red", lwd = 2)
175
-# Sufficient threshold
176
-abline(v = 90, col = "grey", lwd = 2)
177
-# Final threshold is the minimum of 1) the sufficient threshold and 2) the max of all others
178
-thresh = min(90,max(c(15,mean(qc$RALIGN) - 3*sd(qc$RALIGN),median(qc$RALIGN) - 3*mad(qc$RALIGN))))
179
-abline(v = thresh, col = "blue", lwd = 2, lty = 2)
180
-
181
-legend("topleft",legend = c("Hard","SD","MAD","Sufficient","Final"),lwd = 2, col = c("yellow","green","red","grey","blue"),lty = c(1,1,1,1,2), cex = .5)
182
-
183
-
184
-# Which thresholds are missing? (breadth)
185
-is_na_filt = unlist(lapply(is.na(mfilt_report),any)) 
186
-
187
-# Identify samples failing any threshold
188
-m_sampfilter = !apply(simplify2array(mfilt_report[!is_na_filt]),1,any)
189
-
190
-# Filter Samples
191
-fcounts = counts[,m_sampfilter]
192
-fqc = qc[m_sampfilter,]
193
-fbatch = batch[m_sampfilter]
194
-fbio = bio[m_sampfilter] #if bio==NULL then we get from this line fbio=NULL
195
-
196
-# # Simple gene filter --> redundant with what I have above
197
-# filterCount <- function(counts, nRead=5, nCell=5){
198
-#   filter <- apply(counts, 1, function(x) length(x[x>=nRead])>=nCell)
199
-#   return(filter)
200
-# }
201
-# genefilter <- filterCount(fcounts)
202
-# fcounts = fcounts[genefilter,]
203
-
204
-# skip filtering genes again
205
-fcounts = counts
206
-fhk = hk[hk %in% rownames(fcounts)]
207
-fde = de[de %in% rownames(fcounts)] #if de==NULL then we get from this line fde=NULL
208
-
209
-
210
-#FNR
211
-#Michael: I see you recommend bulk_model=true even though the default is false? Why?
212
-#I'm not clear on what this parameter does - what other gene features are possible if gFeatM is not specified?
213
-fnr_out = estimate_ziber(x = fcounts, bulk_model = TRUE,
214
-                         pos_controls = rownames(fcounts) %in% fhk,
215
-                         verbose = TRUE, maxiter = 1000)
216
-
217
-
218
-SUM_FN = function (ei) 
219
-{
220
-  sums = colSums(ei)
221
-  eo = t(t(ei)*mean(sums)/sums)
222
-  return(eo)
223
-}
224
-
225
-MED_FN_POS = function (ei) 
226
-{
227
-  zei = ei
228
-  zei[ei == 0] = NA
229
-  q = apply(zei, 2, quantile, 0.5, na.rm = TRUE)
230
-  zeo = t(t(zei)/q) * mean(q)
231
-  eo = zeo
232
-  eo[ei == 0] = 0
233
-  return(eo)
234
-}
235
-
236
-EFF_FN = function (ei) 
237
-{
238
-  sums = colSums(ei > 0)
239
-  eo = t(t(ei)*sums/mean(sums))
240
-  return(eo)
241
-}
242
-
243
-imputation = list(none=impute_null,expect=impute_expectation)
244
-impute_args = list(p_nodrop = fnr_out$p_nodrop, mu = exp(fnr_out$Alpha[1,]))
245
-
246
-scaling = list(none=identity,
247
-             sum = SUM_FN,
248
-             eff = EFF_FN,
249
-             tmm = TMM_FN,
250
-             uq = UQ_FN,
251
-             uqp = UQ_FN_POS,
252
-             fq = FQT_FN,
253
-             fqp = FQ_FN_POS,
254
-             deseq=DESEQ_FN,
255
-             deseqp=DESEQ_FN_POS)
256
-
257
-
258
-
259
-# Generate Params (RUN = FALSE)
260
-#Michael: in your example qc=ppq which is ppq = scale(qc,center = TRUE,scale = TRUE)
261
-#but I see in scone's code that it expects QC matrix and does the prcomp itself
262
-params <- scone(expr = as.matrix(fcounts), scaling = scaling,
263
-                imputation = imputation, impute_args = impute_args,
264
-                ruv_negcon = fhk, k_ruv = 3,
265
-                qc = as.matrix(fqc), k_qc = 3,
266
-                bio = fbio, adjust_bio = adjust_bio,
267
-                batch = fbatch, adjust_batch = adjust_batch,
268
-                run = FALSE)
269
-head(params)
270
-
271
-apply(params,2,unique)
272
-
273
-#exclide batch adjustment and no adjustment by bio and similar not-making-sense scenarios
274
-is_screened = ((params$imputation_method == "none") & (params$scaling_method %in% c("deseq","uq","tmm"))) | 
275
-  ((params$imputation_method == "expect") & (params$scaling_method %in% c("deseqp","uqp","fqp","eff"))) |
276
-  ((params$adjust_biology == "bio") & (params$adjust_batch != "batch"))
277
-
278
-params = params[!is_screened,]
279
-
280
-##NO MORE PLOTS!
281
-dev.off()
282
-
283
-# Generate Scores and Ranking
284
-tic = proc.time()
285
-res = scone(expr = as.matrix(fcounts), scaling = scaling,
286
-            imputation = imputation, impute_args = impute_args,
287
-            ruv_negcon = fhk, k_ruv = 3,
288
-            qc = as.matrix(fqc), k_qc = 3,
289
-            bio = fbio, adjust_bio = adjust_bio,
290
-            batch = fbatch, adjust_batch = adjust_batch,
291
-            run = TRUE, params = params, verbose = TRUE,
292
-            eval_poscon = fde, eval_kclust = 2:3, stratified_pam = stratified_pam,
293
-            rezero = rezero, return_norm = "in_memory")
294
-toc = proc.time()
295
-
296
-printf("time elapsed:\n")
297
-print(toc-tic)
298
-
299
-save.image(file.path(out_dir, results_image_filename))
300
-
301
-
302
-printf("Scone finished successfully and image saved")
303
-
304
-}
305
-
306
-
307
-#code to run this method and test it
308
-if(FALSE) {
309
-  rm(list=ls())
310
-  #library(bioc2016singlecell)
311
-  set.seed(112233) ## for reproducibility
312
-  printf <- function(...) cat(sprintf(...))
313
-  
314
-  library(Biobase) #for AnnotatedDataFrame if scone is not loaded
315
-  library(scone)
316
-  library(RColorBrewer)
317
-  
318
-  BiocParallel::register(BiocParallel::SerialParam())
319
-  
320
-  setwd("/data/yosef2/Published_Data/TH17/code")
321
-  source("scone_something.R")
322
-  
323
-  load("/data/yosef2/Published_Data/TH17/code/combined_th17_metadata.Rdata")
324
-  housekeeping_list = "/data/yosef/CD8_effector_diff/src/SummaryPipeline/house_keeping_mouse_TitleCase.txt"
325
-  out_dir = "/data/yosef2/Published_Data/TH17processed_aw20160712/scone/only_T16"
326
-  
327
-  #only T16 from the non-GFP mouse
328
-  keep = combinedMetadata$michael_condition == "TGFB1_IL6-48h" &
329
-    !combinedMetadata$is_population
330
-  
331
-  combinedMetadata = combinedMetadata[keep,]
332
-  
333
-  #tophaht + featureCounts
334
-  collect_dir="/data/yosef2/Published_Data/TH17/processed_aw20160712/collect"
335
-  load(file.path(collect_dir, "collectedRNASeqStudy.RData"))
336
-  selected_eSet = collectedRNASeqStudy$cuff_eSet
337
-  #featureNames(selected_eSet) = toupper(featureNames(selected_eSet))
338
-  
339
-  selected_eSet = selected_eSet[, combinedMetadata$SRR]
340
-  batch = combinedMetadata$michael_batch
341
-  scone_something(selected_eSet, housekeeping_list = housekeeping_list,
342
-                  selected_expression_in_eSet = "counts_table",
343
-                  batch = batch, bio = NULL, de = NULL,
344
-                  adjust_bio = "no", adjust_batch = "yes", rezero = TRUE,
345
-                  out_dir = getwd(), stratified_pam = TRUE,
346
-                  results_image_filename = "sconeResults_tophatFeatureCounts.Rda",
347
-                  plots_filename = "scone_plot_tophatFeatureCounts.pdf")
348
-  
349
-  stop("finished")
350
-  
351
-}
352
-
353
-
354
-#code to inspect scone results
355
-if(FALSE) {
356
-  rm(list=ls())
357
-  set.seed(4455)
358
-  out_dir = "/home/eecs/allonwag/data2/TFH/processed_20160628/scone"
359
-  load(file.path(out_dir, "sconeResults.Rda"))
360
-  
361
-  
362
-  names(res)
363
-  
364
-  if(is.null(bio)) {
365
-    #no positive control genes (de) so no evaluation of positive controls
366
-    BIO_DEPENDENT_EVALUATION_METRICS = c("BIO_SIL", "EXP_WV_COR")
367
-    res$scores = res$scores[, !(colnames(res$scores) %in% BIO_DEPENDENT_EVALUATION_METRICS)]
368
-    res$metrics = res$metrics[, !(colnames(res$metrics) %in% BIO_DEPENDENT_EVALUATION_METRICS)]
369
-  }
370
-  
371
-  head(res$scores)
372
-  
373
-  pc_obj = prcomp(res$scores[,-ncol(res$scores)],center = TRUE,scale = FALSE)
374
-  bp_obj = biplot_colored(pc_obj,y = -res$scores[,ncol(res$scores)],expand = .6)
375
-  
376
-  #library(scone)
377
-  #if I don't have this library installed
378
-  source("biplot.R")
379
-  bp_obj = biplot_colored(pc_obj,y = -res$scores[,ncol(res$scores)],expand = .6)
380
-  
381
-  points(bp_obj[grepl(",bio,batch",rownames(bp_obj)),], pch = 1, col = "red", cex = 1)
382
-  points(bp_obj[grepl(",bio,batch",rownames(bp_obj)),], pch = 1, col = "red", cex = 1.5)
383
-  
384
-  
385
-  bp_obj = biplot_colored(pc_obj,y = -res$scores[,ncol(res$scores)],expand = .6)
386
-  
387
-  points(bp_obj[grepl(",no_bio,batch",rownames(bp_obj)),], pch = 1, col = "red", cex = 1)
388
-  points(bp_obj[grepl(",no_bio,batch",rownames(bp_obj)),], pch = 1, col = "red", cex = 1.5)
389
-  
390
-  #my addition to scone's tutorial
391
-  bp_obj = biplot_colored(pc_obj,y = -res$scores[,ncol(res$scores)],expand = .6)
392
-  
393
-  points(bp_obj[grepl(",fq,",rownames(bp_obj)),], pch = 1, col = "green", cex = 1.5)
394
-  points(bp_obj[grepl(",tmm,",rownames(bp_obj)),], pch = 1, col = "cyan", cex = 1.5)
395
-  points(bp_obj[grepl(",deseq,",rownames(bp_obj)),], pch = 1, col = "magenta", cex = 1.5)
396
-  points(bp_obj[grepl(",uqp,",rownames(bp_obj)),], pch = 1, col = "black", cex = 1.5)
397
-  #end my addition
398
-  
399
-  bp_obj = biplot_colored(pc_obj,y = -res$scores[,ncol(res$scores)],expand = .6)
400
-  
401
-  points(t(bp_obj[1,]), pch = 1, col = "red", cex = 1)
402
-  points(t(bp_obj[1,]), pch = 1, col = "red", cex = 1.5)
403
-  
404
-  points(t(bp_obj[rownames(bp_obj) == rownames(params)[1],]), pch = 1, col = "blue", cex = 1)
405
-  points(t(bp_obj[rownames(bp_obj) == rownames(params)[1],]), pch = 1, col = "blue", cex = 1.5)
406
-  
407
-  arrows(bp_obj[rownames(bp_obj) == rownames(params)[1],][1],
408
-         bp_obj[rownames(bp_obj) == rownames(params)[1],][2],
409
-         bp_obj[1,][1],
410
-         bp_obj[1,][2],
411
-         lty = 2, lwd = 2)
412
-  
413
-  printf("(Trumpets): the selected method is %s", rownames(res$scores)[1])
414
-  normalized_expression = res$normalized_data[[1]]
415
-  selected_normalization_method = names(res$normalized_data)[1]
416
-  save(normalized_expression, selected_normalization_method, file=file.path(out_dir, "sconeNormalizedExpression.Rda"))
417
-  
418
-  sessionInfo()
419
-  
420
-}
421 0
new file mode 100644
... ...
@@ -0,0 +1,236 @@
1
+#' Wrapper for Running Essential SCONE Modules
2
+#'
3
+#' @param expr matrix. The expression data matrix (genes in rows, cells in columns).
4
+#' @param qc matrix. The quality control (QC) matrix (cells in rows, metrics in columns)
5
+#'   to be used for filtering, normalization, and evaluation.
6
+#' @param bio factor. The biological condition to be modeled in the Adjustment Step
7
+#'   as variation to be preserved. If adjust_bio="no", it will not be used
8
+#'   for normalization, but only for evaluation.
9
+#' @param batch factor. The known batch variable to be included in the
10
+#'   adjustment model as variation to be removed. If adjust_batch="no", it will
11
+#'   not be used for normalization, but only for evaluation.
12
+#' @param negcon character. The genes to be used as negative controls for filtering, 
13
+#'   normalization, and evaluation. These genes should be expressed uniformily across
14
+#'   the biological phenomenon of interest.
15
+#'   Default NULL.
16
+#' @param verbose character. Verbosity level.
17
+#'   Default "0".
18
+#' @param out_dir character. Output directory. 
19
+#'   Default getwd().
20
+#' @param seed numeric. Random seed. 
21
+#'   Default 112233.
22
+#'   
23
+#' @param filt_genes logical. Should genes be filtered post-sample filtering?
24
+#'   Default TRUE.
25
+#'   
26
+#' @param fnr_maxiter numeric. Maximum number of iterations in EM estimation of expression posteriors.
27
+#'    Dafault 1000.
28
+#'  
29
+#' @param norm_impute character. Should imputation be included in the comparison? 
30
+#'   If 'force', only imputed normalizations will be run.
31
+#'   Default "yes."
32
+#' @param norm_scaling character. Scaling options to be included in the Scaling Step. 
33
+#'   Default c("sum", "deseq", "tmm", "uq", "fq"). See details.
34
+#' @param norm_rezero logical. Restore imputed zeroes to zero following the Scaling Step?
35
+#'   Default TRUE.
36
+#' @param norm_k_max numeric. Maximum number (norm_k_max) of factors of unwanted variation modeled
37
+#'   in the Adjustment Step.
38
+#'   Default NULL.
39
+#' @param norm_qc_expl numeric. In automatic selection of norm_k_max, what
40
+#'   fraction of variation must be explained by the first norm_k_max PCs of qc? 
41
+#'   Default 0.5. Ignored if norm_k_max is not NULL.
42
+#' @param norm_adjust_bio character. If 'no' it will not be included in the model; if
43
+#'   'yes', both models with and without 'bio' will be run; if 'force', only
44
+#'   models with 'bio' will be run.
45
+#'   Default "yes."
46
+#' @param norm_adjust_batch character. If 'no' it will not be modeled in the Adjustment Step;
47
+#'   if 'yes', both models with and without 'batch' will be run; if 'force',
48
+#'   only models with 'batch' will be run.
49
+#'   Default "yes."
50
+#'   
51
+#' @param eval_dim numeric. The number of principal components to use for evaluation.
52
+#'   Default NULL.
53
+#' @param eval_expr_expl numeric. In automatic selection of eval_dim, what
54
+#'   fraction of variation must be explained by the first eval_dim PCs of expr? 
55
+#'   Default 0.2. Ignored if eval_dim is not NULL.
56
+#' @param eval_poscon character. The genes to be used as positive controls for
57
+#'   evaluation. These genes should be expected to change according to the
58
+#'   biological phenomenon of interest.
59
+#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam
60
+#'   tightness evaluation. If an array of integers, largest average silhouette
61
+#'   width (tightness) will be reported. If NULL, tightness will be returned NA.
62
+#' @param eval_stratified_pam logical. If TRUE then maximum ASW for PAM_SIL is
63
+#'   separately computed for each biological-cross-batch condition (accepting
64
+#'   NAs), and a weighted average is returned as PAM_SIL.
65
+#'   Default TRUE.
66
+#'
67
+#' @export
68
+#' @importFrom RColorBrewer brewer.pal
69
+#' @importFrom boot inv.logit
70
+#' @importFrom hexbin plot hexbin
71
+#'
72
+#' @details "ADD DESCRIPTION"
73
+#'
74
+#' @return Directory structure "ADD DESCRIPTION"
75
+#'
76
+
77
+scone_wrapper <- function(expr, qc,
78
+                  bio=NULL, batch=NULL, negcon=NULL,
79
+                  verbose=c("0","1","2"), out_dir=getwd(), seed = 112233,
80
+                  
81
+                  filt_genes=TRUE,
82
+                  
83
+                  fnr_maxiter=1000,
84
+                  
85
+                  norm_impute=c("yes", "no", "force"),
86
+                  norm_scaling=c("sum", "deseq", "tmm", "uq", "fq"),
87
+                  norm_rezero=TRUE, norm_k_max=NULL, norm_qc_expl=0.5,
88
+                  norm_adjust_bio=c("yes", "no", "force"),
89
+                  norm_adjust_batch=c("yes", "no", "force"), 
90
+                  
91
+                  eval_dim=NULL, eval_expr_expl=0.2,
92
+                  eval_poscon=NULL, eval_kclust = 2:10, eval_stratified_pam = TRUE) {
93
+
94
+  verbose = match.arg(verbose)
95
+  verbose = as.numeric(verbose)
96
+  
97
+  printf <- function(...) cat(sprintf(...))
98
+  set.seed(seed)
99
+  
100
+  if(!is.matrix(expr)){stop("expr must be a matrix")}
101
+  
102
+  # Primary Directory
103
+  if (!file.exists(out_dir)) {
104
+    dir.create(out_dir, recursive=TRUE)
105
+  }
106
+  
107
+  # Misc Directory
108
+  misc_dir = paste0(out_dir,"/misc")
109
+  if (!file.exists(misc_dir)) {
110
+    dir.create(misc_dir)
111
+  }
112
+  
113
+  ## ------ Data Filtering Module ------
114
+  
115
+  # Initial Gene Filtering: Select "common" transcripts.
116
+  thresh_fail = quantile(expr[expr > 0], 0.2) ###Make argument###
117
+  num_fail = 10 ###Make argument###
118
+  init.gf.vec = rowSums(expr > thresh_fail) > num_fail
119
+  if(verbose > 0){printf("Data Filtering Module: Initial filter - Kept only %d genes expressed in more than %.2f units in more than %d cells, excluded %d genes\n", sum(init.gf.vec), thresh_fail, num_fail, sum(!init.gf.vec))}
120
+  
121
+  # Initial-Filtering Negative Controls
122
+  small_negcon = intersect(negcon,rownames(expr)[init.gf.vec])
123
+  if(verbose > 0){printf("Data Filtering Module: %d negative control genes to be used for sample filtering\n", length(small_negcon))}
124
+  
125
+  # Metric-based Filtering and Report
126
+  pdf(paste0(misc_dir,"/filter_report.pdf"))
127
+  mfilt = metric_sample_filter(expr, mixture = TRUE, plot = TRUE, hist_breaks = 100,
128
+                               zcut = 4, ###Make argument###
129
+                               pos_controls = rownames(ct) %in% small_negcon,
130
+                               gene_filter = init.gf.vec,
131
+                               nreads = qc$NREADS,ralign = qc$RALIGN) ###Make argument###
132
+  dev.off()
133
+  mfilt = !apply(simplify2array(mfilt[!is.na(mfilt)]),1,any)
134
+  if(verbose > 0){printf("Data Filtering Module: %d samples to be used in downstream analysis, excluded %d samples\n", sum(mfilt),  sum(!mfilt))}
135
+  
136
+  # Implement sample filter
137
+  expr = expr[,mfilt]
138
+  qc = qc[mfilt,]
139
+  batch = droplevels(batch[mfilt])
140
+  bio = droplevels(bio[mfilt])
141
+
142
+  if(filt_genes){
143
+    thresh_fail = quantile(expr[expr > 0], 0.2) ###Make argument###
144
+    num_fail = 10 ###Make argument###
145
+    final.gf.vec = rowSums(expr > thresh_fail) > num_fail
146
+    if(verbose > 0){printf("Data Filtering Module: Kept only %d genes expressed in more than %.2f units in more than %d cells, excluded %d genes\n", sum(final.gf.vec), thresh_fail, num_fail, sum(!final.gf.vec))}
147
+  
148
+    # Implement gene filter
149
+    expr = expr[final.gf.vec,]
150
+    if(!is.null(negcon)){
151
+      negcon = negcon[negcon %in% rownames(expr)]
152
+      if(verbose > 0){printf("Data Filtering Module: Kept only %d negative control genes\n", length(negcon))}
153
+    }
154
+    if(!is.null(eval_poscon)){
155
+      eval_poscon = eval_poscon[eval_poscon %in% rownames(expr)]
156
+      if(verbose > 0){printf("Data Filtering Module: Kept only %d positive control genes\n", length(eval_poscon))}
157
+    }
158
+  }
159
+  
160
+  if(verbose > 0){printf("Data Filtering Module: COMPLETE\n\n")}
161
+  ## ------ False-Negative Rate Inference Module ------
162
+  
163
+  if(verbose > 0){printf("False-Negative Rate Inference Module: Estimating posterior expression probability...\n")}
164
+  fnr_out = estimate_ziber(x = expr, 
165
+                           bulk_model = TRUE, ###Make argument###
166
+                           pos_controls = rownames(expr) %in% hk,
167
+                           verbose = (verbose > 1), 
168
+                           maxiter = fnr_maxiter)
169
+  if(fnr_out$convergence == 1){printf("False-Negative Rate Inference Module: WARNING - EM did not converge. Try increasing fnr_max_iter\n")}
170
+  if(verbose > 0){printf("False-Negative Rate Inference Module: Producing report...\n")}
171
+  
172
+  # FNR Report
173
+  cc <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"),brewer.pal(9,"Set3"))
174
+  
175
+  logistic_coef = simple_FNR_params(expr = expr, pos_controls = rownames(expr) %in% negcon) # "Non-DE" are expected to be expressed in all cells
176
+  
177
+  pdf(paste0(misc_dir,"/fnr_report.pdf"),width = 12,height = 6)
178
+  par(mfrow=c(1,2))
179
+  
180
+  plot(NULL, main = "False Negative Rate Curves",ylim = c(0,1),xlim = c(0,15), ylab = "Failure Probability", xlab = "Median log-Expression")
181
+  x = (0:150)/10
182
+  AUC = NULL
183
+  for(si in 1:ncol(expr)){
184
+    y = inv.logit(logistic_coef[si,1] + logistic_coef[si,2] * x)
185
+    AUC[si] = sum(y)/10
186
+    if(!is.null(batch)){
187
+      colchoice = cc[batch][si]
188
+    }else{
189
+      colchoice = "black"
190
+    }
191
+    lines(x, y, type = 'l', lwd = 2, col = colchoice)
192
+  }
193
+  
194
+  # Barplot of FNR AUC
195
+  if(!is.null(batch)){
196
+    oAUC = order(AUC)
197
+    sAUC = AUC[oAUC]
198
+    sbatch = batch[oAUC]
199
+    barplot(sAUC[order(sbatch)], col=cc[sbatch][order(sbatch)], border=cc[sbatch][order(sbatch)], main="FNR AUC, colored by batch", ylim = c(0,2*max(AUC)))
200
+    legend("topleft", legend=levels(sbatch), fill=cc, cex=0.4)
201
+  }else{
202
+    barplot(sort(AUC), col="black", border="black", main="FNR AUC",ylim = c(0,2*max(AUC)))
203
+  }
204
+  
205
+  hexbin::plot(hexbin(rowMeans(expr > 0)[!is.na(fnr_out$Beta)],inv.logit(fnr_out$Beta[!is.na(fnr_out$Beta)])), xlab = "Detection Rate", ylab = "Expression Rate")
206
+  hexbin::plot(hexbin(fnr_out$Alpha[1,][!is.na(fnr_out$Beta)],inv.logit(fnr_out$Beta[!is.na(fnr_out$Beta)])), ylab = "Expression Rate", xlab = "Median log-Expression")
207
+  
208
+  if(verbose > 0){printf("False-Negative Rate Inference Module: COMPLETE\n\n")}
209
+  
210
+  dev.off()
211
+  
212
+  ui <- fluidPage(
213
+    h1("Example app"),
214
+    sidebarLayout(
215
+      sidebarPanel(
216
+        actionButton("qtBtn", "Quit")
217
+      ),
218
+      mainPanel(
219
+      )
220
+    )
221
+  )
222
+
223
+  server <- function(input, output, session) {
224
+    
225
+    observeEvent(input$qtBtn,{
226
+      if(input$qtBtn > 0){
227
+        stopApp("hello world")
228
+      }
229
+    })
230
+  }
231
+  
232
+  print(runApp(list(ui = ui, server = server)))
233
+
234
+  sessionInfo()
235
+  
236
+}
0 237
\ No newline at end of file
1 238
new file mode 100644
... ...
@@ -0,0 +1,107 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/scone_wrap.R
3
+\name{scone_wrapper}
4
+\alias{scone_wrapper}
5
+\title{Wrapper for Running Essential SCONE Modules}
6
+\usage{
7
+scone_wrapper(expr, qc, bio = NULL, batch = NULL, negcon = NULL,
8
+  verbose = c("0", "1", "2"), out_dir = getwd(), seed = 112233,
9
+  filt_genes = TRUE, fnr_maxiter = 1000, norm_impute = c("yes", "no",
10
+  "force"), norm_scaling = c("sum", "deseq", "tmm", "uq", "fq"),
11
+  norm_rezero = TRUE, norm_k_max = NULL, norm_qc_expl = 0.5,
12
+  norm_adjust_bio = c("yes", "no", "force"), norm_adjust_batch = c("yes",
13
+  "no", "force"), eval_dim = NULL, eval_expr_expl = 0.2,
14
+  eval_poscon = NULL, eval_kclust = 2:10, eval_stratified_pam = TRUE)
15
+}
16
+\arguments{
17
+\item{expr}{matrix. The expression data matrix (genes in rows, cells in columns).}
18
+
19
+\item{qc}{matrix. The quality control (QC) matrix (cells in rows, metrics in columns)
20
+to be used for filtering, normalization, and evaluation.}
21
+
22
+\item{bio}{factor. The biological condition to be modeled in the Adjustment Step
23
+as variation to be preserved. If adjust_bio="no", it will not be used
24
+for normalization, but only for evaluation.}
25
+
26
+\item{batch}{factor. The known batch variable to be included in the
27
+adjustment model as variation to be removed. If adjust_batch="no", it will
28
+not be used for normalization, but only for evaluation.}
29
+
30
+\item{negcon}{character. The genes to be used as negative controls for filtering, 
31
+normalization, and evaluation. These genes should be expressed uniformily across
32
+the biological phenomenon of interest.
33
+Default NULL.}
34
+
35
+\item{verbose}{character. Verbosity level.
36
+Default "0".}
37
+
38
+\item{out_dir}{character. Output directory. 
39
+Default getwd().}
40
+
41
+\item{seed}{numeric. Random seed. 
42
+Default 112233.}
43
+
44
+\item{filt_genes}{logical. Should genes be filtered post-sample filtering?
45
+Default TRUE.}
46
+
47
+\item{fnr_maxiter}{numeric. Maximum number of iterations in EM estimation of expression posteriors.
48
+Dafault 1000.}
49
+
50
+\item{norm_impute}{character. Should imputation be included in the comparison? 
51
+If 'force', only imputed normalizations will be run.
52
+Default "yes."}
53
+
54
+\item{norm_scaling}{character. Scaling options to be included in the Scaling Step. 
55
+Default c("sum", "deseq", "tmm", "uq", "fq"). See details.}
56
+
57
+\item{norm_rezero}{logical. Restore imputed zeroes to zero following the Scaling Step?
58
+Default TRUE.}
59
+
60
+\item{norm_k_max}{numeric. Maximum number (norm_k_max) of factors of unwanted variation modeled
61
+in the Adjustment Step.
62
+Default NULL.}
63
+
64
+\item{norm_qc_expl}{numeric. In automatic selection of norm_k_max, what
65
+fraction of variation must be explained by the first norm_k_max PCs of qc? 
66
+Default 0.5. Ignored if norm_k_max is not NULL.}
67
+
68
+\item{norm_adjust_bio}{character. If 'no' it will not be included in the model; if
69
+'yes', both models with and without 'bio' will be run; if 'force', only
70
+models with 'bio' will be run.
71
+Default "yes."}
72
+
73
+\item{norm_adjust_batch}{character. If 'no' it will not be modeled in the Adjustment Step;
74
+if 'yes', both models with and without 'batch' will be run; if 'force',
75
+only models with 'batch' will be run.
76
+Default "yes."}
77
+
78
+\item{eval_dim}{numeric. The number of principal components to use for evaluation.
79
+Default NULL.}
80
+
81
+\item{eval_expr_expl}{numeric. In automatic selection of eval_dim, what
82
+fraction of variation must be explained by the first eval_dim PCs of expr? 
83
+Default 0.2. Ignored if eval_dim is not NULL.}
84
+
85
+\item{eval_poscon}{character. The genes to be used as positive controls for
86
+evaluation. These genes should be expected to change according to the
87
+biological phenomenon of interest.}
88
+
89
+\item{eval_kclust}{numeric. The number of clusters (> 1) to be used for pam
90
+tightness evaluation. If an array of integers, largest average silhouette
91
+width (tightness) will be reported. If NULL, tightness will be returned NA.}
92
+
93
+\item{eval_stratified_pam}{logical. If TRUE then maximum ASW for PAM_SIL is
94
+separately computed for each biological-cross-batch condition (accepting
95
+NAs), and a weighted average is returned as PAM_SIL.
96
+Default TRUE.}
97
+}
98
+\value{
99
+Directory structure "ADD DESCRIPTION"
100
+}
101
+\description{
102
+Wrapper for Running Essential SCONE Modules
103
+}
104
+\details{
105
+"ADD DESCRIPTION"
106
+}
107
+