Browse code

cp from YosefCode

Michael Cole authored on 05/02/2016 02:05:20
Showing 30 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,14 @@
1
+Package: SCONE
2
+Version: 0.0.1
3
+Title: Single Cell Overview of Normalized Expression data
4
+Description: SCONE.
5
+Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
6
+	          role = c("aut", "cre", "cph")),
7
+	     person("Davide", "Risso", email = "risso.davide@gmail.com",
8
+	          role = c("aut")))
9
+Author: Michael Cole [aut, cre, cph] and Davide Risso [aut]
10
+Maintainer: Michael Cole <mbeloc@gmail.com>
11
+Date: 01-22-2016
12
+License: Artistic-2.0
13
+LazyLoad: yes
14
+RoxygenNote: 5.0.1
0 15
new file mode 100644
1 16
new file mode 100644
... ...
@@ -0,0 +1,532 @@
1
+## DR: if I understand correctly "nom" is used only for the name of the file to store the data.
2
+## There should be a better way to do this, using the name of the function (the opposite of get()?).
3
+
4
+source("/data/yosef/users/daviderisso/YosefCode/packages/RNASeq/OFBIT/SCONE/estimateFNR.R")
5
+source("/data/yosef/users/daviderisso/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE_DEFAULTS.R")
6
+library(BiocParallel,quietly = TRUE)
7
+library(cluster,quietly = TRUE)
8
+library(fpc,quietly = TRUE)
9
+
10
+runAdjustTask = function(task,evaluate_material,design,nested_model, to_store = FALSE){
11
+  
12
+#???  source("~/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE.R") # library(SCONE)
13
+  
14
+  result = try({
15
+    
16
+    if(file.exists(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))){
17
+      if(to_store){
18
+        load(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))
19
+        score_out = scone_out$evaluation
20
+      
21
+        # Return values
22
+        if(!is.na(score_out)[1]){
23
+          scores = score_out$scores
24
+        }else{
25
+          scores = NA
26
+        }
27
+  
28
+        print(task$nom)
29
+        return(list(scores = scores))
30
+      }else{
31
+        print(task$nom)
32
+        return()
33
+      }
34
+    
35
+    }else{
36
+    
37
+      norm_lout = ADJUST_FN(task$ei,batch = task$batch,
38
+                            bio = task$condition,
39
+                            uv = task$uv,
40
+                            w = task$w,
41
+                            design = design, 
42
+                            nested_model = nested_model)
43
+      norm_out = exp(norm_lout) - 1
44
+      score_out = scoreMethod(e = norm_out ,condition = evaluate_material$condition, batch = evaluate_material$batch,
45
+                            qual_scores = evaluate_material$qual_scores, HK_scores = evaluate_material$HK_scores, 
46
+                            DE_scores = evaluate_material$DE_scores, CC_scores = evaluate_material$CC_scores,
47
+                            dim_eval = evaluate_material$dim_eval, K_clust = evaluate_material$K_clust, K_NN = evaluate_material$K_NN,
48
+                            fnr_pi = evaluate_material$fnr_pi, nom = task$nom)
49
+    
50
+      scone_out = list(exp_mat = norm_out,evaluation = score_out,method = task$nom)
51
+      save(scone_out,file = paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) 
52
+      if(to_store){
53
+      
54
+        # Return values
55
+        if(!is.na(score_out)[1]){
56
+          scores = score_out$scores
57
+        }else{
58
+          scores = NA
59
+        }
60
+    
61
+        print(task$nom)
62
+        return(list(scores = scores))
63
+      }else{
64
+        print(task$nom)
65
+        return()
66
+      }
67
+    
68
+    }
69
+  
70
+  })
71
+  
72
+  return(NA)
73
+  
74
+}
75
+
76
+runFactorFreeTask = function(task,evaluate_material,emat){
77
+  
78
+  ## ??? source("~/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE.R") # library(SCONE)
79
+  
80
+  result = try({
81
+    if(file.exists(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))){
82
+      load(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))
83
+      norm_out = scone_out$exp_mat
84
+      score_out = scone_out$evaluation
85
+    
86
+      # Return values
87
+      if(!is.na(score_out)[1]){
88
+        scores = score_out$scores
89
+      }else{
90
+        scores = NA
91
+      }
92
+    
93
+      print(task$nom)
94
+      return(list(eo = norm_out, scores = scores))
95
+    
96
+      }else{
97
+        norm_out = task$FN(emat)
98
+
99
+        score_out = scoreMethod(e = norm_out ,condition = evaluate_material$condition, batch = evaluate_material$batch,
100
+                          qual_scores = evaluate_material$qual_scores, HK_scores = evaluate_material$HK_scores, 
101
+                          DE_scores = evaluate_material$DE_scores, CC_scores = evaluate_material$CC_scores,
102
+                          dim_eval = evaluate_material$dim_eval, K_clust = evaluate_material$K_clust, K_NN = evaluate_material$K_NN,
103
+                          fnr_pi = evaluate_material$fnr_pi, nom = task$nom)
104
+  
105
+      scone_out = list(exp_mat = norm_out,evaluation = score_out,method = task$nom)
106
+      save(scone_out,file = paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) 
107
+  
108
+      # Return values
109
+      if(!is.na(score_out)[1]){
110
+        scores = score_out$scores
111
+      }else{
112
+        scores = NA
113
+      }
114
+  
115
+      print(task$nom)
116
+      return(list(eo = norm_out, scores = scores))
117
+    }
118
+  
119
+  })
120
+  
121
+  return(NA)
122
+  
123
+}
124
+
125
+scoreMethod = function(e,condition = NULL, batch = NULL, 
126
+                       qual_scores = NULL, HK_scores = NULL,
127
+                       DE_scores = NULL, CC_scores = NULL,
128
+                       dim_eval = NULL, K_clust = NULL, K_NN = NULL,
129
+                       fnr_pi = NULL, nom = NULL){
130
+  if(any(is.na(e) | is.infinite(e) | is.nan(e))){
131
+    warning("NA/Inf/NaN Expression Values. Returning NA.")
132
+    return(NA)
133
+  }
134
+  # Project the data: PCA or wPCA
135
+  if(grepl("^IMPUTE",nom)){
136
+    pc_val = prcomp(t(log(e + 1)),center = TRUE,scale. = TRUE)$x[,1:dim_eval]
137
+  }else{
138
+    pc_val = wPCA(log(e + 1),1 - fnr_pi,nu = dim_eval,filt = TRUE)$x
139
+  }
140
+  
141
+  # Max cor with quality scores
142
+  EXP_QPC_COR = max(cor(pc_val,qual_scores,method = "spearman")^2)
143
+  
144
+  # Max cor with housekeeping scores
145
+  if(!is.null(HK_scores)){
146
+    EXP_HK_COR = max(cor(pc_val,HK_scores,method = "spearman")^2)
147
+  }else{
148
+    EXP_HK_COR = NA
149
+  }
150
+  
151
+  # Max cor with differentially expressed scores
152
+  if(!is.null(DE_scores)){
153
+    EXP_DE_COR = max(cor(pc_val,DE_scores,method = "spearman")^2)
154
+  }else{
155
+    EXP_DE_COR = NA
156
+  }
157
+  
158
+  # Max cor with cell cycle scores
159
+  if(!is.null(CC_scores)){
160
+    EXP_CC_COR = max(cor(pc_val,CC_scores,method = "spearman")^2)
161
+  }else{
162
+    EXP_CC_COR = NA
163
+  }
164
+  
165
+  d = as.matrix(dist(pc_val,method = "euclidean"))
166
+  
167
+  # K-NN Condition
168
+  if(!is.null(condition)){
169
+    m <- sapply(seq_len(nrow(d)), function(s) {
170
+      ds <- d[,s]
171
+      ra <- rank(ds)
172
+      return(mean(condition[(ra <= K_NN+1) & (ra != 1)] == condition[s]))
173
+    })
174
+    KNN_BIO = mean(m)
175
+  }else{
176
+    KNN_BIO = NA
177
+  }
178
+  
179
+  # K-NN Batch
180
+  if(!is.null(batch)){
181
+    m <- sapply(seq_len(nrow(d)), function(s) {
182
+      ds <- d[,s]
183
+      ra <- rank(ds)
184
+      return(mean(batch[(ra <= K_NN+1) & (ra != 1)] == batch[s]))
185
+    })
186
+    KNN_BATCH = mean(m)
187
+  }else{
188
+    KNN_BATCH = NA
189
+  }
190
+  
191
+  # PAM Silh
192
+  if(K_clust > 1){
193
+    pam_object = pam(pc_val,k = K_clust)
194
+    PAM_SIL = pam_object$silinfo$avg.width
195
+    clusters = pam_object$clustering
196
+  }else{
197
+    PAM_SIL = NA
198
+    clusters = NA
199
+  }
200
+  scores = c(EXP_QPC_COR,EXP_HK_COR,EXP_DE_COR,EXP_CC_COR,KNN_BIO,KNN_BATCH,PAM_SIL)
201
+  names(scores) = c("EXP_QPC_COR","EXP_HK_COR","EXP_DE_COR","EXP_CC_COR","KNN_BIO","KNN_BATCH","PAM_SIL")
202
+  return(list(scores = scores, pc_val = pc_val, clusters = clusters))
203
+}
204
+
205
+## ===== SCONE: Single-Cell Overview of Normalized Expression data =====
206
+# Intermediate wrapper is missing"
207
+# Decide model for the user - describe reason -"
208
+# One batch one condition - user friendly 
209
+# Check rank of design matrix - if nested - we're good - IFF?
210
+# Positive (evaluate) and negative controls (modeled + evaluate)
211
+# show correlation between q and hk - why we don't try all 
212
+# Default number of dim_UV
213
+# Factor free norms and adjust norms as lists = impute or not, positive data or not, only q , only hk
214
+# function 1, function 2, and k
215
+SCONE = function(e,condition = NULL, batch = NULL, 
216
+                 design = c("factorial","nested"), 
217
+                 nested_model = c("fixed","random"),
218
+                 qual = NULL, is_HK = NULL,
219
+                 is_DE = NULL,is_CC = NULL,
220
+                 dim_UV = 5,
221
+                 dim_eval = 3, K_clust = NULL, K_NN = 10, 
222
+                 out_dir = NULL,
223
+                 K_clust_MAX = 10,K_pseudobatch_MAX = 10,
224
+                 factor_free_only = FALSE, to_store = FALSE, n_workers = 10){
225
+  param = MulticoreParam(workers = n_workers)
226
+  ## ----- Set up output directory ----
227
+  if (file.exists(out_dir)){
228
+    #stop("Designated output directory already exists.")
229
+  } else {
230
+    dir.create(out_dir)
231
+  }
232
+  
233
+  ## ----- Imputation Step ----
234
+  print("Imputation Step...")
235
+  HK_default = FALSE
236
+  if(is.null(is_HK)){
237
+    warning("No control genes have been specified, using all genes for FNR estimation.")
238
+    HK_default = TRUE
239
+    is_HK = rep(TRUE ,nrow(e))
240
+  }
241
+  if(file.exists(paste0(out_dir,"/fnr_out.Rdata"))){
242
+    load(paste0(out_dir,"/fnr_out.Rdata"))
243
+  }else{
244
+    fnr_out = estimateFNR(e, bulk_model = TRUE, is.expressed = is_HK)
245
+    save(fnr_out,file = paste0(out_dir,"/fnr_out.Rdata"))
246
+  }
247
+  fnr_mu = exp(fnr_out$Alpha[1,])
248
+  fnr_pi = (e == 0) * fnr_out$Z 
249
+  imputed_e = e + (fnr_mu * fnr_pi)
250
+  print("Complete.")
251
+  
252
+  ## ----- Generate Unwanted Factors ----
253
+  print("Generating Unwanted Factors for Evaluation...")
254
+  if(file.exists(paste0(out_dir,"/evaluate_material.Rdata"))){
255
+    
256
+    load(paste0(out_dir,"/evaluate_material.Rdata"))
257
+    
258
+  }else{
259
+    
260
+  # Factors of alignment quality metrics
261
+  if(is.null(qual)){
262
+    warning("No quality metrics specified. Total counts and efficiency used.")
263
+    numba = colSums(e == 0)
264
+    beta = colMeans(e == 0)
265
+    qual = cbind(numba, log(beta/(1-beta)))
266
+  }
267
+  
268
+  if(is.null(dim_UV)){
269
+    dim_UV = min(ncol(qual), sum(is_HK))
270
+    warning(paste("Number of unwanted factors not specified. Selecting minimum of number of quality features and number of control genes:",dim_UV))
271
+    if(dim_UV >= ncol(e)){
272
+      stop("Number of unwanted factors >= number of samples!")
273
+    }
274
+  }
275
+  
276
+  qual_scores = prcomp(qual, center = TRUE, scale = TRUE)$x[,1:dim_UV]
277
+  
278
+  if(!HK_default){
279
+    HK_scores = wPCA(log(e[is_HK,] + 1), 1 - fnr_pi[is_HK,], nu = dim_UV, filt = TRUE)$x
280
+  }else{
281
+    HK_scores = NULL
282
+  }
283
+  
284
+  if(!is.null(is_DE)){
285
+    DE_scores = wPCA(log(e[is_DE,] + 1), 1 - fnr_pi[is_DE,], nu = dim_eval, filt = TRUE)$x
286
+  }else{
287
+    DE_scores = NULL
288
+  }
289
+  
290
+  if(!is.null(is_CC)){
291
+    CC_scores = wPCA(log(e[is_CC,] + 1), 1 - fnr_pi[is_CC,], nu = dim_eval, filt = TRUE)$x
292
+  }else{
293
+    CC_scores = NULL
294
+  }
295
+  
296
+  ## ----- Evaluation Criteria ----
297
+  if(is.null(K_clust)){
298
+    if(is.null(DE_scores)){
299
+      warning("No clustering K specified. Selecting K using PAM average silhouette width on raw wPCA.")
300
+      pc_val = wPCA(log(e + 1), 1 - fnr_pi, nu = dim_eval, filt = TRUE)$x
301
+    }else{
302
+      warning("No clustering K specified. Selecting K using PAM average silhouette width on DE scores.")
303
+      pc_val = DE_scores
304
+    }
305
+    
306
+    pamk.best <- pamk(data = pc_val, krange = 1:K_clust_MAX)
307
+    K_clust = pamk.best$nc
308
+  }
309
+  
310
+  data_dir = paste0(out_dir,"/data/")
311
+  if (file.exists(data_dir)){
312
+    #stop("Designated data directory already exists.")
313
+  } else {
314
+    dir.create(data_dir)
315
+  }
316
+  
317
+  evaluate_material = list(condition = condition, batch = batch, 
318
+                           qual_scores = qual_scores, HK_scores = HK_scores,
319
+                           DE_scores = DE_scores, CC_scores = CC_scores,
320
+                           dim_eval = dim_eval, K_clust = K_clust, K_NN = K_NN,
321
+                           fnr_pi = fnr_pi, out_dir = data_dir)
322
+  
323
+  save(evaluate_material,file = paste0(out_dir,"/evaluate_material.Rdata"))
324
+  
325
+  }
326
+  
327
+  print("Complete.")
328
+  
329
+  
330
+  ## ----- Factor-Free Normalization
331
+  ## DR: major change: the current code will copy over and over the same matrices (e and imputed_e).
332
+  ## There's no need to do so as we can infer what matrix to use from the name.
333
+  
334
+  task_list = list()
335
+  
336
+  # Generate Task List for Non-imputed
337
+  task_names = paste("NOIMPUTE",c("NONE","UQ","UQP","DESEQ","DESEQP","FQ","FQP","TMM"),sep = "_")
338
+  
339
+  ##DR: why not TMM__FN_POS?
340
+  task_FN = c(identity,UQ_FN,UQ_FN_POS,DESEQ_FN,DESEQ_FN_POS,FQ_FN,FQ_FN_POS,TMM_FN)
341
+  names(task_FN) = task_names
342
+  for (task in task_names){
343
+    task_list[[task]] = list(FN = task_FN[[task]], nom = task)
344
+  }
345
+  # Generate Task List for Imputed
346
+  task_names = paste("IMPUTE",c("NONE","UQ","DESEQ","FQ","TMM"),sep = "_")
347
+  task_FN = c(identity,UQ_FN,DESEQ_FN,FQ_FN,TMM_FN)
348
+  names(task_FN) = task_names
349
+  for (task in task_names){
350
+    task_list[[task]] = list(FN = task_FN[[task]], nom = task)
351
+  }
352
+  
353
+  # Parallel Normalization + Evaluation
354
+  print("Factor-Free Normalization and Evaluation...")
355
+  factor_free_out_noimpute = bplapply(task_list[grep("^NOIMPUTE", names(task_list))], FUN = runFactorFreeTask, evaluate_material = evaluate_material, emat=e, BPPARAM = param)
356
+  factor_free_out_impute = bplapply(task_list[grep("^IMPUTE", names(task_list))], FUN = runFactorFreeTask, evaluate_material = evaluate_material, emat=imputed_e, BPPARAM = param)
357
+  factor_free_out <- c(factor_free_out_noimpute, factor_free_out_impute)
358
+  print("Complete.")
359
+  
360
+  ## ----- Selection Step (Optional)
361
+  if(factor_free_only){
362
+    return(list(factor_free_out = factor_free_out, factor_based_out = NA))
363
+  }
364
+  ## ----- Factor-Based Normalization
365
+  
366
+  print("Selecting Base Tasks...")
367
+  base_task_list <- lapply(1:length(factor_free_out), function(i) {
368
+    if(!any(is.na(factor_free_out[[i]]$eo) | is.nan(factor_free_out[[i]]$eo) | is.infinite(factor_free_out[[i]]$eo))) {
369
+      return(list(ei = factor_free_out[[i]]$eo, nom = names(factor_free_out)[i]))
370
+    }
371
+  })
372
+  names(base_task_list) <- names(factor_free_out)
373
+  base_task_list <- base_task_list[!sapply(base_task_list, is.null)]
374
+  print("Complete.")
375
+  
376
+  
377
+  # WEIGHT
378
+  print("Extending Tasks by Weighting Scheme...")
379
+  ext_task_list = list()
380
+  for( task in names(base_task_list)){
381
+    base_nom = task
382
+    if(!grepl("^IMPUTE",task)){
383
+      ext_nom = paste(base_nom,"ZEROWEIGHT",sep = "_")
384
+      ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
385
+      ext_task_list[[ext_nom]]$nom = ext_nom 
386
+      ext_task_list[[ext_nom]]$w = 0 + (ext_task_list[[ext_nom]]$ei > 0)
387
+    }
388
+    ext_nom = paste(base_nom,"NOWEIGHT",sep = "_")
389
+    ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
390
+    ext_task_list[[ext_nom]]$nom = ext_nom 
391
+    ext_task_list[[ext_nom]]$w = NULL
392
+  }
393
+  print("Complete.")
394
+  
395
+  base_task_list = ext_task_list
396
+  
397
+  # Generate Method-Specific Factors (Should be parallel)
398
+  print("Computing Method-Specific Factors...")
399
+  
400
+  task_factor_list = list()  
401
+  for( task in names(base_task_list)){
402
+    
403
+    # HK
404
+    if(!HK_default){
405
+      if(!is.null(base_task_list[[task]]$w)){
406
+        task_factor_list[[task]]$HK_scores = wPCA(x = log(base_task_list[[task]]$ei[is_HK,]+1),w = base_task_list[[task]]$w[is_HK,],nu = dim_UV,filt = TRUE)$x
407
+      }else{
408
+        task_factor_list[[task]]$HK_scores = prcomp(t(log(base_task_list[[task]]$ei[is_HK,]+1)))$x[,1:dim_UV]
409
+      }
410
+      
411
+      if(is.null(K_pseudobatch)){
412
+        pamk_object = pamk(task_factor_list[[task]]$HK_scores,krange = 1:K_pseudobatch_MAX)
413
+        task_factor_list[[task]]$hk_nc = pamk_object$nc
414
+        task_factor_list[[task]]$pam_clustering =  pamk_object$pamobject$clustering
415
+      }else{
416
+        task_factor_list[[task]]$hk_nc = K_pseudobatch
417
+        task_factor_list[[task]]$pam_clustering = pam(task_factor_list[[task]]$HK_scores,k = K_pseudobatch)$clustering
418
+      }
419
+    }else{
420
+      task_factor_list[[task]]$HK_scores = NULL
421
+      task_factor_list[[task]]$hk_nc = NULL
422
+      task_factor_list[[task]]$pam_clustering = NULL
423
+    }
424
+  }
425
+  print("Complete.")
426
+  
427
+  # Generate General Factor Clusterings
428
+  print("Generating General Factor-Based Clusters...")
429
+  if(is.null(K_pseudobatch)){
430
+    pamk_object = pamk(evaluate_material$qual_scores,krange = 1:K_pseudobatch_MAX)
431
+    qual_nc = pamk_object$nc
432
+    qual_clustering =  as.factor(pamk_object$pamobject$clustering)
433
+  }else{
434
+    qual_nc = K_pseudobatch
435
+    qual_clustering = as.factor(pam(evaluate_material$qual_scores,k = K_pseudobatch)$clustering)
436
+  }
437
+  print("Complete.")
438
+  
439
+  # BIO
440
+  print("Extending Tasks by Biological Covariate...")
441
+  ext_task_list = list()
442
+  for( task in names(base_task_list)){
443
+    base_nom = task
444
+    
445
+    if(!is.null(condition)){
446
+      ext_nom = paste(base_nom,"BIO",sep = "_")
447
+      ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
448
+      ext_task_list[[ext_nom]]$nom = ext_nom 
449
+      ext_task_list[[ext_nom]]$condition = condition
450
+    }
451
+    ext_nom = paste(base_nom,"NOBIO",sep = "_")
452
+    ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
453
+    ext_task_list[[ext_nom]]$nom = ext_nom 
454
+    ext_task_list[[ext_nom]]$condition = NULL
455
+  }
456
+  print("Complete.")
457
+  
458
+  print("Extending Tasks by Batch Covariate...")
459
+  base_task_list = ext_task_list
460
+  # BATCH
461
+  ext_task_list = list()
462
+  for( task in names(base_task_list)){
463
+    base_nom = task
464
+    
465
+    if(!is.null(batch)){
466
+      ext_nom = paste(base_nom,"BATCH",sep = "_")
467
+      ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
468
+      ext_task_list[[ext_nom]]$nom = ext_nom 
469
+      ext_task_list[[ext_nom]]$batch = batch
470
+    }
471
+    ext_nom = paste(base_nom,"NOBATCH",sep = "_")
472
+    ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
473
+    ext_task_list[[ext_nom]]$nom = ext_nom 
474
+    ext_task_list[[ext_nom]]$batch = NULL
475
+  }
476
+  print("Complete.")
477
+  
478
+  print("Extending Tasks by UV Covariates...")
479
+  base_task_list = ext_task_list  
480
+  # UV
481
+  ext_task_list = list()
482
+  for( task in names(base_task_list)){
483
+    base_nom = task
484
+    method_class = gsub("WEIGHT_.*","WEIGHT",task)
485
+    
486
+    ext_nom = paste(base_nom,"NOUV",sep = "_")
487
+    ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
488
+    ext_task_list[[ext_nom]]$nom = ext_nom 
489
+    ext_task_list[[ext_nom]]$uv = NULL
490
+    
491
+    # HK UV
492
+    if(!HK_default){
493
+      for(j in 1:dim_UV){
494
+        ext_nom = paste(base_nom,paste("HK",j,sep = "_"),sep = "_")
495
+        ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
496
+        ext_task_list[[ext_nom]]$nom = ext_nom 
497
+        ext_task_list[[ext_nom]]$uv = as.matrix(task_factor_list[[method_class]]$HK_scores[,1:j])
498
+      }  
499
+      #       if(task_factor_list[[method_class]]$hk_nc > 1){
500
+      #         ext_nom = paste(base_nom,"HKCLUST",sep = "_")
501
+      #         ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
502
+      #         ext_task_list[[ext_nom]]$nom = ext_nom 
503
+      #         ext_task_list[[ext_nom]]$uv = as.matrix(as.factor(task_factor_list[[method_class]]$pam_clustering))
504
+      #       }
505
+    }
506
+    # Qual UV
507
+    for(j in 1:dim_UV){
508
+      ext_nom = paste(base_nom,paste("Q",j,sep = "_"),sep = "_")
509
+      ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
510
+      ext_task_list[[ext_nom]]$nom = ext_nom 
511
+      ext_task_list[[ext_nom]]$uv = as.matrix(evaluate_material$qual_scores[,1:j])
512
+    }  
513
+    #     if(qual_nc > 1){
514
+    #       ext_nom = paste(base_nom,"QCLUST",sep = "_")
515
+    #       ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
516
+    #       ext_task_list[[ext_nom]]$nom = ext_nom 
517
+    #       ext_task_list[[ext_nom]]$uv = as.matrix(as.factor(qual_clustering))
518
+    #     }
519
+  }
520
+  print("Complete.")
521
+  ext_task_list = ext_task_list[!grepl("NOBATCH_NOUV",names(ext_task_list))]
522
+  if(!to_store){
523
+    finished_tasks = gsub(".Rdata","",list.files(evaluate_material$out_dir))
524
+    ext_task_list = ext_task_list[! (names(ext_task_list) %in% finished_tasks)]
525
+  }
526
+  print(length(ext_task_list))
527
+  print("Factor-Based Adjustment Normalization and Evaluation...")
528
+  factor_based_out = bplapply(ext_task_list,FUN = runAdjustTask, evaluate_material = evaluate_material,design = design,nested_model = nested_model,BPPARAM = param)
529
+  print("Complete.")
530
+  
531
+  return(list(factor_free_out = factor_free_out, factor_based_out = factor_based_out))
532
+}
0 533
new file mode 100644
... ...
@@ -0,0 +1,392 @@
1
+library(DESeq,quietly = TRUE)
2
+library(EDASeq,quietly = TRUE)
3
+library(edgeR,quietly = TRUE)
4
+library(sva,quietly = TRUE)
5
+library(aroma.light,quietly = TRUE)
6
+library(lme4,quietly = TRUE)
7
+
8
+#' Upper-quartile normalization wrapper.
9
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
10
+#' @return Upper-quartile normalized matrix (scaled by sample UQ)
11
+UQ_FN = function(ei){
12
+  eo = betweenLaneNormalization(ei, which="upper", round = FALSE)
13
+  return(eo)
14
+}
15
+
16
+uq_f <- function(which="upper", round = FALSE){
17
+  function(x) {
18
+    betweenLaneNormalization(x, which = which, round = round)
19
+  }
20
+}
21
+
22
+# setMethod(f="UQ_FN",
23
+#          signature="SummarizedExperiment",
24
+#          definition=function(x) {
25
+#            exprs(x) <- UQ_FN(exprs(x))
26
+#            return(x)
27
+#          })
28
+# 
29
+# setMethod(f="UQ_FN",
30
+#           signature="matrix",
31
+#           definition= function(ei){
32
+#             eo = betweenLaneNormalization(ei, which="upper", round = FALSE)
33
+#             return(eo)
34
+#           })
35
+
36
+#' Upper-quartile normalization applied to positive data.
37
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
38
+#' @return Upper-quartile (positive) normalized matrix (scaled by sample UQ)
39
+UQ_FN_POS = function(ei){
40
+  zei = ei
41
+  zei[ei == 0] = NA
42
+  q = apply(zei, 2, quantile, 0.75, na.rm = TRUE)
43
+  zeo = t(t(zei)/q)*mean(q)
44
+  eo = zeo
45
+  eo[ei == 0] = 0
46
+  return(eo)
47
+}
48
+
49
+#' Full-Quantile normalization wrapper.
50
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
51
+#' @return FQ normalized matrix.
52
+FQ_FN = function(ei){
53
+  eo = normalizeQuantileRank.matrix(ei)
54
+  return(eo)
55
+}
56
+
57
+#' Full-Quantile normalization applied to positive data.
58
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
59
+#' @return FQ (positive) normalized matrix.
60
+FQ_FN_POS = function(ei){
61
+  
62
+  # Vector of integers used for computation
63
+  base_rank = 1:nrow(ei)
64
+  
65
+  # Quantile Index Matrix: Values between 0 and 1 corresponding to quantile
66
+  quant_mat = NULL
67
+  # Re-ordered Data Matrix
68
+  x_mat = NULL
69
+  print("Sorting Matrix...")
70
+  # For each sample:
71
+  for (i in 1:dim(ei)[2]){
72
+    # Sort data and replace zeroes with NA
73
+    x_mat = cbind(x_mat,rev(sort(ei[,i])))
74
+    x_mat[x_mat == 0 ] = NA
75
+    # Compute corresponding quantile indices for that sample
76
+    quant = base_rank/sum(ei[,i]>0)
77
+    quant[quant > 1] = NA
78
+    quant_mat = cbind(quant_mat,quant)
79
+  }
80
+  
81
+  # Vector form of quantile index matrix
82
+  quant_out = as.numeric(as.vector(quant_mat))
83
+  print("Complete.")
84
+  print("Spline Interpolation...")
85
+  # Interpolation Matrix (Values of all quantiles)
86
+  inter_mat = rep(0,length(quant_out))
87
+  ob_counts = rep(0,length(quant_out)) # Number of observations for averaging
88
+  # For each sample
89
+  for (i in 1:dim(ei)[2]){
90
+    # Produce spline interpolation for that sample, value ~ quantile index
91
+    x1 = na.omit(quant_mat[,i])
92
+    y1 = na.omit(x_mat[,i])
93
+    # Evaluated at all possible quantile indices
94
+    inter = approx(x1,y1,xout = quant_out, rule = 2)$y
95
+    ob_counts = ob_counts + !is.na(inter)
96
+    inter[is.na(inter)] = 0
97
+    inter_mat = inter_mat + inter
98
+  }
99
+  print("Complete.")
100
+  
101
+  # Average over the interpolated values from all samples
102
+  inter_mean = inter_mat/ob_counts
103
+  
104
+  ## Substituting Mean Interpolated Values for Expression Values and Return
105
+  eo = matrix(inter_mean,ncol = dim(ei)[2])
106
+  eo[is.na(eo)] = 0
107
+  for (i in 1:dim(ei)[2]){
108
+    eo[,i] = rev(eo[,i])[order(order(ei[,i]))]
109
+  }
110
+  print("Finished!")
111
+  return(eo)
112
+}
113
+
114
+#' DESeq normalization wrapper.
115
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
116
+#' @return DESeq normalized matrix (scaled by sample )
117
+DESEQ_FN = function(ei){
118
+  size_fac = estimateSizeFactorsForMatrix(ei)
119
+  eo = t(t(ei)/size_fac)
120
+  return(eo)
121
+}
122
+
123
+#' DESeq normalization applied to positive data.
124
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
125
+#' @return DESeq (positive) normalized matrix (scaled by sample )
126
+DESEQ_FN_POS = function(ei){
127
+  if(any(ei < 0)){
128
+    stop("Negative values in input.")
129
+  }
130
+  if(!is.null(dim(ei))){
131
+    y = ei
132
+    y[y == 0] = NA # Matrix with zeroes replaced w/ NA
133
+    geom_mean = exp(apply(log(y),1,sum,na.rm = TRUE)/rowSums(!is.na(y))) # Compute Geometric Mean of Expression for Each Gene (Use positive data only)
134
+  }else{
135
+    stop("Null imput matrix dimension.")
136
+  }
137
+  if(!any(geom_mean > 0)){
138
+    stop("Geometric mean non-positive for all genes.")
139
+  } 
140
+  ratios = ei / geom_mean # Divide each Expression Value by Geometric Mean of Corresponding Gene
141
+  if(any(is.infinite(geom_mean))){
142
+    stop("Infinite mean! This should never happen :-<")
143
+  }
144
+  ratios = ratios[geom_mean > 0,] # Ignore Genes with Zero Mean
145
+  y = ratios
146
+  y[y == 0] = NA
147
+  size = apply(y,2,median,na.rm = TRUE) # Size factor taken as median of ratios (positive data only)
148
+  if(any(size == 0)){
149
+    stop("Zero library size. This should never happen :-(")
150
+  }
151
+  eo = t(t(ei)/size)*mean(size)
152
+  return(eo)
153
+}
154
+
155
+#' TMM normalization wrapper.
156
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
157
+#' @return TMM normalized matrix (scaled by sample )
158
+TMM_FN = function(ei){
159
+  size_fac = calcNormFactors(ei,method = "TMM")
160
+  eo = t(t(ei)/size_fac)
161
+  return(eo)
162
+}
163
+
164
+#' Output residuals + intercept of fit to quality score matrix
165
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
166
+#' @param default_args_list = list of default arguments
167
+#' @param args_list = list of user-provided arguments
168
+#' @return Matrix adjusted for K quality scores
169
+
170
+QUALREG_FN = function(ei, default_args_list, args_list){
171
+  EPSILON = default_args_list$EPSILON
172
+  scores = prcomp(default_args_list$q,center = TRUE,scale = TRUE)$x[,1:default_args_list$K]
173
+  re = log(ei + EPSILON)
174
+  for (g in 1:dim(ei)[1]){
175
+    lin.mod = lm(re[g,] ~ scores)
176
+    re[g,] = lin.mod$coefficients[1] + lin.mod$residuals
177
+  }
178
+  re = exp(re)
179
+  re[e == 0] = 0 
180
+  return(re)
181
+}
182
+
183
+#' Output residuals + intercept of fit to control-gene derived factors
184
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
185
+#' @param default_args_list = list of default arguments
186
+#' @param args_list = list of user-provided arguments
187
+#' @return Matrix adjusted for K unwanted factors
188
+
189
+RUVG_FN = function(ei, default_args_list, args_list){
190
+  EPSILON = default_args_list$EPSILON
191
+  scores = prcomp(t(log(ei[default_args_list$control_genes,]+default_args_list$EPSILON)),center = TRUE,scale. = FALSE)$x[,1:default_args_list$K]   
192
+  re = log(ei + default_args_list$EPSILON)
193
+  for (g in 1:dim(ei)[1]){
194
+    lin.mod = lm(re[g,] ~ scores)
195
+    re[g,] = lin.mod$coefficients[1] + lin.mod$residuals
196
+  }
197
+  re = exp(re)
198
+  re[e == 0] = 0 
199
+  return(re)
200
+}
201
+
202
+#' ComBat Wrapper
203
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
204
+#' @param default_args_list = list of default arguments
205
+#' @param args_list = list of user-provided arguments
206
+#' @return Matrix adjusted for batch
207
+
208
+COMBAT_FN = function(ei, default_args_list, args_list){
209
+  EPSILON = default_args_list$EPSILON
210
+  SD_EPSILON = default_args_list$SD_EPSILON
211
+  ei = log(ei + default_args_list$EPSILON)
212
+  pheno = as.data.frame(cbind(default_args_list$tech_batch,default_args_list$bio_cond))
213
+  if(is.null(default_args_list$bio_cond) || any(is.na(default_args_list$bio_cond))){
214
+    colnames(pheno) = c("batch")
215
+    is_var = apply(ei,1,sd) > SD_EPSILON
216
+    mod = model.matrix(~ 1, data = pheno)
217
+  }else{
218
+    colnames(pheno) = c("batch","phenotype")
219
+    is_var = T
220
+    for (p in unique(default_args_list$bio_cond)){
221
+      is_var = is_var & (apply(ei[,default_args_list$bio_cond == p],1,sd) > SD_EPSILON)
222
+    }
223
+    mod = model.matrix(~ as.factor(phenotype), data = pheno)
224
+  }
225
+  eo = ei
226
+  eo[is_var,] = ComBat(dat=ei[is_var,], batch=default_args_list$tech_batch, mod=mod, par.prior=TRUE, prior.plots=FALSE)
227
+  eo = exp(eo)
228
+  eo[ei == 0] = 0
229
+  return(eo)
230
+}
231
+
232
+NESTED_BATCH_FIXED_FN = function(ei,bio,batch,uv = NULL,w = NULL){
233
+  if(!is.null(bio)){
234
+    bio = factor(bio, levels = sort(levels(bio)))
235
+    batch = factor(batch, levels = unique(batch[order(bio)]))
236
+
237
+    n_vec <- tapply(batch, bio, function(x) nlevels(droplevels(x)))
238
+  }else{
239
+    n_vec = nlevels(droplevels(batch))
240
+  }
241
+  
242
+  mat = matrix(0,nrow = sum(n_vec),ncol = sum(n_vec - 1))
243
+  xi = 1
244
+  yi = 1
245
+  for(i in 1:length(n_vec)){
246
+    if(n_vec[i] > 1){
247
+      cs = contr.sum(n_vec[i])
248
+      dd = dim(cs)
249
+      mat[xi:(xi + dd[1] - 1),yi:(yi + dd[2] - 1)] = cs
250
+      xi = xi + dd[1]
251
+      yi = yi + dd[2]
252
+    }else{
253
+      xi = xi + 1
254
+    }
255
+  }
256
+  
257
+  leo = log(ei+1)
258
+  if(is.null(uv)){
259
+    if(!is.null(bio)) {
260
+      design_mat <- model.matrix(~ bio + batch, contrasts=list(bio=contr.sum, batch=mat))
261
+    } else {
262
+      design_mat <- model.matrix(~ batch, contrasts=list(batch=mat))
263
+    }
264
+    lm_object <- lmFit(leo, design = design_mat, weights = w)
265
+
266
+    bind <- (ncol(lm_object$coefficients) - (sum(n_vec - 1) - 1)):(ncol(lm_object$coefficients))
267
+    leo = leo - t(attr(design_mat,"contrasts")$batch %*% t(lm_object$coefficients[,bind]))[,batch]
268
+    } else {
269
+      if(!is.null(bio)) {
270
+        design_mat <- model.matrix(~ bio + batch + uv, contrasts=list(bio=contr.sum, batch=mat))
271
+      } else {
272
+        design_mat <- model.matrix(~ batch + uv, contrasts=list(batch=mat))
273
+      }
274
+      lm_object <- lmFit(leo, design = design_mat, weights = w)
275
+
276
+      uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
277
+      bind = (ncol(lm_object$coefficients) - dim(uv)[2] - (sum(n_vec - 1) - 1)):(ncol(lm_object$coefficients) - dim(uv)[2])
278
+      leo = leo - t(attr(design_mat,"contrasts")$batch %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind]))
279
+    }
280
+  return(leo)
281
+}
282
+
283
+NESTED_BATCH_RANDOM_FN = function(ei,bio,batch,uv = NULL,w = NULL){
284
+  design <- model.matrix(~batch - 1)
285
+  leo = log(ei+1)
286
+  if(is.null(uv)){
287
+    for(i in 1:dim(ei)[1]){
288
+      lm_object = lmer(leo[i,] ~ 1 + bio + (1 | batch),weights = w[i,])
289
+      betaj <- ranef(lm_object)[[1]][,1]
290
+      leo[i,] = leo[i,] - as.vector(design %*% betaj)
291
+    }
292
+  }else{
293
+    for(i in 1:dim(ei)[1]){
294
+      lm_object = lmer(leo[i,] ~ 1 + bio + uv + (1 | batch),
295
+                       weights = w[i,])
296
+      betaj <- ranef(lm_object)[[1]][,1]
297
+      uvind = (length(coef(lm_object)[[1]][1,]) - dim(uv)[2] + 1):(length(coef(lm_object)[[1]][1,]))
298
+      leo[i,] = leo[i,] - as.vector(design %*% betaj) - as.vector(uv %*% t(coef(lm_object)[[1]][1,][uvind]))
299
+    }
300
+  }
301
+  return(leo)
302
+}
303
+
304
+FACTORIAL_BATCH_FN = function(ei,bio,batch,uv = NULL,w = NULL){
305
+  contrasts <- contr.treatment(length(levels(batch)))
306
+  rownames(contrasts) = levels(batch)
307
+  leo = log(ei+1)
308
+  
309
+  if(is.null(bio)){
310
+    if(is.null(uv)){
311
+      
312
+      # Just Batch
313
+      design_mat <- model.matrix(~batch)
314
+      lm_object <- lmFit(leo, design = design_mat, weights = w)
315
+      bind = (ncol(lm_object$coefficients) + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients))
316
+      leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch]
317
+    } else {
318
+      
319
+      # Batch + UV
320
+      design_mat <- model.matrix(~batch + uv)
321
+      lm_object <- lmFit(leo, design = design_mat, weights = w)
322
+      uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
323
+      bind = (ncol(lm_object$coefficients) - dim(uv)[2] + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients) - dim(uv)[2])
324
+      leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind]))
325
+    }
326
+  } else {
327
+    if(is.null(uv)){
328
+      
329
+      # Batch + BIO
330
+      design_mat <- model.matrix(~bio + batch)
331
+      lm_object <- lmFit(leo, design = design_mat, weights = w)
332
+      bind = (ncol(lm_object$coefficients) + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients))
333
+      leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch]
334
+
335
+    } else {
336
+      
337
+      # Batch + UV + BIO
338
+      design_mat <- model.matrix(~bio + batch + uv)
339
+      lm_object <- lmFit(leo, design = design_mat, weights = w)
340
+      uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
341
+      bind = (ncol(lm_object$coefficients) - dim(uv)[2] + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients) - dim(uv)[2])
342
+      leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind]))
343
+    }
344
+  }
345
+  return(leo)
346
+}
347
+
348
+BATCHFREE_FN = function(ei,bio,uv = NULL,w = NULL){
349
+  leo = log(ei+1)
350
+  
351
+  if(is.null(bio)){
352
+    # Just UV
353
+    design_mat <- model.matrix(~uv)
354
+    lm_object <- lmFit(leo, design = design_mat, weights = w)
355
+    uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
356
+    leo = leo - t(uv %*% t(lm_object$coefficients[,uvind]))
357
+  } else {
358
+    # Bio + UV
359
+    design_mat <- model.matrix(~bio + uv)
360
+    lm_object <- lmFit(leo, design = design_mat, weights = w)
361
+    uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
362
+    leo2 = leo - t(uv %*% t(lm_object$coefficients[,uvind]))
363
+  }
364
+  return(leo)
365
+}
366
+
367
+ADJUST_FN = function(ei,batch = NULL,
368
+                     bio = NULL,
369
+                     uv = NULL,
370
+                     w = NULL,
371
+                     design = c("factorial","nested"), 
372
+                     nested_model = c("fixed","random")){
373
+  
374
+  if(!is.null(batch)){
375
+    design <- match.arg(design)
376
+    
377
+    if(design == "factorial"){
378
+      leo = FACTORIAL_BATCH_FN(ei,bio,batch,uv,w)
379
+    }else{
380
+      nested_model <- match.arg(nested_model)
381
+      
382
+      if(nested_model == "fixed"){
383
+        leo = NESTED_BATCH_FIXED_FN(ei,bio,batch,uv,w)
384
+      }else{
385
+        leo = NESTED_BATCH_RANDOM_FN(ei,bio,batch,uv,w)
386
+      }
387
+    }
388
+  }else{
389
+    leo = BATCHFREE_FN(ei,bio,uv,w)
390
+  }
391
+  return(leo)
392
+}
0 393
\ No newline at end of file
1 394
new file mode 100644
... ...
@@ -0,0 +1,69 @@
1
+out_dir = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data"
2
+flist = paste(out_dir,list.files(out_dir),sep = "/")
3
+
4
+# Scoring
5
+scores = NULL
6
+snames = NULL
7
+for(fil in flist){
8
+  load(fil)
9
+  nom = scone_out$method
10
+  sc = scone_out$evaluation
11
+  if(!is.na(sc)[1]){
12
+    print(nom)
13
+    snames = c(snames,nom)
14
+    scores = rbind(scores,sc$scores)
15
+  }else{
16
+    # print(nom)
17
+  }
18
+}
19
+rownames(scores) = snames
20
+escores = t(t(scores)*c(-1,-1,1,1,1,-1,1))
21
+colnames(escores) = paste(colnames(scores),c("_INV","")[1.5 + c(-1,-1,1,1,1,-1,1)/2],sep = "")
22
+
23
+scores = t(na.omit(t(scores)))
24
+escores = t(na.omit(t(escores)))
25
+
26
+# Score Heatmaps
27
+require(gplots)
28
+heatmap.2(t(apply(scores ,2,rank)[order(-apply((apply(escores ,2,rank)),1,min)),][1:20,]),density.info = 'n',trace = 'n',key.xlab = "Score",key.title = NA,
29
+          Colv = NA,col = colorRampPalette(c("purple","black","yellow"))(100),
30
+          margins = c(15,10),cexRow = .75,cexCol = .5)
31
+title(main = "Top 20 SCONE",cex.lab=0.5)
32
+
33
+my_rank = (1 + max(rank(apply((apply(escores ,2,rank)),1,min))) - rank(apply((apply(escores ,2,rank)),1,min)))
34
+
35
+names(which(my_rank == 1))
36
+load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data/NOIMPUTE_NONE.Rdata")
37
+is_zero = scone_out$exp_mat == 0
38
+load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data/NOIMPUTE_FQ_NOWEIGHT_NOBIO_NOBATCH_HK_1.Rdata")
39
+myma = scone_out$exp_mat
40
+hk = read.table("~/YosefCode/packages/RNASeq/summary/EXAMPLE/reference_files/house_keeping_human_names.txt")$V1
41
+myma[is_zero] = 0
42
+fnr_out = estimateFNR(myma,bulk_model = T,is.expressed = rownames(myma) %in% as.character(hk) )
43
+w = 0 + !is_zero
44
+w[is_zero] = 1-fnr_out$Z[is_zero]
45
+wpc = wPCA(log(scone_out$exp_mat + 1),w,filt = T)
46
+plot(wpc$x[,1:2], col = cols)
47
+
48
+rownames(wpc$rotation) = rownames(scone_out$exp_mat)[wpc$to.pass]
49
+sort(-abs(wpc$rotation[,2]))[1:20]
50
+
51
+load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/evaluate_material.Rdata")
52
+pchs = 16
53
+cols = rainbow(length(levels(evaluate_material$batch)))[evaluate_material$batch]
54
+plot(scone_out$evaluation$pc_val[,c(1,2)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells")
55
+legend("bottomleft",legend = (levels(evaluate_material$batch)),pch = 16, col = rainbow(length(levels(evaluate_material$batch))), cex = .5)
56
+outy = log(scone_out$exp_mat+1)
57
+cols = rainbow(2)[scone_out$evaluation$clusters]
58
+plot(scone_out$evaluation$pc_val[,c(1,3)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells")
59
+
60
+write.table(outy,file = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/NOIMPUTE_FQ_NOWEIGHT_NOBIO_NOBATCH_HK_1_log_counts.txt",quote = F,sep = "\t",col.names = T,row.names = T)
61
+  evaluate_material$
62
+evaluate_material$dim_eval
63
+plot(evaluate_material$DE_scores[,c(1,2)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells")
64
+
65
+
66
+my_rank = (1 + max(rank(apply((apply(escores ,2,rank)),1,min))) - rank(apply((apply(escores ,2,rank)),1,min)))
67
+write.table(cbind(scores,my_rank),file = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/scores.txt",quote = F,sep = "\t")
68
+
69
+
0 70
new file mode 100644
... ...
@@ -0,0 +1,244 @@
1
+library(shiny)
2
+library(EDASeq)
3
+library(clusterCells)
4
+load("../shiny/tmp.rda")
5
+
6
+library(RColorBrewer)
7
+cc <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"),brewer.pal(12,"Set3"),
8
+        brewer.pal(8, "Accent"))
9
+
10
+choose_color <- function(input) {
11
+  if(input$color_code=="type") {
12
+    cols <- cc[type]
13
+  } else {
14
+    cols <- cc[batch]
15
+  }
16
+  return(cols)
17
+}
18
+
19
+server <- function(input, output) {
20
+    output$boxplot <- renderPlot({
21
+      set.seed(19883)
22
+      if(input$downsample) {
23
+        idx <- sample(1:100)
24
+      } else {
25
+        idx <- 1:ncol(norm)
26
+      }      
27
+      boxplot(log_norm[,idx], boxcol=cc[get(input$color_code)[idx]], medlwd=5, col=cc[get(input$color_code)[idx]],
28
+              outline=FALSE, names=FALSE, ylab="log(count+1)", main="Log normalized counts")
29
+      lines(apply(log_norm[,idx], 2, median))
30
+    }
31
+    )
32
+    
33
+    output$rle <- renderPlot({
34
+      set.seed(19883)
35
+      if(input$downsample & ncol(norm)>100) {
36
+        idx <- sample(1:100)
37
+      } else {
38
+        idx <- 1:ncol(norm)
39
+      }
40
+      boxplot(rle[,idx], boxcol=cc[get(input$color_code)[idx]], medlwd=5, col=cc[get(input$color_code)[idx]],
41
+              outline=FALSE, names=FALSE, ylab="Relative Log Expression", main="Relative Log Expression (RLE)")
42
+      abline(h=0, lty=2)
43
+    })
44
+    
45
+    output$pca <- renderPlot(
46
+      plot(pca$x[,1:2], col=choose_color(input), pch=19, main="All genes")
47
+    )
48
+    
49
+    output$pca_pc <- renderPlot(
50
+      plot(pca_pc$x[,1:2], col=choose_color(input), pch=19, main="Positive controls")
51
+    )
52
+    
53
+    output$pca_nc <- renderPlot(
54
+      plot(pca_nc$x[,1:2], col=choose_color(input), pch=19, main="Negative controls")
55
+    )
56
+
57
+    output$pca_mv <- renderPlot(
58
+      if(input$variance=="variance") {
59
+        plot(pca_var$x[,1:2], col=choose_color(input), pch=19, main="Top 100 most variable genes")        
60
+      } else {
61
+        plot(pca_cv$x[,1:2], col=choose_color(input), pch=19, main="Top 100 most variable genes")        
62
+      } 
63
+    )
64
+    
65
+    output$qc_plot <- renderPlot({
66
+      barplot(abs(cors), beside = TRUE, col=cc[1:ncol(qc)], border=cc[1:ncol(qc)], ylim=c(0, 1),
67
+              space=c(0, 4), main="Absolute correlation with QC scores")
68
+      legend("topright", colnames(qc), fill=cc, border=cc, cex=.5)  
69
+    }
70
+    )
71
+
72
+    output$qc_pca <- renderPlot({
73
+      plot(qc_pca$x[,1:2], pch=19, col=choose_color(input), main="Quality PCA")
74
+    }
75
+    )
76
+    
77
+    output$qc_corr <- renderTable(abs(cors[,1:5]))
78
+    
79
+    output$pam_cc <- renderPlot({
80
+      dualHeatmap(heatData = sub[[input$k]], main="Heatmap of co-clustering, based on 100 subsamplings",
81
+                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, 
82
+                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
83
+                  annColor=list(Batch=cc, Type=cc))
84
+    })
85
+    
86
+    output$pam_type <- renderTable({
87
+      table(pam[[input$k-1]]$clustering, type)      
88
+    })
89
+    
90
+    output$pam_batch <- renderTable({
91
+      table(pam[[input$k-1]]$clustering, batch)      
92
+    })
93
+    
94
+    output$pam_pca <- renderPlot({
95
+      plot(pca$x[,1:2], pch=19, col=cc[pam[[input$k-1]]$clustering], main="Expression PCA")
96
+      legend("topright", as.character(1:input$k), fill=cc, cex=.5)
97
+    })
98
+    
99
+    output$pam_qc <- renderPlot({
100
+      plot(qc_pca$x[,1:2], pch=19, col=cc[pam[[input$k-1]]$clustering], main="Quality PCA")
101
+      legend("topright", as.character(1:input$k), fill=cc, cex=.5)
102
+    })
103
+    
104
+    output$heatmap_pc <- renderPlot({
105
+      dualHeatmap(heatData = t(norm), main="Positive Control genes",
106
+                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=poscon,
107
+                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
108
+                  annColor=list(Batch=cc, Type=cc))
109
+    })
110
+    
111
+    output$heatmap_nc <- renderPlot({
112
+      dualHeatmap(heatData = t(norm), main="Negative Control genes",
113
+                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=negcon,
114
+                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
115
+                  annColor=list(Batch=cc, Type=cc))
116
+    })
117
+
118
+    output$heatmap_mv <- renderPlot({
119
+      if(input$variance=="variance") {
120
+        genes <- mv_var
121
+      } else {
122
+        genes <- mv_cv
123
+      } 
124
+      
125
+      dualHeatmap(heatData = t(norm), main="Top 100 most variable genes",
126
+                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=genes,
127
+                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
128
+                  annColor=list(Batch=cc, Type=cc))
129
+    })
130
+
131
+    output$heatmap_pc1 <- renderPlot({
132
+      genes <- names(sort(pca$rotation[,1], decreasing = TRUE)[1:100])
133
+      dualHeatmap(heatData = t(norm), main="Top 100 genes by PC1 loadings",
134
+                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=genes,
135
+                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
136
+                  annColor=list(Batch=cc, Type=cc))
137
+    })
138
+    
139
+    output$box_pc1_pc <- renderPlot({
140
+      boxplot(tapply(pca_pc$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of Positive Controls")
141
+    })
142
+
143
+    output$box_pc1_nc <- renderPlot({
144
+      boxplot(tapply(pca_nc$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of Negative Controls")
145
+    })
146
+    
147
+    output$box_pc1_qc <- renderPlot({
148
+      boxplot(tapply(qc_pca$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of quality scores")
149
+    })
150
+    
151
+    output$curve_dropout <- renderPlot({
152
+      plot(logistic(fitted[order(fnr$Alpha[1,]),1])~sort(fnr$Alpha[1,]), type='l', ylim=c(0, 1), col=rgb(0, 0, 0, 0.2), xlab="log average expression", ylab="Probability of drop-out", main="Drop-out Curves")
153
+      for(i in 2:ncol(fitted)) {
154
+        lines(logistic(fitted[order(fnr$Alpha[1,]),i])~sort(fnr$Alpha[1,]), type='l', col=rgb(0, 0, 0, 0.2))
155
+      }
156
+      lines(logistic(rowMeans(fitted)[order(fnr$Alpha[1,])])~sort(fnr$Alpha[1,]), col=2, lwd=2)
157
+    })
158
+        
159
+    output$box_detection <- renderPlot({
160
+      boxplot(tapply(colSums(norm>5), get(input$color_code), function(x) x), las=2, col=cc, ylab="Number of detected genes (> 5 reads)", main="Detected Genes")
161
+    })
162
+
163
+    output$box_dropout <- renderPlot({
164
+      boxplot(tapply(colMeans(1-w), get(input$color_code), function(x) x), las=2, col=cc, ylab="Drop-out rate", main="Drop-out Rate")
165
+    })
166
+    
167
+  }
168
+
169
+ui <- fluidPage(
170
+  titlePanel("In-depth Normalization Report"),
171
+  
172
+  sidebarLayout(
173
+    sidebarPanel(
174
+      selectInput("color_code", label = "Color code / stratify by",
175
+                  choices = list("Batch"="batch", "Cell Type"="type"),
176
+                  selected = "batch"),
177
+      selectInput("variance", label = "Select most variable genes by",
178
+                  choices = list("Variance"="variance", "Coefficient of variation"="cv"),
179
+                  selected = "cv"),
180
+      sliderInput("k", label = "Number of clusters",
181
+                  min=2, max=10,
182
+                  value = 7),
183
+      checkboxInput("downsample", label="Downsample to 100 cells for boxplot panel", value=TRUE),
184
+      helpText("RLE: relative log expression."),
185
+      helpText("PCA: principal component analysis."),
186
+      helpText("QC: quality control."),
187
+      helpText("The 'Boxplot' panel shows the read count distribution and the RLE plot."),
188
+      helpText("The 'PCA' panel shows the PCA of different subset of genes."),
189
+      helpText("The 'QC' panel shows the absolute correlation between the first 10 principal components of the expression matrix and the QC measures. It also shows the PCA of the QC measures. Note that the PCA score signs are not meaningful (e.g., 'good' cells could have a low value of PC1 and 'bad' cells a high value)."),
190
+      helpText("The 'Cluster stability' panel shows a heatmap of the PAM co-clustering based on 100 subsamples of the data. It shows the relation between the clusters obtained by PAM (on the original data) to cell type, batch, expression PCA, and QC PCA. Note that these results are reported for normalization evaluation only. A proper cluster analysis should be performed to find sub-populations in the data."),
191
+      helpText("The 'Heatmaps' panel shows the log expression of the positive controls, negative controls, genes with highest PC1 loadings, and most variable genes."),
192
+      helpText("The 'Confounding' panel shows the first PC of positive controls, negative controls, and QC stratified by either cell type or batch. This is useful to identify lower quality batches and potential confounding between quality and cell type."),
193
+      helpText("The 'Drop-out' panel shows the number of detected genes and the drop-out rate stratified by either cell type or batch. It also shows the probability of drop-out as a function of the average expression"),
194
+      helpText("The 'Number of clusters' slider controls the PAM clustering in the 'Cluster stability' tab."),
195
+      helpText("Please note that with many cells (>500), some plots may take several seconds (sometimes minutes) to appear."),
196
+      helpText("By default, we downsample the data to 100 cells for the boxplot visualization (only for the 'Boxplot' tab). Please uncheck it to see the full dataset (it may take some time).")
197
+    ),
198
+    
199
+    mainPanel(
200
+      tabsetPanel(type = "tabs",
201
+                  ## Add a "summary" tab (wait for Michael's text)
202
+                  tabPanel("Boxplot",
203
+                           plotOutput("boxplot"),
204
+                           plotOutput("rle")),
205
+                  tabPanel("PCA",
206
+                           plotOutput("pca"),
207
+                           plotOutput("pca_pc"),
208
+                           plotOutput("pca_nc"),
209
+                           plotOutput("pca_mv")),
210
+                  tabPanel("QC",
211
+                           plotOutput("qc_plot"),
212
+                           tableOutput("qc_corr"),
213
+                           plotOutput("qc_pca")),
214
+                  tabPanel("Cluster stability",
215
+                           plotOutput("pam_cc"),
216
+                           fluidRow(
217
+                             column(3,
218
+                                    h3("Type"),
219
+                                    br(),
220
+                                    tableOutput("pam_type")),
221
+                             column(3,
222
+                                    h3("Batch"),
223
+                                    br(),
224
+                                    tableOutput("pam_batch"))),
225
+                           plotOutput("pam_pca"),
226
+                           plotOutput("pam_qc")),
227
+                  tabPanel("Heatmaps",
228
+                           plotOutput("heatmap_pc"),
229
+                           plotOutput("heatmap_nc"),
230
+                           plotOutput("heatmap_pc1"),
231
+                           plotOutput("heatmap_mv")),
232
+                  tabPanel("Confounding",
233
+                           plotOutput("box_pc1_pc"),
234
+                           plotOutput("box_pc1_nc"),
235
+                           plotOutput("box_pc1_qc")),
236
+                  tabPanel("Drop-out",
237
+                           plotOutput("box_detection"),
238
+                           plotOutput("box_dropout"),
239
+                           plotOutput("curve_dropout"))
240
+                  )
241
+                  
242
+    )))
243
+
244
+shinyApp(ui = ui, server = server)
0 245
new file mode 100644
... ...
@@ -0,0 +1,93 @@
1
+Lik_Fun = function(Z,X,Beta){
2
+  return((1-Z) + (2*Z - 1)/(1+exp(-X %*% Beta)))
3
+}
4
+
5
+Post_Z = function(Y,W,Alpha,X,Beta){
6
+  return(Lik_Fun(Y,W,Alpha)*Lik_Fun(1,X,Beta)/(Lik_Fun(Y,W,Alpha)*Lik_Fun(1,X,Beta) + (Y == 0)*Lik_Fun(0,X,Beta)))
7
+}
8
+
9
+estimateFNR = function(x, bulk_model = FALSE, is.control = TRUE, K = 0, X = NULL, Alpha = NULL,
10
+                       is.expressed = NULL, niter = 100, ntresh = 0){
11
+  
12
+  require(matrixStats)
13
+  Y = t(matrix(as.numeric(x > ntresh ),nrow = dim(x)[1]))
14
+  
15
+  if(bulk_model){
16
+    x2 = x
17
+    x2[x2 <= ntresh] = NA
18
+    Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE))))
19
+  }
20
+  
21
+  # Prepare Wanted Variation and Gene-Intrinsic Intercept
22
+  X = cbind(X,rep(1,dim(x)[2]))
23
+  is.intercept = dim(X)[2] == 1
24
+  Beta = matrix(0,dim(X)[2],dim(x)[1])
25
+  
26
+  # Prepare Unwanted Variation (K factors) and Sample-Intrinsic Intercept
27
+  if( K == 0 || !is.null(Alpha) ){
28
+    Alpha = rbind(Alpha,rep(1,dim(x)[1]))
29
+    K = dim(Alpha)[1] - 1
30
+  }else{
31
+    M = 2*Y[is.control,] - 1
32
+    M = t(t(M - rowMeans(M)) - colMeans(M)) + mean(M)
33
+    Alpha = rbind(t(svd(M,nv = K)$v[,1:K]),rep(1,dim(x)[1]))
34
+  }
35
+  W = matrix(0,dim(x)[2],K+1)
36
+  
37
+  # Initialize Z
38
+  Z = Post_Z(Y,W,Alpha,X,Beta)
39
+  
40
+  # If control genes are supplied, W fit once, according to control genes
41
+  if(!is.null(is.expressed)){
42
+    cY = Y[,is.expressed]
43
+    cAlpha = Alpha[,is.expressed]
44
+    for (i in 1:dim(Z)[1]){
45
+      W[i,] = glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),quasibinomial)$coefficients
46
+    }   
47
+  }
48
+  
49
+  ## DR: it seems strange to me that this is a for loop and not a while !converged
50
+  for(n in 1:niter){    
51
+    # Update Beta
52
+    if(!is.intercept){
53
+      
54
+      if(is.null(is.expressed)){
55
+        for(i in 1:dim(Z)[2]){
56
+          Beta[,i] = glm(cbind(Z[,i],1-Z[,i]) ~ 0 + X,quasibinomial)$coefficients
57
+        }
58
+        
59
+      }else{ # Only over non-control genes
60
+        for(i in (1:dim(Z)[2])[!is.expressed]){
61
+          Beta[,i] = glm(cbind(Z[,i],1-Z[,i]) ~ 0 + X,quasibinomial)$coefficients
62
+        }
63
+      }
64
+      
65
+      
66
+    }else{
67
+      # Short-cut!
68
+      Beta = t(matrix(log(colMeans(Z)/(1-colMeans(Z)))))
69
+    }
70
+    
71
+    # Update Z
72
+    Z = Post_Z(Y,W,Alpha,X,Beta)
73
+  
74
+    if(is.null(is.expressed)){
75
+      # Update W
76
+      for (i in 1:dim(Z)[1]){
77
+        W[i,] = glm(cbind(Y[i,],(1-Y[i,])*Z[i,]) ~ 0 + t(Alpha),quasibinomial)$coefficients
78
+      }
79
+    
80
+      # Update Z
81
+      Z = Post_Z(Y,W,Alpha,X,Beta)
82
+    }
83
+  }
84
+  
85
+  # Stick to our initial assumptions
86
+  if(!is.null(is.expressed)){
87
+    Beta[,is.expressed] = NA
88
+    Z[,is.expressed] = 1
89
+  }
90
+  EL = sum(na.omit(as.vector(Z*log(Lik_Fun(Y,W,Alpha))))) + sum(na.omit(as.vector(Z*log(Lik_Fun(1,X,Beta))))) + sum(na.omit(as.vector((1-Z)*log(Lik_Fun(0,X,Beta)))))
91
+  
92
+  return(list(Z = t(Z), EL = EL, P = t(Lik_Fun(1,W,Alpha)), W = W, X = X, Alpha = Alpha, Beta = Beta))
93
+}
0 94
\ No newline at end of file
1 95
new file mode 100644
... ...
@@ -0,0 +1,142 @@
1
+#' Internal Function
2
+#' 
3
+#' This function is used internally in scone to parse the variables used to generate the design matrices.
4
+#' 
5
+#'  
6
+#' @param pars character. A vector of parameters corresponding to a row of params.
7
+#' @param bio factor. The biological factor of interest.
8
+#' @param batch factor. The known batch effects.
9
+#' @param ruv_factors list. A list containing the factors of unwanted variation.
10
+#' @param qc matrix. The principal components of the QC metrics.
11
+#' 
12
+#' @return A list with the variables to be passed to make_design.
13
+parse_row <- function(pars, bio, batch, ruv_factors, qc) {
14
+  sc_name <- paste(pars[1:2], collapse="_")
15
+  
16
+  W <- out_bio <- out_batch <- NULL
17
+  
18
+  if(pars[3]!="no_uv") {
19
+    parsed <- strsplit(as.character(pars[3]), "=")[[1]]
20
+    if(grepl("ruv", parsed[1])) {
21
+      W <- ruv_factors[[sc_name]][,1:as.numeric(parsed[2])]
22
+    } else {
23
+      W <- qc[,1:as.numeric(parsed[2])]
24
+    }