Browse code

Moved files for package building

Davide Risso authored on 07/02/2016 06:34:07
Showing 23 changed files

... ...
@@ -1,4 +1,4 @@
1
-Package: SCONE
1
+Package: scone
2 2
 Version: 0.0.1
3 3
 Title: Single Cell Overview of Normalized Expression data
4 4
 Description: SCONE.
... ...
@@ -8,7 +8,9 @@ Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com",
8 8
 	          role = c("aut")))
9 9
 Author: Michael Cole [aut, cre, cph] and Davide Risso [aut]
10 10
 Maintainer: Michael Cole <mbeloc@gmail.com>
11
-Date: 01-22-2016
11
+Imports: BiocParallel, DESeq, EDASeq, MASS, RUVSeq, aroma.light, class,
12
+	 edgeR, fpc, limma, matrixStats, scde
13
+Date: 02-06-2016
12 14
 License: Artistic-2.0
13 15
 LazyLoad: yes
14 16
 RoxygenNote: 5.0.1
... ...
@@ -0,0 +1,27 @@
1
+# Generated by roxygen2: do not edit by hand
2
+
3
+export(DESEQ_FN)
4
+export(DESEQ_FN_POS)
5
+export(FQ_FN)
6
+export(FQ_FN_POS)
7
+export(TMM_FN)
8
+export(UQ_FN)
9
+export(UQ_FN_POS)
10
+export(estimate_zinb)
11
+export(impute_zinb)
12
+export(lm_adjust)
13
+export(make_design)
14
+export(scone)
15
+export(score_matrix)
16
+import(BiocParallel)
17
+importFrom(DESeq,estimateSizeFactorsForMatrix)
18
+importFrom(EDASeq,betweenLaneNormalization)
19
+importFrom(MASS,glm.nb)
20
+importFrom(RUVSeq,RUVg)
21
+importFrom(aroma.light,normalizeQuantileRank.matrix)
22
+importFrom(class,knn)
23
+importFrom(edgeR,calcNormFactors)
24
+importFrom(fpc,pamk)
25
+importFrom(limma,lmFit)
26
+importFrom(matrixStats,rowMedians)
27
+importFrom(scde,bwpca)
0 28
deleted file mode 100644
... ...
@@ -1,532 +0,0 @@
1
-## DR: if I understand correctly "nom" is used only for the name of the file to store the data.
2
-## There should be a better way to do this, using the name of the function (the opposite of get()?).
3
-
4
-source("/data/yosef/users/daviderisso/YosefCode/packages/RNASeq/OFBIT/SCONE/estimateFNR.R")
5
-source("/data/yosef/users/daviderisso/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE_DEFAULTS.R")
6
-library(BiocParallel,quietly = TRUE)
7
-library(cluster,quietly = TRUE)
8
-library(fpc,quietly = TRUE)
9
-
10
-runAdjustTask = function(task,evaluate_material,design,nested_model, to_store = FALSE){
11
-  
12
-#???  source("~/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE.R") # library(SCONE)
13
-  
14
-  result = try({
15
-    
16
-    if(file.exists(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))){
17
-      if(to_store){
18
-        load(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))
19
-        score_out = scone_out$evaluation
20
-      
21
-        # Return values
22
-        if(!is.na(score_out)[1]){
23
-          scores = score_out$scores
24
-        }else{
25
-          scores = NA
26
-        }
27
-  
28
-        print(task$nom)
29
-        return(list(scores = scores))
30
-      }else{
31
-        print(task$nom)
32
-        return()
33
-      }
34
-    
35
-    }else{
36
-    
37
-      norm_lout = ADJUST_FN(task$ei,batch = task$batch,
38
-                            bio = task$condition,
39
-                            uv = task$uv,
40
-                            w = task$w,
41
-                            design = design, 
42
-                            nested_model = nested_model)
43
-      norm_out = exp(norm_lout) - 1
44
-      score_out = scoreMethod(e = norm_out ,condition = evaluate_material$condition, batch = evaluate_material$batch,
45
-                            qual_scores = evaluate_material$qual_scores, HK_scores = evaluate_material$HK_scores, 
46
-                            DE_scores = evaluate_material$DE_scores, CC_scores = evaluate_material$CC_scores,
47
-                            dim_eval = evaluate_material$dim_eval, K_clust = evaluate_material$K_clust, K_NN = evaluate_material$K_NN,
48
-                            fnr_pi = evaluate_material$fnr_pi, nom = task$nom)
49
-    
50
-      scone_out = list(exp_mat = norm_out,evaluation = score_out,method = task$nom)
51
-      save(scone_out,file = paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) 
52
-      if(to_store){
53
-      
54
-        # Return values
55
-        if(!is.na(score_out)[1]){
56
-          scores = score_out$scores
57
-        }else{
58
-          scores = NA
59
-        }
60
-    
61
-        print(task$nom)
62
-        return(list(scores = scores))
63
-      }else{
64
-        print(task$nom)
65
-        return()
66
-      }
67
-    
68
-    }
69
-  
70
-  })
71
-  
72
-  return(NA)
73
-  
74
-}
75
-
76
-runFactorFreeTask = function(task,evaluate_material,emat){
77
-  
78
-  ## ??? source("~/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE.R") # library(SCONE)
79
-  
80
-  result = try({
81
-    if(file.exists(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))){
82
-      load(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))
83
-      norm_out = scone_out$exp_mat
84
-      score_out = scone_out$evaluation
85
-    
86
-      # Return values
87
-      if(!is.na(score_out)[1]){
88
-        scores = score_out$scores
89
-      }else{
90
-        scores = NA
91
-      }
92
-    
93
-      print(task$nom)
94
-      return(list(eo = norm_out, scores = scores))
95
-    
96
-      }else{
97
-        norm_out = task$FN(emat)
98
-
99
-        score_out = scoreMethod(e = norm_out ,condition = evaluate_material$condition, batch = evaluate_material$batch,
100
-                          qual_scores = evaluate_material$qual_scores, HK_scores = evaluate_material$HK_scores, 
101
-                          DE_scores = evaluate_material$DE_scores, CC_scores = evaluate_material$CC_scores,
102
-                          dim_eval = evaluate_material$dim_eval, K_clust = evaluate_material$K_clust, K_NN = evaluate_material$K_NN,
103
-                          fnr_pi = evaluate_material$fnr_pi, nom = task$nom)
104
-  
105
-      scone_out = list(exp_mat = norm_out,evaluation = score_out,method = task$nom)
106
-      save(scone_out,file = paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) 
107
-  
108
-      # Return values
109
-      if(!is.na(score_out)[1]){
110
-        scores = score_out$scores
111
-      }else{
112
-        scores = NA
113
-      }
114
-  
115
-      print(task$nom)
116
-      return(list(eo = norm_out, scores = scores))
117
-    }
118
-  
119
-  })
120
-  
121
-  return(NA)
122
-  
123
-}
124
-
125
-scoreMethod = function(e,condition = NULL, batch = NULL, 
126
-                       qual_scores = NULL, HK_scores = NULL,
127
-                       DE_scores = NULL, CC_scores = NULL,
128
-                       dim_eval = NULL, K_clust = NULL, K_NN = NULL,
129
-                       fnr_pi = NULL, nom = NULL){
130
-  if(any(is.na(e) | is.infinite(e) | is.nan(e))){
131
-    warning("NA/Inf/NaN Expression Values. Returning NA.")
132
-    return(NA)
133
-  }
134
-  # Project the data: PCA or wPCA
135
-  if(grepl("^IMPUTE",nom)){
136
-    pc_val = prcomp(t(log(e + 1)),center = TRUE,scale. = TRUE)$x[,1:dim_eval]
137
-  }else{
138
-    pc_val = wPCA(log(e + 1),1 - fnr_pi,nu = dim_eval,filt = TRUE)$x
139
-  }
140
-  
141
-  # Max cor with quality scores
142
-  EXP_QPC_COR = max(cor(pc_val,qual_scores,method = "spearman")^2)
143
-  
144
-  # Max cor with housekeeping scores
145
-  if(!is.null(HK_scores)){
146
-    EXP_HK_COR = max(cor(pc_val,HK_scores,method = "spearman")^2)
147
-  }else{
148
-    EXP_HK_COR = NA
149
-  }
150
-  
151
-  # Max cor with differentially expressed scores
152
-  if(!is.null(DE_scores)){
153
-    EXP_DE_COR = max(cor(pc_val,DE_scores,method = "spearman")^2)
154
-  }else{
155
-    EXP_DE_COR = NA
156
-  }
157
-  
158
-  # Max cor with cell cycle scores
159
-  if(!is.null(CC_scores)){
160
-    EXP_CC_COR = max(cor(pc_val,CC_scores,method = "spearman")^2)
161
-  }else{
162
-    EXP_CC_COR = NA
163
-  }
164
-  
165
-  d = as.matrix(dist(pc_val,method = "euclidean"))
166
-  
167
-  # K-NN Condition
168
-  if(!is.null(condition)){
169
-    m <- sapply(seq_len(nrow(d)), function(s) {
170
-      ds <- d[,s]
171
-      ra <- rank(ds)
172
-      return(mean(condition[(ra <= K_NN+1) & (ra != 1)] == condition[s]))
173
-    })
174
-    KNN_BIO = mean(m)
175
-  }else{
176
-    KNN_BIO = NA
177
-  }
178
-  
179
-  # K-NN Batch
180
-  if(!is.null(batch)){
181
-    m <- sapply(seq_len(nrow(d)), function(s) {
182
-      ds <- d[,s]
183
-      ra <- rank(ds)
184
-      return(mean(batch[(ra <= K_NN+1) & (ra != 1)] == batch[s]))
185
-    })
186
-    KNN_BATCH = mean(m)
187
-  }else{
188
-    KNN_BATCH = NA
189
-  }
190
-  
191
-  # PAM Silh
192
-  if(K_clust > 1){
193
-    pam_object = pam(pc_val,k = K_clust)
194
-    PAM_SIL = pam_object$silinfo$avg.width
195
-    clusters = pam_object$clustering
196
-  }else{
197
-    PAM_SIL = NA
198
-    clusters = NA
199
-  }
200
-  scores = c(EXP_QPC_COR,EXP_HK_COR,EXP_DE_COR,EXP_CC_COR,KNN_BIO,KNN_BATCH,PAM_SIL)
201
-  names(scores) = c("EXP_QPC_COR","EXP_HK_COR","EXP_DE_COR","EXP_CC_COR","KNN_BIO","KNN_BATCH","PAM_SIL")
202
-  return(list(scores = scores, pc_val = pc_val, clusters = clusters))
203
-}
204
-
205
-## ===== SCONE: Single-Cell Overview of Normalized Expression data =====
206
-# Intermediate wrapper is missing"
207
-# Decide model for the user - describe reason -"
208
-# One batch one condition - user friendly 
209
-# Check rank of design matrix - if nested - we're good - IFF?
210
-# Positive (evaluate) and negative controls (modeled + evaluate)
211
-# show correlation between q and hk - why we don't try all 
212
-# Default number of dim_UV
213
-# Factor free norms and adjust norms as lists = impute or not, positive data or not, only q , only hk
214
-# function 1, function 2, and k
215
-SCONE = function(e,condition = NULL, batch = NULL, 
216
-                 design = c("factorial","nested"), 
217
-                 nested_model = c("fixed","random"),
218
-                 qual = NULL, is_HK = NULL,
219
-                 is_DE = NULL,is_CC = NULL,
220
-                 dim_UV = 5,
221
-                 dim_eval = 3, K_clust = NULL, K_NN = 10, 
222
-                 out_dir = NULL,
223
-                 K_clust_MAX = 10,K_pseudobatch_MAX = 10,
224
-                 factor_free_only = FALSE, to_store = FALSE, n_workers = 10){
225
-  param = MulticoreParam(workers = n_workers)
226
-  ## ----- Set up output directory ----
227
-  if (file.exists(out_dir)){
228
-    #stop("Designated output directory already exists.")
229
-  } else {
230
-    dir.create(out_dir)
231
-  }
232
-  
233
-  ## ----- Imputation Step ----
234
-  print("Imputation Step...")
235
-  HK_default = FALSE
236
-  if(is.null(is_HK)){
237
-    warning("No control genes have been specified, using all genes for FNR estimation.")
238
-    HK_default = TRUE
239
-    is_HK = rep(TRUE ,nrow(e))
240
-  }
241
-  if(file.exists(paste0(out_dir,"/fnr_out.Rdata"))){
242
-    load(paste0(out_dir,"/fnr_out.Rdata"))
243
-  }else{
244
-    fnr_out = estimateFNR(e, bulk_model = TRUE, is.expressed = is_HK)
245
-    save(fnr_out,file = paste0(out_dir,"/fnr_out.Rdata"))
246
-  }
247
-  fnr_mu = exp(fnr_out$Alpha[1,])
248
-  fnr_pi = (e == 0) * fnr_out$Z 
249
-  imputed_e = e + (fnr_mu * fnr_pi)
250
-  print("Complete.")
251
-  
252
-  ## ----- Generate Unwanted Factors ----
253
-  print("Generating Unwanted Factors for Evaluation...")
254
-  if(file.exists(paste0(out_dir,"/evaluate_material.Rdata"))){
255
-    
256
-    load(paste0(out_dir,"/evaluate_material.Rdata"))
257
-    
258
-  }else{
259
-    
260
-  # Factors of alignment quality metrics
261
-  if(is.null(qual)){
262
-    warning("No quality metrics specified. Total counts and efficiency used.")
263
-    numba = colSums(e == 0)
264
-    beta = colMeans(e == 0)
265
-    qual = cbind(numba, log(beta/(1-beta)))
266
-  }
267
-  
268
-  if(is.null(dim_UV)){
269
-    dim_UV = min(ncol(qual), sum(is_HK))
270
-    warning(paste("Number of unwanted factors not specified. Selecting minimum of number of quality features and number of control genes:",dim_UV))
271
-    if(dim_UV >= ncol(e)){
272
-      stop("Number of unwanted factors >= number of samples!")
273
-    }
274
-  }
275
-  
276
-  qual_scores = prcomp(qual, center = TRUE, scale = TRUE)$x[,1:dim_UV]
277
-  
278
-  if(!HK_default){
279
-    HK_scores = wPCA(log(e[is_HK,] + 1), 1 - fnr_pi[is_HK,], nu = dim_UV, filt = TRUE)$x
280
-  }else{
281
-    HK_scores = NULL
282
-  }
283
-  
284
-  if(!is.null(is_DE)){
285
-    DE_scores = wPCA(log(e[is_DE,] + 1), 1 - fnr_pi[is_DE,], nu = dim_eval, filt = TRUE)$x
286
-  }else{
287
-    DE_scores = NULL
288
-  }
289
-  
290
-  if(!is.null(is_CC)){
291
-    CC_scores = wPCA(log(e[is_CC,] + 1), 1 - fnr_pi[is_CC,], nu = dim_eval, filt = TRUE)$x
292
-  }else{
293
-    CC_scores = NULL
294
-  }
295
-  
296
-  ## ----- Evaluation Criteria ----
297
-  if(is.null(K_clust)){
298
-    if(is.null(DE_scores)){
299
-      warning("No clustering K specified. Selecting K using PAM average silhouette width on raw wPCA.")
300
-      pc_val = wPCA(log(e + 1), 1 - fnr_pi, nu = dim_eval, filt = TRUE)$x
301
-    }else{
302
-      warning("No clustering K specified. Selecting K using PAM average silhouette width on DE scores.")
303
-      pc_val = DE_scores
304
-    }
305
-    
306
-    pamk.best <- pamk(data = pc_val, krange = 1:K_clust_MAX)
307
-    K_clust = pamk.best$nc
308
-  }
309
-  
310
-  data_dir = paste0(out_dir,"/data/")
311
-  if (file.exists(data_dir)){
312
-    #stop("Designated data directory already exists.")
313
-  } else {
314
-    dir.create(data_dir)
315
-  }
316
-  
317
-  evaluate_material = list(condition = condition, batch = batch, 
318
-                           qual_scores = qual_scores, HK_scores = HK_scores,
319
-                           DE_scores = DE_scores, CC_scores = CC_scores,
320
-                           dim_eval = dim_eval, K_clust = K_clust, K_NN = K_NN,
321
-                           fnr_pi = fnr_pi, out_dir = data_dir)
322
-  
323
-  save(evaluate_material,file = paste0(out_dir,"/evaluate_material.Rdata"))
324
-  
325
-  }
326
-  
327
-  print("Complete.")
328
-  
329
-  
330
-  ## ----- Factor-Free Normalization
331
-  ## DR: major change: the current code will copy over and over the same matrices (e and imputed_e).
332
-  ## There's no need to do so as we can infer what matrix to use from the name.
333
-  
334
-  task_list = list()
335
-  
336
-  # Generate Task List for Non-imputed
337
-  task_names = paste("NOIMPUTE",c("NONE","UQ","UQP","DESEQ","DESEQP","FQ","FQP","TMM"),sep = "_")
338
-  
339
-  ##DR: why not TMM__FN_POS?
340
-  task_FN = c(identity,UQ_FN,UQ_FN_POS,DESEQ_FN,DESEQ_FN_POS,FQ_FN,FQ_FN_POS,TMM_FN)
341
-  names(task_FN) = task_names
342
-  for (task in task_names){
343
-    task_list[[task]] = list(FN = task_FN[[task]], nom = task)
344
-  }
345
-  # Generate Task List for Imputed
346
-  task_names = paste("IMPUTE",c("NONE","UQ","DESEQ","FQ","TMM"),sep = "_")
347
-  task_FN = c(identity,UQ_FN,DESEQ_FN,FQ_FN,TMM_FN)
348
-  names(task_FN) = task_names
349
-  for (task in task_names){
350
-    task_list[[task]] = list(FN = task_FN[[task]], nom = task)
351
-  }
352
-  
353
-  # Parallel Normalization + Evaluation
354
-  print("Factor-Free Normalization and Evaluation...")
355
-  factor_free_out_noimpute = bplapply(task_list[grep("^NOIMPUTE", names(task_list))], FUN = runFactorFreeTask, evaluate_material = evaluate_material, emat=e, BPPARAM = param)
356
-  factor_free_out_impute = bplapply(task_list[grep("^IMPUTE", names(task_list))], FUN = runFactorFreeTask, evaluate_material = evaluate_material, emat=imputed_e, BPPARAM = param)
357
-  factor_free_out <- c(factor_free_out_noimpute, factor_free_out_impute)
358
-  print("Complete.")
359
-  
360
-  ## ----- Selection Step (Optional)
361
-  if(factor_free_only){
362
-    return(list(factor_free_out = factor_free_out, factor_based_out = NA))
363
-  }
364
-  ## ----- Factor-Based Normalization
365
-  
366
-  print("Selecting Base Tasks...")
367
-  base_task_list <- lapply(1:length(factor_free_out), function(i) {
368
-    if(!any(is.na(factor_free_out[[i]]$eo) | is.nan(factor_free_out[[i]]$eo) | is.infinite(factor_free_out[[i]]$eo))) {
369
-      return(list(ei = factor_free_out[[i]]$eo, nom = names(factor_free_out)[i]))
370
-    }
371
-  })
372
-  names(base_task_list) <- names(factor_free_out)
373
-  base_task_list <- base_task_list[!sapply(base_task_list, is.null)]
374
-  print("Complete.")
375
-  
376
-  
377
-  # WEIGHT
378
-  print("Extending Tasks by Weighting Scheme...")
379
-  ext_task_list = list()
380
-  for( task in names(base_task_list)){
381
-    base_nom = task
382
-    if(!grepl("^IMPUTE",task)){
383
-      ext_nom = paste(base_nom,"ZEROWEIGHT",sep = "_")
384
-      ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
385
-      ext_task_list[[ext_nom]]$nom = ext_nom 
386
-      ext_task_list[[ext_nom]]$w = 0 + (ext_task_list[[ext_nom]]$ei > 0)
387
-    }
388
-    ext_nom = paste(base_nom,"NOWEIGHT",sep = "_")
389
-    ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
390
-    ext_task_list[[ext_nom]]$nom = ext_nom 
391
-    ext_task_list[[ext_nom]]$w = NULL
392
-  }
393
-  print("Complete.")
394
-  
395
-  base_task_list = ext_task_list
396
-  
397
-  # Generate Method-Specific Factors (Should be parallel)
398
-  print("Computing Method-Specific Factors...")
399
-  
400
-  task_factor_list = list()  
401
-  for( task in names(base_task_list)){
402
-    
403
-    # HK
404
-    if(!HK_default){
405
-      if(!is.null(base_task_list[[task]]$w)){
406
-        task_factor_list[[task]]$HK_scores = wPCA(x = log(base_task_list[[task]]$ei[is_HK,]+1),w = base_task_list[[task]]$w[is_HK,],nu = dim_UV,filt = TRUE)$x
407
-      }else{
408
-        task_factor_list[[task]]$HK_scores = prcomp(t(log(base_task_list[[task]]$ei[is_HK,]+1)))$x[,1:dim_UV]
409
-      }
410
-      
411
-      if(is.null(K_pseudobatch)){
412
-        pamk_object = pamk(task_factor_list[[task]]$HK_scores,krange = 1:K_pseudobatch_MAX)
413
-        task_factor_list[[task]]$hk_nc = pamk_object$nc
414
-        task_factor_list[[task]]$pam_clustering =  pamk_object$pamobject$clustering
415
-      }else{
416
-        task_factor_list[[task]]$hk_nc = K_pseudobatch
417
-        task_factor_list[[task]]$pam_clustering = pam(task_factor_list[[task]]$HK_scores,k = K_pseudobatch)$clustering
418
-      }
419
-    }else{
420
-      task_factor_list[[task]]$HK_scores = NULL
421
-      task_factor_list[[task]]$hk_nc = NULL
422
-      task_factor_list[[task]]$pam_clustering = NULL
423
-    }
424
-  }
425
-  print("Complete.")
426
-  
427
-  # Generate General Factor Clusterings
428
-  print("Generating General Factor-Based Clusters...")
429
-  if(is.null(K_pseudobatch)){
430
-    pamk_object = pamk(evaluate_material$qual_scores,krange = 1:K_pseudobatch_MAX)
431
-    qual_nc = pamk_object$nc
432
-    qual_clustering =  as.factor(pamk_object$pamobject$clustering)
433
-  }else{
434
-    qual_nc = K_pseudobatch
435
-    qual_clustering = as.factor(pam(evaluate_material$qual_scores,k = K_pseudobatch)$clustering)
436
-  }
437
-  print("Complete.")
438
-  
439
-  # BIO
440
-  print("Extending Tasks by Biological Covariate...")
441
-  ext_task_list = list()
442
-  for( task in names(base_task_list)){
443
-    base_nom = task
444
-    
445
-    if(!is.null(condition)){
446
-      ext_nom = paste(base_nom,"BIO",sep = "_")
447
-      ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
448
-      ext_task_list[[ext_nom]]$nom = ext_nom 
449
-      ext_task_list[[ext_nom]]$condition = condition
450
-    }
451
-    ext_nom = paste(base_nom,"NOBIO",sep = "_")
452
-    ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
453
-    ext_task_list[[ext_nom]]$nom = ext_nom 
454
-    ext_task_list[[ext_nom]]$condition = NULL
455
-  }
456
-  print("Complete.")
457
-  
458
-  print("Extending Tasks by Batch Covariate...")
459
-  base_task_list = ext_task_list
460
-  # BATCH
461
-  ext_task_list = list()
462
-  for( task in names(base_task_list)){
463
-    base_nom = task
464
-    
465
-    if(!is.null(batch)){
466
-      ext_nom = paste(base_nom,"BATCH",sep = "_")
467
-      ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
468
-      ext_task_list[[ext_nom]]$nom = ext_nom 
469
-      ext_task_list[[ext_nom]]$batch = batch
470
-    }
471
-    ext_nom = paste(base_nom,"NOBATCH",sep = "_")
472
-    ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
473
-    ext_task_list[[ext_nom]]$nom = ext_nom 
474
-    ext_task_list[[ext_nom]]$batch = NULL
475
-  }
476
-  print("Complete.")
477
-  
478
-  print("Extending Tasks by UV Covariates...")
479
-  base_task_list = ext_task_list  
480
-  # UV
481
-  ext_task_list = list()
482
-  for( task in names(base_task_list)){
483
-    base_nom = task
484
-    method_class = gsub("WEIGHT_.*","WEIGHT",task)
485
-    
486
-    ext_nom = paste(base_nom,"NOUV",sep = "_")
487
-    ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
488
-    ext_task_list[[ext_nom]]$nom = ext_nom 
489
-    ext_task_list[[ext_nom]]$uv = NULL
490
-    
491
-    # HK UV
492
-    if(!HK_default){
493
-      for(j in 1:dim_UV){
494
-        ext_nom = paste(base_nom,paste("HK",j,sep = "_"),sep = "_")
495
-        ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
496
-        ext_task_list[[ext_nom]]$nom = ext_nom 
497
-        ext_task_list[[ext_nom]]$uv = as.matrix(task_factor_list[[method_class]]$HK_scores[,1:j])
498
-      }  
499
-      #       if(task_factor_list[[method_class]]$hk_nc > 1){
500
-      #         ext_nom = paste(base_nom,"HKCLUST",sep = "_")
501
-      #         ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
502
-      #         ext_task_list[[ext_nom]]$nom = ext_nom 
503
-      #         ext_task_list[[ext_nom]]$uv = as.matrix(as.factor(task_factor_list[[method_class]]$pam_clustering))
504
-      #       }
505
-    }
506
-    # Qual UV
507
-    for(j in 1:dim_UV){
508
-      ext_nom = paste(base_nom,paste("Q",j,sep = "_"),sep = "_")
509
-      ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
510
-      ext_task_list[[ext_nom]]$nom = ext_nom 
511
-      ext_task_list[[ext_nom]]$uv = as.matrix(evaluate_material$qual_scores[,1:j])
512
-    }  
513
-    #     if(qual_nc > 1){
514
-    #       ext_nom = paste(base_nom,"QCLUST",sep = "_")
515
-    #       ext_task_list[[ext_nom]] = base_task_list[[base_nom]]
516
-    #       ext_task_list[[ext_nom]]$nom = ext_nom 
517
-    #       ext_task_list[[ext_nom]]$uv = as.matrix(as.factor(qual_clustering))
518
-    #     }
519
-  }
520
-  print("Complete.")
521
-  ext_task_list = ext_task_list[!grepl("NOBATCH_NOUV",names(ext_task_list))]
522
-  if(!to_store){
523
-    finished_tasks = gsub(".Rdata","",list.files(evaluate_material$out_dir))
524
-    ext_task_list = ext_task_list[! (names(ext_task_list) %in% finished_tasks)]
525
-  }
526
-  print(length(ext_task_list))
527
-  print("Factor-Based Adjustment Normalization and Evaluation...")
528
-  factor_based_out = bplapply(ext_task_list,FUN = runAdjustTask, evaluate_material = evaluate_material,design = design,nested_model = nested_model,BPPARAM = param)
529
-  print("Complete.")
530
-  
531
-  return(list(factor_free_out = factor_free_out, factor_based_out = factor_based_out))
532
-}
... ...
@@ -1,11 +1,6 @@
1
-library(DESeq,quietly = TRUE)
2
-library(EDASeq,quietly = TRUE)
3
-library(edgeR,quietly = TRUE)
4
-library(sva,quietly = TRUE)
5
-library(aroma.light,quietly = TRUE)
6
-library(lme4,quietly = TRUE)
7
-
8 1
 #' Upper-quartile normalization wrapper.
2
+#' @importFrom EDASeq betweenLaneNormalization
3
+#' @export
9 4
 #' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
10 5
 #' @return Upper-quartile normalized matrix (scaled by sample UQ)
11 6
 UQ_FN = function(ei){
... ...
@@ -34,6 +29,7 @@ uq_f <- function(which="upper", round = FALSE){
34 29
 #           })
35 30
 
36 31
 #' Upper-quartile normalization applied to positive data.
32
+#' @export
37 33
 #' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
38 34
 #' @return Upper-quartile (positive) normalized matrix (scaled by sample UQ)
39 35
 UQ_FN_POS = function(ei){
... ...
@@ -47,6 +43,8 @@ UQ_FN_POS = function(ei){
47 43
 }
48 44
 
49 45
 #' Full-Quantile normalization wrapper.
46
+#' @importFrom aroma.light normalizeQuantileRank.matrix
47
+#' @export
50 48
 #' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
51 49
 #' @return FQ normalized matrix.
52 50
 FQ_FN = function(ei){
... ...
@@ -55,6 +53,7 @@ FQ_FN = function(ei){
55 53
 }
56 54
 
57 55
 #' Full-Quantile normalization applied to positive data.
56
+#' @export
58 57
 #' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
59 58
 #' @return FQ (positive) normalized matrix.
60 59
 FQ_FN_POS = function(ei){
... ...
@@ -112,6 +111,8 @@ FQ_FN_POS = function(ei){
112 111
 }
113 112
 
114 113
 #' DESeq normalization wrapper.
114
+#' @importFrom DESeq estimateSizeFactorsForMatrix
115
+#' @export
115 116
 #' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
116 117
 #' @return DESeq normalized matrix (scaled by sample )
117 118
 DESEQ_FN = function(ei){
... ...
@@ -121,6 +122,7 @@ DESEQ_FN = function(ei){
121 122
 }
122 123
 
123 124
 #' DESeq normalization applied to positive data.
125
+#' @export
124 126
 #' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
125 127
 #' @return DESeq (positive) normalized matrix (scaled by sample )
126 128
 DESEQ_FN_POS = function(ei){
... ...
@@ -153,6 +155,8 @@ DESEQ_FN_POS = function(ei){
153 155
 }
154 156
 
155 157
 #' TMM normalization wrapper.
158
+#' @importFrom edgeR calcNormFactors
159
+#' @export
156 160
 #' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
157 161
 #' @return TMM normalized matrix (scaled by sample )
158 162
 TMM_FN = function(ei){
... ...
@@ -161,232 +165,3 @@ TMM_FN = function(ei){
161 165
   return(eo)
162 166
 }
163 167
 
164
-#' Output residuals + intercept of fit to quality score matrix
165
-#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
166
-#' @param default_args_list = list of default arguments
167
-#' @param args_list = list of user-provided arguments
168
-#' @return Matrix adjusted for K quality scores
169
-
170
-QUALREG_FN = function(ei, default_args_list, args_list){
171
-  EPSILON = default_args_list$EPSILON
172
-  scores = prcomp(default_args_list$q,center = TRUE,scale = TRUE)$x[,1:default_args_list$K]
173
-  re = log(ei + EPSILON)
174
-  for (g in 1:dim(ei)[1]){
175
-    lin.mod = lm(re[g,] ~ scores)
176
-    re[g,] = lin.mod$coefficients[1] + lin.mod$residuals
177
-  }
178
-  re = exp(re)
179
-  re[e == 0] = 0 
180
-  return(re)
181
-}
182
-
183
-#' Output residuals + intercept of fit to control-gene derived factors
184
-#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
185
-#' @param default_args_list = list of default arguments
186
-#' @param args_list = list of user-provided arguments
187
-#' @return Matrix adjusted for K unwanted factors
188
-
189
-RUVG_FN = function(ei, default_args_list, args_list){
190
-  EPSILON = default_args_list$EPSILON
191
-  scores = prcomp(t(log(ei[default_args_list$control_genes,]+default_args_list$EPSILON)),center = TRUE,scale. = FALSE)$x[,1:default_args_list$K]   
192
-  re = log(ei + default_args_list$EPSILON)
193
-  for (g in 1:dim(ei)[1]){
194
-    lin.mod = lm(re[g,] ~ scores)
195
-    re[g,] = lin.mod$coefficients[1] + lin.mod$residuals
196
-  }
197
-  re = exp(re)
198
-  re[e == 0] = 0 
199
-  return(re)
200
-}
201
-
202
-#' ComBat Wrapper
203
-#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required.
204
-#' @param default_args_list = list of default arguments
205
-#' @param args_list = list of user-provided arguments
206
-#' @return Matrix adjusted for batch
207
-
208
-COMBAT_FN = function(ei, default_args_list, args_list){
209
-  EPSILON = default_args_list$EPSILON
210
-  SD_EPSILON = default_args_list$SD_EPSILON
211
-  ei = log(ei + default_args_list$EPSILON)
212
-  pheno = as.data.frame(cbind(default_args_list$tech_batch,default_args_list$bio_cond))
213
-  if(is.null(default_args_list$bio_cond) || any(is.na(default_args_list$bio_cond))){
214
-    colnames(pheno) = c("batch")
215
-    is_var = apply(ei,1,sd) > SD_EPSILON
216
-    mod = model.matrix(~ 1, data = pheno)
217
-  }else{
218
-    colnames(pheno) = c("batch","phenotype")
219
-    is_var = T
220
-    for (p in unique(default_args_list$bio_cond)){
221
-      is_var = is_var & (apply(ei[,default_args_list$bio_cond == p],1,sd) > SD_EPSILON)
222
-    }
223
-    mod = model.matrix(~ as.factor(phenotype), data = pheno)
224
-  }
225
-  eo = ei
226
-  eo[is_var,] = ComBat(dat=ei[is_var,], batch=default_args_list$tech_batch, mod=mod, par.prior=TRUE, prior.plots=FALSE)
227
-  eo = exp(eo)
228
-  eo[ei == 0] = 0
229
-  return(eo)
230
-}
231
-
232
-NESTED_BATCH_FIXED_FN = function(ei,bio,batch,uv = NULL,w = NULL){
233
-  if(!is.null(bio)){
234
-    bio = factor(bio, levels = sort(levels(bio)))
235
-    batch = factor(batch, levels = unique(batch[order(bio)]))
236
-
237
-    n_vec <- tapply(batch, bio, function(x) nlevels(droplevels(x)))
238
-  }else{
239
-    n_vec = nlevels(droplevels(batch))
240
-  }
241
-  
242
-  mat = matrix(0,nrow = sum(n_vec),ncol = sum(n_vec - 1))
243
-  xi = 1
244
-  yi = 1
245
-  for(i in 1:length(n_vec)){
246
-    if(n_vec[i] > 1){
247
-      cs = contr.sum(n_vec[i])
248
-      dd = dim(cs)
249
-      mat[xi:(xi + dd[1] - 1),yi:(yi + dd[2] - 1)] = cs
250
-      xi = xi + dd[1]
251
-      yi = yi + dd[2]
252
-    }else{
253
-      xi = xi + 1
254
-    }
255
-  }
256
-  
257
-  leo = log(ei+1)
258
-  if(is.null(uv)){
259
-    if(!is.null(bio)) {
260
-      design_mat <- model.matrix(~ bio + batch, contrasts=list(bio=contr.sum, batch=mat))
261
-    } else {
262
-      design_mat <- model.matrix(~ batch, contrasts=list(batch=mat))
263
-    }
264
-    lm_object <- lmFit(leo, design = design_mat, weights = w)
265
-
266
-    bind <- (ncol(lm_object$coefficients) - (sum(n_vec - 1) - 1)):(ncol(lm_object$coefficients))
267
-    leo = leo - t(attr(design_mat,"contrasts")$batch %*% t(lm_object$coefficients[,bind]))[,batch]
268
-    } else {
269
-      if(!is.null(bio)) {
270
-        design_mat <- model.matrix(~ bio + batch + uv, contrasts=list(bio=contr.sum, batch=mat))
271
-      } else {
272
-        design_mat <- model.matrix(~ batch + uv, contrasts=list(batch=mat))
273
-      }
274
-      lm_object <- lmFit(leo, design = design_mat, weights = w)
275
-
276
-      uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
277
-      bind = (ncol(lm_object$coefficients) - dim(uv)[2] - (sum(n_vec - 1) - 1)):(ncol(lm_object$coefficients) - dim(uv)[2])
278
-      leo = leo - t(attr(design_mat,"contrasts")$batch %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind]))
279
-    }
280
-  return(leo)
281
-}
282
-
283
-NESTED_BATCH_RANDOM_FN = function(ei,bio,batch,uv = NULL,w = NULL){
284
-  design <- model.matrix(~batch - 1)
285
-  leo = log(ei+1)
286
-  if(is.null(uv)){
287
-    for(i in 1:dim(ei)[1]){
288
-      lm_object = lmer(leo[i,] ~ 1 + bio + (1 | batch),weights = w[i,])
289
-      betaj <- ranef(lm_object)[[1]][,1]
290
-      leo[i,] = leo[i,] - as.vector(design %*% betaj)
291
-    }
292
-  }else{
293
-    for(i in 1:dim(ei)[1]){
294
-      lm_object = lmer(leo[i,] ~ 1 + bio + uv + (1 | batch),
295
-                       weights = w[i,])
296
-      betaj <- ranef(lm_object)[[1]][,1]
297
-      uvind = (length(coef(lm_object)[[1]][1,]) - dim(uv)[2] + 1):(length(coef(lm_object)[[1]][1,]))
298
-      leo[i,] = leo[i,] - as.vector(design %*% betaj) - as.vector(uv %*% t(coef(lm_object)[[1]][1,][uvind]))
299
-    }
300
-  }
301
-  return(leo)
302
-}
303
-
304
-FACTORIAL_BATCH_FN = function(ei,bio,batch,uv = NULL,w = NULL){
305
-  contrasts <- contr.treatment(length(levels(batch)))
306
-  rownames(contrasts) = levels(batch)
307
-  leo = log(ei+1)
308
-  
309
-  if(is.null(bio)){
310
-    if(is.null(uv)){
311
-      
312
-      # Just Batch
313
-      design_mat <- model.matrix(~batch)
314
-      lm_object <- lmFit(leo, design = design_mat, weights = w)
315
-      bind = (ncol(lm_object$coefficients) + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients))
316
-      leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch]
317
-    } else {
318
-      
319
-      # Batch + UV
320
-      design_mat <- model.matrix(~batch + uv)
321
-      lm_object <- lmFit(leo, design = design_mat, weights = w)
322
-      uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
323
-      bind = (ncol(lm_object$coefficients) - dim(uv)[2] + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients) - dim(uv)[2])
324
-      leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind]))
325
-    }
326
-  } else {
327
-    if(is.null(uv)){
328
-      
329
-      # Batch + BIO
330
-      design_mat <- model.matrix(~bio + batch)
331
-      lm_object <- lmFit(leo, design = design_mat, weights = w)
332
-      bind = (ncol(lm_object$coefficients) + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients))
333
-      leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch]
334
-
335
-    } else {
336
-      
337
-      # Batch + UV + BIO
338
-      design_mat <- model.matrix(~bio + batch + uv)
339
-      lm_object <- lmFit(leo, design = design_mat, weights = w)
340
-      uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
341
-      bind = (ncol(lm_object$coefficients) - dim(uv)[2] + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients) - dim(uv)[2])
342
-      leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind]))
343
-    }
344
-  }
345
-  return(leo)
346
-}
347
-
348
-BATCHFREE_FN = function(ei,bio,uv = NULL,w = NULL){
349
-  leo = log(ei+1)
350
-  
351
-  if(is.null(bio)){
352
-    # Just UV
353
-    design_mat <- model.matrix(~uv)
354
-    lm_object <- lmFit(leo, design = design_mat, weights = w)
355
-    uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
356
-    leo = leo - t(uv %*% t(lm_object$coefficients[,uvind]))
357
-  } else {
358
-    # Bio + UV
359
-    design_mat <- model.matrix(~bio + uv)
360
-    lm_object <- lmFit(leo, design = design_mat, weights = w)
361
-    uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients))
362
-    leo2 = leo - t(uv %*% t(lm_object$coefficients[,uvind]))
363
-  }
364
-  return(leo)
365
-}
366
-
367
-ADJUST_FN = function(ei,batch = NULL,
368
-                     bio = NULL,
369
-                     uv = NULL,
370
-                     w = NULL,
371
-                     design = c("factorial","nested"), 
372
-                     nested_model = c("fixed","random")){
373
-  
374
-  if(!is.null(batch)){
375
-    design <- match.arg(design)
376
-    
377
-    if(design == "factorial"){
378
-      leo = FACTORIAL_BATCH_FN(ei,bio,batch,uv,w)
379
-    }else{
380
-      nested_model <- match.arg(nested_model)
381
-      
382
-      if(nested_model == "fixed"){
383
-        leo = NESTED_BATCH_FIXED_FN(ei,bio,batch,uv,w)
384
-      }else{
385
-        leo = NESTED_BATCH_RANDOM_FN(ei,bio,batch,uv,w)
386
-      }
387
-    }
388
-  }else{
389
-    leo = BATCHFREE_FN(ei,bio,uv,w)
390
-  }
391
-  return(leo)
392
-}
393 168
\ No newline at end of file
394 169
deleted file mode 100644
... ...
@@ -1,69 +0,0 @@
1
-out_dir = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data"
2
-flist = paste(out_dir,list.files(out_dir),sep = "/")
3
-
4
-# Scoring
5
-scores = NULL
6
-snames = NULL
7
-for(fil in flist){
8
-  load(fil)
9
-  nom = scone_out$method
10
-  sc = scone_out$evaluation
11
-  if(!is.na(sc)[1]){
12
-    print(nom)
13
-    snames = c(snames,nom)
14
-    scores = rbind(scores,sc$scores)
15
-  }else{
16
-    # print(nom)
17
-  }
18
-}
19
-rownames(scores) = snames
20
-escores = t(t(scores)*c(-1,-1,1,1,1,-1,1))
21
-colnames(escores) = paste(colnames(scores),c("_INV","")[1.5 + c(-1,-1,1,1,1,-1,1)/2],sep = "")
22
-
23
-scores = t(na.omit(t(scores)))
24
-escores = t(na.omit(t(escores)))
25
-
26
-# Score Heatmaps
27
-require(gplots)
28
-heatmap.2(t(apply(scores ,2,rank)[order(-apply((apply(escores ,2,rank)),1,min)),][1:20,]),density.info = 'n',trace = 'n',key.xlab = "Score",key.title = NA,
29
-          Colv = NA,col = colorRampPalette(c("purple","black","yellow"))(100),
30
-          margins = c(15,10),cexRow = .75,cexCol = .5)
31
-title(main = "Top 20 SCONE",cex.lab=0.5)
32
-
33
-my_rank = (1 + max(rank(apply((apply(escores ,2,rank)),1,min))) - rank(apply((apply(escores ,2,rank)),1,min)))
34
-
35
-names(which(my_rank == 1))
36
-load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data/NOIMPUTE_NONE.Rdata")
37
-is_zero = scone_out$exp_mat == 0
38
-load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data/NOIMPUTE_FQ_NOWEIGHT_NOBIO_NOBATCH_HK_1.Rdata")
39
-myma = scone_out$exp_mat
40
-hk = read.table("~/YosefCode/packages/RNASeq/summary/EXAMPLE/reference_files/house_keeping_human_names.txt")$V1
41
-myma[is_zero] = 0
42
-fnr_out = estimateFNR(myma,bulk_model = T,is.expressed = rownames(myma) %in% as.character(hk) )
43
-w = 0 + !is_zero
44
-w[is_zero] = 1-fnr_out$Z[is_zero]
45
-wpc = wPCA(log(scone_out$exp_mat + 1),w,filt = T)
46
-plot(wpc$x[,1:2], col = cols)
47
-
48
-rownames(wpc$rotation) = rownames(scone_out$exp_mat)[wpc$to.pass]
49
-sort(-abs(wpc$rotation[,2]))[1:20]
50
-
51
-load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/evaluate_material.Rdata")
52
-pchs = 16
53
-cols = rainbow(length(levels(evaluate_material$batch)))[evaluate_material$batch]
54
-plot(scone_out$evaluation$pc_val[,c(1,2)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells")
55
-legend("bottomleft",legend = (levels(evaluate_material$batch)),pch = 16, col = rainbow(length(levels(evaluate_material$batch))), cex = .5)
56
-outy = log(scone_out$exp_mat+1)
57
-cols = rainbow(2)[scone_out$evaluation$clusters]
58
-plot(scone_out$evaluation$pc_val[,c(1,3)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells")
59
-
60
-write.table(outy,file = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/NOIMPUTE_FQ_NOWEIGHT_NOBIO_NOBATCH_HK_1_log_counts.txt",quote = F,sep = "\t",col.names = T,row.names = T)
61
-  evaluate_material$
62
-evaluate_material$dim_eval
63
-plot(evaluate_material$DE_scores[,c(1,2)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells")
64
-
65
-
66
-my_rank = (1 + max(rank(apply((apply(escores ,2,rank)),1,min))) - rank(apply((apply(escores ,2,rank)),1,min)))
67
-write.table(cbind(scores,my_rank),file = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/scores.txt",quote = F,sep = "\t")
68
-
69
-
70 0
deleted file mode 100644
... ...
@@ -1,244 +0,0 @@
1
-library(shiny)
2
-library(EDASeq)
3
-library(clusterCells)
4
-load("../shiny/tmp.rda")
5
-
6
-library(RColorBrewer)
7
-cc <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"),brewer.pal(12,"Set3"),
8
-        brewer.pal(8, "Accent"))
9
-
10
-choose_color <- function(input) {
11
-  if(input$color_code=="type") {
12
-    cols <- cc[type]
13
-  } else {
14
-    cols <- cc[batch]
15
-  }
16
-  return(cols)
17
-}
18
-
19
-server <- function(input, output) {
20
-    output$boxplot <- renderPlot({
21
-      set.seed(19883)
22
-      if(input$downsample) {
23
-        idx <- sample(1:100)
24
-      } else {
25
-        idx <- 1:ncol(norm)
26
-      }      
27
-      boxplot(log_norm[,idx], boxcol=cc[get(input$color_code)[idx]], medlwd=5, col=cc[get(input$color_code)[idx]],
28
-              outline=FALSE, names=FALSE, ylab="log(count+1)", main="Log normalized counts")
29
-      lines(apply(log_norm[,idx], 2, median))
30
-    }
31
-    )
32
-    
33
-    output$rle <- renderPlot({
34
-      set.seed(19883)
35
-      if(input$downsample & ncol(norm)>100) {
36
-        idx <- sample(1:100)
37
-      } else {
38
-        idx <- 1:ncol(norm)
39
-      }
40
-      boxplot(rle[,idx], boxcol=cc[get(input$color_code)[idx]], medlwd=5, col=cc[get(input$color_code)[idx]],
41
-              outline=FALSE, names=FALSE, ylab="Relative Log Expression", main="Relative Log Expression (RLE)")
42
-      abline(h=0, lty=2)
43
-    })
44
-    
45
-    output$pca <- renderPlot(
46
-      plot(pca$x[,1:2], col=choose_color(input), pch=19, main="All genes")
47
-    )
48
-    
49
-    output$pca_pc <- renderPlot(
50
-      plot(pca_pc$x[,1:2], col=choose_color(input), pch=19, main="Positive controls")
51
-    )
52
-    
53
-    output$pca_nc <- renderPlot(
54
-      plot(pca_nc$x[,1:2], col=choose_color(input), pch=19, main="Negative controls")
55
-    )
56
-
57
-    output$pca_mv <- renderPlot(
58
-      if(input$variance=="variance") {
59
-        plot(pca_var$x[,1:2], col=choose_color(input), pch=19, main="Top 100 most variable genes")        
60
-      } else {
61
-        plot(pca_cv$x[,1:2], col=choose_color(input), pch=19, main="Top 100 most variable genes")        
62
-      } 
63
-    )
64
-    
65
-    output$qc_plot <- renderPlot({
66
-      barplot(abs(cors), beside = TRUE, col=cc[1:ncol(qc)], border=cc[1:ncol(qc)], ylim=c(0, 1),
67
-              space=c(0, 4), main="Absolute correlation with QC scores")
68
-      legend("topright", colnames(qc), fill=cc, border=cc, cex=.5)  
69
-    }
70
-    )
71
-
72
-    output$qc_pca <- renderPlot({
73
-      plot(qc_pca$x[,1:2], pch=19, col=choose_color(input), main="Quality PCA")
74
-    }
75
-    )
76
-    
77
-    output$qc_corr <- renderTable(abs(cors[,1:5]))
78
-    
79
-    output$pam_cc <- renderPlot({
80
-      dualHeatmap(heatData = sub[[input$k]], main="Heatmap of co-clustering, based on 100 subsamplings",
81
-                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, 
82
-                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
83
-                  annColor=list(Batch=cc, Type=cc))
84
-    })
85
-    
86
-    output$pam_type <- renderTable({
87
-      table(pam[[input$k-1]]$clustering, type)      
88
-    })
89
-    
90
-    output$pam_batch <- renderTable({
91
-      table(pam[[input$k-1]]$clustering, batch)      
92
-    })
93
-    
94
-    output$pam_pca <- renderPlot({
95
-      plot(pca$x[,1:2], pch=19, col=cc[pam[[input$k-1]]$clustering], main="Expression PCA")
96
-      legend("topright", as.character(1:input$k), fill=cc, cex=.5)
97
-    })
98
-    
99
-    output$pam_qc <- renderPlot({
100
-      plot(qc_pca$x[,1:2], pch=19, col=cc[pam[[input$k-1]]$clustering], main="Quality PCA")
101
-      legend("topright", as.character(1:input$k), fill=cc, cex=.5)
102
-    })
103
-    
104
-    output$heatmap_pc <- renderPlot({
105
-      dualHeatmap(heatData = t(norm), main="Positive Control genes",
106
-                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=poscon,
107
-                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
108
-                  annColor=list(Batch=cc, Type=cc))
109
-    })
110
-    
111
-    output$heatmap_nc <- renderPlot({
112
-      dualHeatmap(heatData = t(norm), main="Negative Control genes",
113
-                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=negcon,
114
-                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
115
-                  annColor=list(Batch=cc, Type=cc))
116
-    })
117
-
118
-    output$heatmap_mv <- renderPlot({
119
-      if(input$variance=="variance") {
120
-        genes <- mv_var
121
-      } else {
122
-        genes <- mv_cv
123
-      } 
124
-      
125
-      dualHeatmap(heatData = t(norm), main="Top 100 most variable genes",
126
-                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=genes,
127
-                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
128
-                  annColor=list(Batch=cc, Type=cc))
129
-    })
130
-
131
-    output$heatmap_pc1 <- renderPlot({
132
-      genes <- names(sort(pca$rotation[,1], decreasing = TRUE)[1:100])
133
-      dualHeatmap(heatData = t(norm), main="Top 100 genes by PC1 loadings",
134
-                  clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=genes,
135
-                  annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type),
136
-                  annColor=list(Batch=cc, Type=cc))
137
-    })
138
-    
139
-    output$box_pc1_pc <- renderPlot({
140
-      boxplot(tapply(pca_pc$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of Positive Controls")
141
-    })
142
-
143
-    output$box_pc1_nc <- renderPlot({
144
-      boxplot(tapply(pca_nc$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of Negative Controls")
145
-    })
146
-    
147
-    output$box_pc1_qc <- renderPlot({
148
-      boxplot(tapply(qc_pca$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of quality scores")
149
-    })
150
-    
151
-    output$curve_dropout <- renderPlot({
152
-      plot(logistic(fitted[order(fnr$Alpha[1,]),1])~sort(fnr$Alpha[1,]), type='l', ylim=c(0, 1), col=rgb(0, 0, 0, 0.2), xlab="log average expression", ylab="Probability of drop-out", main="Drop-out Curves")
153
-      for(i in 2:ncol(fitted)) {
154
-        lines(logistic(fitted[order(fnr$Alpha[1,]),i])~sort(fnr$Alpha[1,]), type='l', col=rgb(0, 0, 0, 0.2))
155
-      }
156
-      lines(logistic(rowMeans(fitted)[order(fnr$Alpha[1,])])~sort(fnr$Alpha[1,]), col=2, lwd=2)
157
-    })
158
-        
159
-    output$box_detection <- renderPlot({
160
-      boxplot(tapply(colSums(norm>5), get(input$color_code), function(x) x), las=2, col=cc, ylab="Number of detected genes (> 5 reads)", main="Detected Genes")
161
-    })
162
-
163
-    output$box_dropout <- renderPlot({
164
-      boxplot(tapply(colMeans(1-w), get(input$color_code), function(x) x), las=2, col=cc, ylab="Drop-out rate", main="Drop-out Rate")
165
-    })
166
-    
167
-  }
168
-
169
-ui <- fluidPage(
170
-  titlePanel("In-depth Normalization Report"),
171
-  
172
-  sidebarLayout(
173
-    sidebarPanel(
174
-      selectInput("color_code", label = "Color code / stratify by",
175
-                  choices = list("Batch"="batch", "Cell Type"="type"),
176
-                  selected = "batch"),
177
-      selectInput("variance", label = "Select most variable genes by",
178
-                  choices = list("Variance"="variance", "Coefficient of variation"="cv"),
179
-                  selected = "cv"),
180
-      sliderInput("k", label = "Number of clusters",
181
-                  min=2, max=10,
182
-                  value = 7),
183
-      checkboxInput("downsample", label="Downsample to 100 cells for boxplot panel", value=TRUE),
184
-      helpText("RLE: relative log expression."),
185
-      helpText("PCA: principal component analysis."),
186
-      helpText("QC: quality control."),
187
-      helpText("The 'Boxplot' panel shows the read count distribution and the RLE plot."),
188
-      helpText("The 'PCA' panel shows the PCA of different subset of genes."),
189
-      helpText("The 'QC' panel shows the absolute correlation between the first 10 principal components of the expression matrix and the QC measures. It also shows the PCA of the QC measures. Note that the PCA score signs are not meaningful (e.g., 'good' cells could have a low value of PC1 and 'bad' cells a high value)."),
190
-      helpText("The 'Cluster stability' panel shows a heatmap of the PAM co-clustering based on 100 subsamples of the data. It shows the relation between the clusters obtained by PAM (on the original data) to cell type, batch, expression PCA, and QC PCA. Note that these results are reported for normalization evaluation only. A proper cluster analysis should be performed to find sub-populations in the data."),
191
-      helpText("The 'Heatmaps' panel shows the log expression of the positive controls, negative controls, genes with highest PC1 loadings, and most variable genes."),
192
-      helpText("The 'Confounding' panel shows the first PC of positive controls, negative controls, and QC stratified by either cell type or batch. This is useful to identify lower quality batches and potential confounding between quality and cell type."),
193
-      helpText("The 'Drop-out' panel shows the number of detected genes and the drop-out rate stratified by either cell type or batch. It also shows the probability of drop-out as a function of the average expression"),
194
-      helpText("The 'Number of clusters' slider controls the PAM clustering in the 'Cluster stability' tab."),
195
-      helpText("Please note that with many cells (>500), some plots may take several seconds (sometimes minutes) to appear."),
196
-      helpText("By default, we downsample the data to 100 cells for the boxplot visualization (only for the 'Boxplot' tab). Please uncheck it to see the full dataset (it may take some time).")
197
-    ),
198
-    
199
-    mainPanel(
200
-      tabsetPanel(type = "tabs",
201
-                  ## Add a "summary" tab (wait for Michael's text)
202
-                  tabPanel("Boxplot",
203
-                           plotOutput("boxplot"),
204
-                           plotOutput("rle")),
205
-                  tabPanel("PCA",
206
-                           plotOutput("pca"),
207
-                           plotOutput("pca_pc"),
208
-                           plotOutput("pca_nc"),
209
-                           plotOutput("pca_mv")),
210
-                  tabPanel("QC",
211
-                           plotOutput("qc_plot"),
212
-                           tableOutput("qc_corr"),
213
-                           plotOutput("qc_pca")),
214
-                  tabPanel("Cluster stability",
215
-                           plotOutput("pam_cc"),
216
-                           fluidRow(
217
-                             column(3,
218
-                                    h3("Type"),
219
-                                    br(),
220
-                                    tableOutput("pam_type")),
221
-                             column(3,
222
-                                    h3("Batch"),
223
-                                    br(),
224
-                                    tableOutput("pam_batch"))),
225
-                           plotOutput("pam_pca"),
226
-                           plotOutput("pam_qc")),
227
-                  tabPanel("Heatmaps",
228
-                           plotOutput("heatmap_pc"),
229
-                           plotOutput("heatmap_nc"),
230
-                           plotOutput("heatmap_pc1"),
231
-                           plotOutput("heatmap_mv")),
232
-                  tabPanel("Confounding",
233
-                           plotOutput("box_pc1_pc"),
234
-                           plotOutput("box_pc1_nc"),
235
-                           plotOutput("box_pc1_qc")),
236
-                  tabPanel("Drop-out",
237
-                           plotOutput("box_detection"),
238
-                           plotOutput("box_dropout"),
239
-                           plotOutput("curve_dropout"))
240
-                  )
241
-                  
242
-    )))
243
-
244
-shinyApp(ui = ui, server = server)
245 0
deleted file mode 100644
... ...
@@ -1,93 +0,0 @@
1
-Lik_Fun = function(Z,X,Beta){
2
-  return((1-Z) + (2*Z - 1)/(1+exp(-X %*% Beta)))
3
-}
4
-
5
-Post_Z = function(Y,W,Alpha,X,Beta){
6
-  return(Lik_Fun(Y,W,Alpha)*Lik_Fun(1,X,Beta)/(Lik_Fun(Y,W,Alpha)*Lik_Fun(1,X,Beta) + (Y == 0)*Lik_Fun(0,X,Beta)))
7
-}
8
-
9
-estimateFNR = function(x, bulk_model = FALSE, is.control = TRUE, K = 0, X = NULL, Alpha = NULL,
10
-                       is.expressed = NULL, niter = 100, ntresh = 0){
11
-  
12
-  require(matrixStats)
13
-  Y = t(matrix(as.numeric(x > ntresh ),nrow = dim(x)[1]))
14
-  
15
-  if(bulk_model){
16
-    x2 = x
17
-    x2[x2 <= ntresh] = NA
18
-    Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE))))
19
-  }
20
-  
21
-  # Prepare Wanted Variation and Gene-Intrinsic Intercept
22
-  X = cbind(X,rep(1,dim(x)[2]))
23
-  is.intercept = dim(X)[2] == 1
24
-  Beta = matrix(0,dim(X)[2],dim(x)[1])
25
-  
26
-  # Prepare Unwanted Variation (K factors) and Sample-Intrinsic Intercept
27
-  if( K == 0 || !is.null(Alpha) ){
28
-    Alpha = rbind(Alpha,rep(1,dim(x)[1]))
29
-    K = dim(Alpha)[1] - 1
30
-  }else{
31
-    M = 2*Y[is.control,] - 1
32
-    M = t(t(M - rowMeans(M)) - colMeans(M)) + mean(M)
33
-    Alpha = rbind(t(svd(M,nv = K)$v[,1:K]),rep(1,dim(x)[1]))
34
-  }
35
-  W = matrix(0,dim(x)[2],K+1)
36
-  
37
-  # Initialize Z
38
-  Z = Post_Z(Y,W,Alpha,X,Beta)
39
-  
40
-  # If control genes are supplied, W fit once, according to control genes
41
-  if(!is.null(is.expressed)){
42
-    cY = Y[,is.expressed]
43
-    cAlpha = Alpha[,is.expressed]
44
-    for (i in 1:dim(Z)[1]){
45
-      W[i,] = glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),quasibinomial)$coefficients
46
-    }   
47
-  }
48
-  
49
-  ## DR: it seems strange to me that this is a for loop and not a while !converged
50
-  for(n in 1:niter){    
51
-    # Update Beta
52
-    if(!is.intercept){
53
-      
54
-      if(is.null(is.expressed)){
55
-        for(i in 1:dim(Z)[2]){
56
-          Beta[,i] = glm(cbind(Z[,i],1-Z[,i]) ~ 0 + X,quasibinomial)$coefficients
57
-        }
58
-        
59
-      }else{ # Only over non-control genes
60
-        for(i in (1:dim(Z)[2])[!is.expressed]){
61
-          Beta[,i] = glm(cbind(Z[,i],1-Z[,i]) ~ 0 + X,quasibinomial)$coefficients
62
-        }
63
-      }
64
-      
65
-      
66
-    }else{
67
-      # Short-cut!
68
-      Beta = t(matrix(log(colMeans(Z)/(1-colMeans(Z)))))
69
-    }
70
-    
71
-    # Update Z
72
-    Z = Post_Z(Y,W,Alpha,X,Beta)
73
-  
74
-    if(is.null(is.expressed)){
75
-      # Update W
76
-      for (i in 1:dim(Z)[1]){
77
-        W[i,] = glm(cbind(Y[i,],(1-Y[i,])*Z[i,]) ~ 0 + t(Alpha),quasibinomial)$coefficients
78
-      }
79
-    
80
-      # Update Z
81
-      Z = Post_Z(Y,W,Alpha,X,Beta)
82
-    }
83
-  }
84
-  
85
-  # Stick to our initial assumptions
86
-  if(!is.null(is.expressed)){
87
-    Beta[,is.expressed] = NA
88
-    Z[,is.expressed] = 1
89
-  }
90
-  EL = sum(na.omit(as.vector(Z*log(Lik_Fun(Y,W,Alpha))))) + sum(na.omit(as.vector(Z*log(Lik_Fun(1,X,Beta))))) + sum(na.omit(as.vector((1-Z)*log(Lik_Fun(0,X,Beta)))))
91
-  
92
-  return(list(Z = t(Z), EL = EL, P = t(Lik_Fun(1,W,Alpha)), W = W, X = X, Alpha = Alpha, Beta = Beta))
93
-}
94 0
\ No newline at end of file
... ...
@@ -2,7 +2,6 @@
2 2
 #' 
3 3
 #' This function is used internally in scone to parse the variables used to generate the design matrices.
4 4
 #' 
5
-#'  
6 5
 #' @param pars character. A vector of parameters corresponding to a row of params.
7 6
 #' @param bio factor. The biological factor of interest.
8 7
 #' @param batch factor. The known batch effects.
... ...
@@ -44,6 +43,8 @@ parse_row <- function(pars, bio, batch, ruv_factors, qc) {
44 43
 #' the bio variable. Here, nested means that each batch is made of observations from only one level of bio,
45 44
 #' while each level of bio may contain multiple batches.
46 45
 #'  
46
+#' @export
47
+#'  
47 48
 #' @param bio factor. The biological factor of interest.
48 49
 #' @param batch factor. The known batch effects.
49 50
 #' @param W numeric. Either a vector or matrix containing one or more continuous covariates (e.g. RUV factors).
... ...
@@ -111,11 +112,15 @@ make_design <- function(bio, batch, W, nested=FALSE) {
111 112
 #' for which expression needs to be adjusted, start with either the word "batch" or the letter "W" (case sensitive).
112 113
 #' Any other covariate (including the intercept) is kept.
113 114
 #'  
115
+#' @importFrom limma lmFit
116
+#' @export
117
+#' 
114 118
 #' @param log_expr matrix. The log gene expression (genes in row, samples in columns).
115 119
 #' @param design_mat matrix. The design matrix (usually the result of make_design).
116
-#' 
120
+#' @param batch factor. A factor with the batch information.
121
+#' @param weights matrix. A matrix of weights.
117 122
 #' @return The corrected log gene expression.
118
-lm_adjust <- function(log_expr, design_mat, weights=NULL) {
123
+lm_adjust <- function(log_expr, design_mat, batch=NULL, weights=NULL) {
119 124
   lm_object <- lmFit(log_expr, design = design_mat, weights = weights)
120 125
 
121 126
   uvind <- grep("^W", colnames(design_mat))
... ...
@@ -28,18 +28,24 @@ library(class)
28 28
 #' If NULL, condition KNN concordance will be returned NA.
29 29
 #' @param batch factor. The known batch variable (variation to be removed), NA is allowed.
30 30
 #' If NULL, batch KNN concordance will be returned NA.
31
-#' @param qc_factors. Factors of unwanted variation derived from quality metrics.
31
+#' @param qc_factors Factors of unwanted variation derived from quality metrics.
32 32
 #' If NULL, qc correlations will be returned NA.
33
-#' @param ruv_factors. Factors of unwanted variation derived from negative control genes (adjustable set).
33
+#' @param ruv_factors Factors of unwanted variation derived from negative control genes (adjustable set).
34 34
 #' If NULL, ruv correlations will be returned NA.
35
-#' @param uv_factors. Factors of unwanted variation derived from negative control genes (un-adjustable set).
35
+#' @param uv_factors Factors of unwanted variation derived from negative control genes (un-adjustable set).
36 36
 #' If NULL, uv correlations will be returned NA.
37
-#' @param wv_factors. Factors of wanted variation derived from positive control genes (un-adjustable set).
37
+#' @param wv_factors Factors of wanted variation derived from positive control genes (un-adjustable set).
38 38
 #' If NULL, wv correlations will be returned NA.
39 39
 #' @param is_log logical. If TRUE the expr matrix is already logged and log transformation will not be carried out.
40 40
 #' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), 
41 41
 #' and a weighted average is returned.
42 42
 #' 
43
+#' @importFrom scde bwpca
44
+#' @importFrom class knn
45
+#' @importFrom fpc pamk
46
+#' 
47
+#' @export
48
+#' 
43 49
 #' @return A list with the following elements:
44 50
 #' \itemize{
45 51
 #' \item{KNN_BIO}{K-NN concordance rate by biological condition.}
... ...
@@ -72,7 +78,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL,
72 78
     if(is.null(weights)){
73 79
       proj = prcomp(t(expr),center = TRUE,scale. = TRUE)$x[,1:eval_pcs]
74 80
     }else{
75
-      proj = scde::bwpca(mat = t(expr),matw = t(weights),npcs = eval_pcs, seed = seed, em.maxiter = em.maxiter)$scores
81
+      proj = bwpca(mat = t(expr),matw = t(weights),npcs = eval_pcs, seed = seed, em.maxiter = em.maxiter)$scores
76 82
     }
77 83
   }else{
78 84
     eval_pcs = dim(proj)[2]
... ...
@@ -83,7 +89,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL,
83 89
   if( !is.null(eval_knn)  ){
84 90
     
85 91
     if( !is.null(bio) | !any(!is.na(bio)) ){
86
-      KNN_BIO = mean(attributes(class::knn(train = proj[!is.na(bio),],test = proj[!is.na(bio),],cl = bio[!is.na(bio)], k = eval_knn,prob = TRUE))$prob)
92
+      KNN_BIO = mean(attributes(knn(train = proj[!is.na(bio),],test = proj[!is.na(bio),],cl = bio[!is.na(bio)], k = eval_knn,prob = TRUE))$prob)
87 93
     }else{
88 94
       KNN_BIO = NA
89 95
     }
... ...
@@ -91,7 +97,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL,
91 97
   
92 98
     # K-NN Batch
93 99
     if( !is.null(batch) | !any(!is.na(batch)) ){
94
-      KNN_BATCH = mean(attributes(class::knn(train = proj[!is.na(batch),],test = proj[!is.na(batch),],cl = batch[!is.na(batch)], k = eval_knn,prob = TRUE))$prob)
100
+      KNN_BATCH = mean(attributes(knn(train = proj[!is.na(batch),],test = proj[!is.na(batch),],cl = batch[!is.na(batch)], k = eval_knn,prob = TRUE))$prob)
95 101
     }else{
96 102
       KNN_BATCH = NA
97 103
     }
... ...
@@ -116,7 +122,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL,
116 122
         is_cond = which(bio == cond)
117 123
         cond_w = length(is_cond)
118 124
         if(cond_w > max(eval_kclust)){
119
-          pamk_object = fpc::pamk(proj[is_cond,],krange = eval_kclust)
125
+          pamk_object = pamk(proj[is_cond,],krange = eval_kclust)
120 126
           PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width
121 127
         }else{
122 128
           stop("Number of clusters 'k' must be smaller than bio class size")
... ...
@@ -128,7 +134,7 @@ score_matri