1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,14 @@ |
1 |
+Package: SCONE |
|
2 |
+Version: 0.0.1 |
|
3 |
+Title: Single Cell Overview of Normalized Expression data |
|
4 |
+Description: SCONE. |
|
5 |
+Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", |
|
6 |
+ role = c("aut", "cre", "cph")), |
|
7 |
+ person("Davide", "Risso", email = "risso.davide@gmail.com", |
|
8 |
+ role = c("aut"))) |
|
9 |
+Author: Michael Cole [aut, cre, cph] and Davide Risso [aut] |
|
10 |
+Maintainer: Michael Cole <mbeloc@gmail.com> |
|
11 |
+Date: 01-22-2016 |
|
12 |
+License: Artistic-2.0 |
|
13 |
+LazyLoad: yes |
|
14 |
+RoxygenNote: 5.0.1 |
1 | 16 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,532 @@ |
1 |
+## DR: if I understand correctly "nom" is used only for the name of the file to store the data. |
|
2 |
+## There should be a better way to do this, using the name of the function (the opposite of get()?). |
|
3 |
+ |
|
4 |
+source("/data/yosef/users/daviderisso/YosefCode/packages/RNASeq/OFBIT/SCONE/estimateFNR.R") |
|
5 |
+source("/data/yosef/users/daviderisso/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE_DEFAULTS.R") |
|
6 |
+library(BiocParallel,quietly = TRUE) |
|
7 |
+library(cluster,quietly = TRUE) |
|
8 |
+library(fpc,quietly = TRUE) |
|
9 |
+ |
|
10 |
+runAdjustTask = function(task,evaluate_material,design,nested_model, to_store = FALSE){ |
|
11 |
+ |
|
12 |
+#??? source("~/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE.R") # library(SCONE) |
|
13 |
+ |
|
14 |
+ result = try({ |
|
15 |
+ |
|
16 |
+ if(file.exists(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))){ |
|
17 |
+ if(to_store){ |
|
18 |
+ load(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) |
|
19 |
+ score_out = scone_out$evaluation |
|
20 |
+ |
|
21 |
+ # Return values |
|
22 |
+ if(!is.na(score_out)[1]){ |
|
23 |
+ scores = score_out$scores |
|
24 |
+ }else{ |
|
25 |
+ scores = NA |
|
26 |
+ } |
|
27 |
+ |
|
28 |
+ print(task$nom) |
|
29 |
+ return(list(scores = scores)) |
|
30 |
+ }else{ |
|
31 |
+ print(task$nom) |
|
32 |
+ return() |
|
33 |
+ } |
|
34 |
+ |
|
35 |
+ }else{ |
|
36 |
+ |
|
37 |
+ norm_lout = ADJUST_FN(task$ei,batch = task$batch, |
|
38 |
+ bio = task$condition, |
|
39 |
+ uv = task$uv, |
|
40 |
+ w = task$w, |
|
41 |
+ design = design, |
|
42 |
+ nested_model = nested_model) |
|
43 |
+ norm_out = exp(norm_lout) - 1 |
|
44 |
+ score_out = scoreMethod(e = norm_out ,condition = evaluate_material$condition, batch = evaluate_material$batch, |
|
45 |
+ qual_scores = evaluate_material$qual_scores, HK_scores = evaluate_material$HK_scores, |
|
46 |
+ DE_scores = evaluate_material$DE_scores, CC_scores = evaluate_material$CC_scores, |
|
47 |
+ dim_eval = evaluate_material$dim_eval, K_clust = evaluate_material$K_clust, K_NN = evaluate_material$K_NN, |
|
48 |
+ fnr_pi = evaluate_material$fnr_pi, nom = task$nom) |
|
49 |
+ |
|
50 |
+ scone_out = list(exp_mat = norm_out,evaluation = score_out,method = task$nom) |
|
51 |
+ save(scone_out,file = paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) |
|
52 |
+ if(to_store){ |
|
53 |
+ |
|
54 |
+ # Return values |
|
55 |
+ if(!is.na(score_out)[1]){ |
|
56 |
+ scores = score_out$scores |
|
57 |
+ }else{ |
|
58 |
+ scores = NA |
|
59 |
+ } |
|
60 |
+ |
|
61 |
+ print(task$nom) |
|
62 |
+ return(list(scores = scores)) |
|
63 |
+ }else{ |
|
64 |
+ print(task$nom) |
|
65 |
+ return() |
|
66 |
+ } |
|
67 |
+ |
|
68 |
+ } |
|
69 |
+ |
|
70 |
+ }) |
|
71 |
+ |
|
72 |
+ return(NA) |
|
73 |
+ |
|
74 |
+} |
|
75 |
+ |
|
76 |
+runFactorFreeTask = function(task,evaluate_material,emat){ |
|
77 |
+ |
|
78 |
+ ## ??? source("~/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE.R") # library(SCONE) |
|
79 |
+ |
|
80 |
+ result = try({ |
|
81 |
+ if(file.exists(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))){ |
|
82 |
+ load(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) |
|
83 |
+ norm_out = scone_out$exp_mat |
|
84 |
+ score_out = scone_out$evaluation |
|
85 |
+ |
|
86 |
+ # Return values |
|
87 |
+ if(!is.na(score_out)[1]){ |
|
88 |
+ scores = score_out$scores |
|
89 |
+ }else{ |
|
90 |
+ scores = NA |
|
91 |
+ } |
|
92 |
+ |
|
93 |
+ print(task$nom) |
|
94 |
+ return(list(eo = norm_out, scores = scores)) |
|
95 |
+ |
|
96 |
+ }else{ |
|
97 |
+ norm_out = task$FN(emat) |
|
98 |
+ |
|
99 |
+ score_out = scoreMethod(e = norm_out ,condition = evaluate_material$condition, batch = evaluate_material$batch, |
|
100 |
+ qual_scores = evaluate_material$qual_scores, HK_scores = evaluate_material$HK_scores, |
|
101 |
+ DE_scores = evaluate_material$DE_scores, CC_scores = evaluate_material$CC_scores, |
|
102 |
+ dim_eval = evaluate_material$dim_eval, K_clust = evaluate_material$K_clust, K_NN = evaluate_material$K_NN, |
|
103 |
+ fnr_pi = evaluate_material$fnr_pi, nom = task$nom) |
|
104 |
+ |
|
105 |
+ scone_out = list(exp_mat = norm_out,evaluation = score_out,method = task$nom) |
|
106 |
+ save(scone_out,file = paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) |
|
107 |
+ |
|
108 |
+ # Return values |
|
109 |
+ if(!is.na(score_out)[1]){ |
|
110 |
+ scores = score_out$scores |
|
111 |
+ }else{ |
|
112 |
+ scores = NA |
|
113 |
+ } |
|
114 |
+ |
|
115 |
+ print(task$nom) |
|
116 |
+ return(list(eo = norm_out, scores = scores)) |
|
117 |
+ } |
|
118 |
+ |
|
119 |
+ }) |
|
120 |
+ |
|
121 |
+ return(NA) |
|
122 |
+ |
|
123 |
+} |
|
124 |
+ |
|
125 |
+scoreMethod = function(e,condition = NULL, batch = NULL, |
|
126 |
+ qual_scores = NULL, HK_scores = NULL, |
|
127 |
+ DE_scores = NULL, CC_scores = NULL, |
|
128 |
+ dim_eval = NULL, K_clust = NULL, K_NN = NULL, |
|
129 |
+ fnr_pi = NULL, nom = NULL){ |
|
130 |
+ if(any(is.na(e) | is.infinite(e) | is.nan(e))){ |
|
131 |
+ warning("NA/Inf/NaN Expression Values. Returning NA.") |
|
132 |
+ return(NA) |
|
133 |
+ } |
|
134 |
+ # Project the data: PCA or wPCA |
|
135 |
+ if(grepl("^IMPUTE",nom)){ |
|
136 |
+ pc_val = prcomp(t(log(e + 1)),center = TRUE,scale. = TRUE)$x[,1:dim_eval] |
|
137 |
+ }else{ |
|
138 |
+ pc_val = wPCA(log(e + 1),1 - fnr_pi,nu = dim_eval,filt = TRUE)$x |
|
139 |
+ } |
|
140 |
+ |
|
141 |
+ # Max cor with quality scores |
|
142 |
+ EXP_QPC_COR = max(cor(pc_val,qual_scores,method = "spearman")^2) |
|
143 |
+ |
|
144 |
+ # Max cor with housekeeping scores |
|
145 |
+ if(!is.null(HK_scores)){ |
|
146 |
+ EXP_HK_COR = max(cor(pc_val,HK_scores,method = "spearman")^2) |
|
147 |
+ }else{ |
|
148 |
+ EXP_HK_COR = NA |
|
149 |
+ } |
|
150 |
+ |
|
151 |
+ # Max cor with differentially expressed scores |
|
152 |
+ if(!is.null(DE_scores)){ |
|
153 |
+ EXP_DE_COR = max(cor(pc_val,DE_scores,method = "spearman")^2) |
|
154 |
+ }else{ |
|
155 |
+ EXP_DE_COR = NA |
|
156 |
+ } |
|
157 |
+ |
|
158 |
+ # Max cor with cell cycle scores |
|
159 |
+ if(!is.null(CC_scores)){ |
|
160 |
+ EXP_CC_COR = max(cor(pc_val,CC_scores,method = "spearman")^2) |
|
161 |
+ }else{ |
|
162 |
+ EXP_CC_COR = NA |
|
163 |
+ } |
|
164 |
+ |
|
165 |
+ d = as.matrix(dist(pc_val,method = "euclidean")) |
|
166 |
+ |
|
167 |
+ # K-NN Condition |
|
168 |
+ if(!is.null(condition)){ |
|
169 |
+ m <- sapply(seq_len(nrow(d)), function(s) { |
|
170 |
+ ds <- d[,s] |
|
171 |
+ ra <- rank(ds) |
|
172 |
+ return(mean(condition[(ra <= K_NN+1) & (ra != 1)] == condition[s])) |
|
173 |
+ }) |
|
174 |
+ KNN_BIO = mean(m) |
|
175 |
+ }else{ |
|
176 |
+ KNN_BIO = NA |
|
177 |
+ } |
|
178 |
+ |
|
179 |
+ # K-NN Batch |
|
180 |
+ if(!is.null(batch)){ |
|
181 |
+ m <- sapply(seq_len(nrow(d)), function(s) { |
|
182 |
+ ds <- d[,s] |
|
183 |
+ ra <- rank(ds) |
|
184 |
+ return(mean(batch[(ra <= K_NN+1) & (ra != 1)] == batch[s])) |
|
185 |
+ }) |
|
186 |
+ KNN_BATCH = mean(m) |
|
187 |
+ }else{ |
|
188 |
+ KNN_BATCH = NA |
|
189 |
+ } |
|
190 |
+ |
|
191 |
+ # PAM Silh |
|
192 |
+ if(K_clust > 1){ |
|
193 |
+ pam_object = pam(pc_val,k = K_clust) |
|
194 |
+ PAM_SIL = pam_object$silinfo$avg.width |
|
195 |
+ clusters = pam_object$clustering |
|
196 |
+ }else{ |
|
197 |
+ PAM_SIL = NA |
|
198 |
+ clusters = NA |
|
199 |
+ } |
|
200 |
+ scores = c(EXP_QPC_COR,EXP_HK_COR,EXP_DE_COR,EXP_CC_COR,KNN_BIO,KNN_BATCH,PAM_SIL) |
|
201 |
+ names(scores) = c("EXP_QPC_COR","EXP_HK_COR","EXP_DE_COR","EXP_CC_COR","KNN_BIO","KNN_BATCH","PAM_SIL") |
|
202 |
+ return(list(scores = scores, pc_val = pc_val, clusters = clusters)) |
|
203 |
+} |
|
204 |
+ |
|
205 |
+## ===== SCONE: Single-Cell Overview of Normalized Expression data ===== |
|
206 |
+# Intermediate wrapper is missing" |
|
207 |
+# Decide model for the user - describe reason -" |
|
208 |
+# One batch one condition - user friendly |
|
209 |
+# Check rank of design matrix - if nested - we're good - IFF? |
|
210 |
+# Positive (evaluate) and negative controls (modeled + evaluate) |
|
211 |
+# show correlation between q and hk - why we don't try all |
|
212 |
+# Default number of dim_UV |
|
213 |
+# Factor free norms and adjust norms as lists = impute or not, positive data or not, only q , only hk |
|
214 |
+# function 1, function 2, and k |
|
215 |
+SCONE = function(e,condition = NULL, batch = NULL, |
|
216 |
+ design = c("factorial","nested"), |
|
217 |
+ nested_model = c("fixed","random"), |
|
218 |
+ qual = NULL, is_HK = NULL, |
|
219 |
+ is_DE = NULL,is_CC = NULL, |
|
220 |
+ dim_UV = 5, |
|
221 |
+ dim_eval = 3, K_clust = NULL, K_NN = 10, |
|
222 |
+ out_dir = NULL, |
|
223 |
+ K_clust_MAX = 10,K_pseudobatch_MAX = 10, |
|
224 |
+ factor_free_only = FALSE, to_store = FALSE, n_workers = 10){ |
|
225 |
+ param = MulticoreParam(workers = n_workers) |
|
226 |
+ ## ----- Set up output directory ---- |
|
227 |
+ if (file.exists(out_dir)){ |
|
228 |
+ #stop("Designated output directory already exists.") |
|
229 |
+ } else { |
|
230 |
+ dir.create(out_dir) |
|
231 |
+ } |
|
232 |
+ |
|
233 |
+ ## ----- Imputation Step ---- |
|
234 |
+ print("Imputation Step...") |
|
235 |
+ HK_default = FALSE |
|
236 |
+ if(is.null(is_HK)){ |
|
237 |
+ warning("No control genes have been specified, using all genes for FNR estimation.") |
|
238 |
+ HK_default = TRUE |
|
239 |
+ is_HK = rep(TRUE ,nrow(e)) |
|
240 |
+ } |
|
241 |
+ if(file.exists(paste0(out_dir,"/fnr_out.Rdata"))){ |
|
242 |
+ load(paste0(out_dir,"/fnr_out.Rdata")) |
|
243 |
+ }else{ |
|
244 |
+ fnr_out = estimateFNR(e, bulk_model = TRUE, is.expressed = is_HK) |
|
245 |
+ save(fnr_out,file = paste0(out_dir,"/fnr_out.Rdata")) |
|
246 |
+ } |
|
247 |
+ fnr_mu = exp(fnr_out$Alpha[1,]) |
|
248 |
+ fnr_pi = (e == 0) * fnr_out$Z |
|
249 |
+ imputed_e = e + (fnr_mu * fnr_pi) |
|
250 |
+ print("Complete.") |
|
251 |
+ |
|
252 |
+ ## ----- Generate Unwanted Factors ---- |
|
253 |
+ print("Generating Unwanted Factors for Evaluation...") |
|
254 |
+ if(file.exists(paste0(out_dir,"/evaluate_material.Rdata"))){ |
|
255 |
+ |
|
256 |
+ load(paste0(out_dir,"/evaluate_material.Rdata")) |
|
257 |
+ |
|
258 |
+ }else{ |
|
259 |
+ |
|
260 |
+ # Factors of alignment quality metrics |
|
261 |
+ if(is.null(qual)){ |
|
262 |
+ warning("No quality metrics specified. Total counts and efficiency used.") |
|
263 |
+ numba = colSums(e == 0) |
|
264 |
+ beta = colMeans(e == 0) |
|
265 |
+ qual = cbind(numba, log(beta/(1-beta))) |
|
266 |
+ } |
|
267 |
+ |
|
268 |
+ if(is.null(dim_UV)){ |
|
269 |
+ dim_UV = min(ncol(qual), sum(is_HK)) |
|
270 |
+ warning(paste("Number of unwanted factors not specified. Selecting minimum of number of quality features and number of control genes:",dim_UV)) |
|
271 |
+ if(dim_UV >= ncol(e)){ |
|
272 |
+ stop("Number of unwanted factors >= number of samples!") |
|
273 |
+ } |
|
274 |
+ } |
|
275 |
+ |
|
276 |
+ qual_scores = prcomp(qual, center = TRUE, scale = TRUE)$x[,1:dim_UV] |
|
277 |
+ |
|
278 |
+ if(!HK_default){ |
|
279 |
+ HK_scores = wPCA(log(e[is_HK,] + 1), 1 - fnr_pi[is_HK,], nu = dim_UV, filt = TRUE)$x |
|
280 |
+ }else{ |
|
281 |
+ HK_scores = NULL |
|
282 |
+ } |
|
283 |
+ |
|
284 |
+ if(!is.null(is_DE)){ |
|
285 |
+ DE_scores = wPCA(log(e[is_DE,] + 1), 1 - fnr_pi[is_DE,], nu = dim_eval, filt = TRUE)$x |
|
286 |
+ }else{ |
|
287 |
+ DE_scores = NULL |
|
288 |
+ } |
|
289 |
+ |
|
290 |
+ if(!is.null(is_CC)){ |
|
291 |
+ CC_scores = wPCA(log(e[is_CC,] + 1), 1 - fnr_pi[is_CC,], nu = dim_eval, filt = TRUE)$x |
|
292 |
+ }else{ |
|
293 |
+ CC_scores = NULL |
|
294 |
+ } |
|
295 |
+ |
|
296 |
+ ## ----- Evaluation Criteria ---- |
|
297 |
+ if(is.null(K_clust)){ |
|
298 |
+ if(is.null(DE_scores)){ |
|
299 |
+ warning("No clustering K specified. Selecting K using PAM average silhouette width on raw wPCA.") |
|
300 |
+ pc_val = wPCA(log(e + 1), 1 - fnr_pi, nu = dim_eval, filt = TRUE)$x |
|
301 |
+ }else{ |
|
302 |
+ warning("No clustering K specified. Selecting K using PAM average silhouette width on DE scores.") |
|
303 |
+ pc_val = DE_scores |
|
304 |
+ } |
|
305 |
+ |
|
306 |
+ pamk.best <- pamk(data = pc_val, krange = 1:K_clust_MAX) |
|
307 |
+ K_clust = pamk.best$nc |
|
308 |
+ } |
|
309 |
+ |
|
310 |
+ data_dir = paste0(out_dir,"/data/") |
|
311 |
+ if (file.exists(data_dir)){ |
|
312 |
+ #stop("Designated data directory already exists.") |
|
313 |
+ } else { |
|
314 |
+ dir.create(data_dir) |
|
315 |
+ } |
|
316 |
+ |
|
317 |
+ evaluate_material = list(condition = condition, batch = batch, |
|
318 |
+ qual_scores = qual_scores, HK_scores = HK_scores, |
|
319 |
+ DE_scores = DE_scores, CC_scores = CC_scores, |
|
320 |
+ dim_eval = dim_eval, K_clust = K_clust, K_NN = K_NN, |
|
321 |
+ fnr_pi = fnr_pi, out_dir = data_dir) |
|
322 |
+ |
|
323 |
+ save(evaluate_material,file = paste0(out_dir,"/evaluate_material.Rdata")) |
|
324 |
+ |
|
325 |
+ } |
|
326 |
+ |
|
327 |
+ print("Complete.") |
|
328 |
+ |
|
329 |
+ |
|
330 |
+ ## ----- Factor-Free Normalization |
|
331 |
+ ## DR: major change: the current code will copy over and over the same matrices (e and imputed_e). |
|
332 |
+ ## There's no need to do so as we can infer what matrix to use from the name. |
|
333 |
+ |
|
334 |
+ task_list = list() |
|
335 |
+ |
|
336 |
+ # Generate Task List for Non-imputed |
|
337 |
+ task_names = paste("NOIMPUTE",c("NONE","UQ","UQP","DESEQ","DESEQP","FQ","FQP","TMM"),sep = "_") |
|
338 |
+ |
|
339 |
+ ##DR: why not TMM__FN_POS? |
|
340 |
+ task_FN = c(identity,UQ_FN,UQ_FN_POS,DESEQ_FN,DESEQ_FN_POS,FQ_FN,FQ_FN_POS,TMM_FN) |
|
341 |
+ names(task_FN) = task_names |
|
342 |
+ for (task in task_names){ |
|
343 |
+ task_list[[task]] = list(FN = task_FN[[task]], nom = task) |
|
344 |
+ } |
|
345 |
+ # Generate Task List for Imputed |
|
346 |
+ task_names = paste("IMPUTE",c("NONE","UQ","DESEQ","FQ","TMM"),sep = "_") |
|
347 |
+ task_FN = c(identity,UQ_FN,DESEQ_FN,FQ_FN,TMM_FN) |
|
348 |
+ names(task_FN) = task_names |
|
349 |
+ for (task in task_names){ |
|
350 |
+ task_list[[task]] = list(FN = task_FN[[task]], nom = task) |
|
351 |
+ } |
|
352 |
+ |
|
353 |
+ # Parallel Normalization + Evaluation |
|
354 |
+ print("Factor-Free Normalization and Evaluation...") |
|
355 |
+ factor_free_out_noimpute = bplapply(task_list[grep("^NOIMPUTE", names(task_list))], FUN = runFactorFreeTask, evaluate_material = evaluate_material, emat=e, BPPARAM = param) |
|
356 |
+ factor_free_out_impute = bplapply(task_list[grep("^IMPUTE", names(task_list))], FUN = runFactorFreeTask, evaluate_material = evaluate_material, emat=imputed_e, BPPARAM = param) |
|
357 |
+ factor_free_out <- c(factor_free_out_noimpute, factor_free_out_impute) |
|
358 |
+ print("Complete.") |
|
359 |
+ |
|
360 |
+ ## ----- Selection Step (Optional) |
|
361 |
+ if(factor_free_only){ |
|
362 |
+ return(list(factor_free_out = factor_free_out, factor_based_out = NA)) |
|
363 |
+ } |
|
364 |
+ ## ----- Factor-Based Normalization |
|
365 |
+ |
|
366 |
+ print("Selecting Base Tasks...") |
|
367 |
+ base_task_list <- lapply(1:length(factor_free_out), function(i) { |
|
368 |
+ if(!any(is.na(factor_free_out[[i]]$eo) | is.nan(factor_free_out[[i]]$eo) | is.infinite(factor_free_out[[i]]$eo))) { |
|
369 |
+ return(list(ei = factor_free_out[[i]]$eo, nom = names(factor_free_out)[i])) |
|
370 |
+ } |
|
371 |
+ }) |
|
372 |
+ names(base_task_list) <- names(factor_free_out) |
|
373 |
+ base_task_list <- base_task_list[!sapply(base_task_list, is.null)] |
|
374 |
+ print("Complete.") |
|
375 |
+ |
|
376 |
+ |
|
377 |
+ # WEIGHT |
|
378 |
+ print("Extending Tasks by Weighting Scheme...") |
|
379 |
+ ext_task_list = list() |
|
380 |
+ for( task in names(base_task_list)){ |
|
381 |
+ base_nom = task |
|
382 |
+ if(!grepl("^IMPUTE",task)){ |
|
383 |
+ ext_nom = paste(base_nom,"ZEROWEIGHT",sep = "_") |
|
384 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
385 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
386 |
+ ext_task_list[[ext_nom]]$w = 0 + (ext_task_list[[ext_nom]]$ei > 0) |
|
387 |
+ } |
|
388 |
+ ext_nom = paste(base_nom,"NOWEIGHT",sep = "_") |
|
389 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
390 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
391 |
+ ext_task_list[[ext_nom]]$w = NULL |
|
392 |
+ } |
|
393 |
+ print("Complete.") |
|
394 |
+ |
|
395 |
+ base_task_list = ext_task_list |
|
396 |
+ |
|
397 |
+ # Generate Method-Specific Factors (Should be parallel) |
|
398 |
+ print("Computing Method-Specific Factors...") |
|
399 |
+ |
|
400 |
+ task_factor_list = list() |
|
401 |
+ for( task in names(base_task_list)){ |
|
402 |
+ |
|
403 |
+ # HK |
|
404 |
+ if(!HK_default){ |
|
405 |
+ if(!is.null(base_task_list[[task]]$w)){ |
|
406 |
+ task_factor_list[[task]]$HK_scores = wPCA(x = log(base_task_list[[task]]$ei[is_HK,]+1),w = base_task_list[[task]]$w[is_HK,],nu = dim_UV,filt = TRUE)$x |
|
407 |
+ }else{ |
|
408 |
+ task_factor_list[[task]]$HK_scores = prcomp(t(log(base_task_list[[task]]$ei[is_HK,]+1)))$x[,1:dim_UV] |
|
409 |
+ } |
|
410 |
+ |
|
411 |
+ if(is.null(K_pseudobatch)){ |
|
412 |
+ pamk_object = pamk(task_factor_list[[task]]$HK_scores,krange = 1:K_pseudobatch_MAX) |
|
413 |
+ task_factor_list[[task]]$hk_nc = pamk_object$nc |
|
414 |
+ task_factor_list[[task]]$pam_clustering = pamk_object$pamobject$clustering |
|
415 |
+ }else{ |
|
416 |
+ task_factor_list[[task]]$hk_nc = K_pseudobatch |
|
417 |
+ task_factor_list[[task]]$pam_clustering = pam(task_factor_list[[task]]$HK_scores,k = K_pseudobatch)$clustering |
|
418 |
+ } |
|
419 |
+ }else{ |
|
420 |
+ task_factor_list[[task]]$HK_scores = NULL |
|
421 |
+ task_factor_list[[task]]$hk_nc = NULL |
|
422 |
+ task_factor_list[[task]]$pam_clustering = NULL |
|
423 |
+ } |
|
424 |
+ } |
|
425 |
+ print("Complete.") |
|
426 |
+ |
|
427 |
+ # Generate General Factor Clusterings |
|
428 |
+ print("Generating General Factor-Based Clusters...") |
|
429 |
+ if(is.null(K_pseudobatch)){ |
|
430 |
+ pamk_object = pamk(evaluate_material$qual_scores,krange = 1:K_pseudobatch_MAX) |
|
431 |
+ qual_nc = pamk_object$nc |
|
432 |
+ qual_clustering = as.factor(pamk_object$pamobject$clustering) |
|
433 |
+ }else{ |
|
434 |
+ qual_nc = K_pseudobatch |
|
435 |
+ qual_clustering = as.factor(pam(evaluate_material$qual_scores,k = K_pseudobatch)$clustering) |
|
436 |
+ } |
|
437 |
+ print("Complete.") |
|
438 |
+ |
|
439 |
+ # BIO |
|
440 |
+ print("Extending Tasks by Biological Covariate...") |
|
441 |
+ ext_task_list = list() |
|
442 |
+ for( task in names(base_task_list)){ |
|
443 |
+ base_nom = task |
|
444 |
+ |
|
445 |
+ if(!is.null(condition)){ |
|
446 |
+ ext_nom = paste(base_nom,"BIO",sep = "_") |
|
447 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
448 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
449 |
+ ext_task_list[[ext_nom]]$condition = condition |
|
450 |
+ } |
|
451 |
+ ext_nom = paste(base_nom,"NOBIO",sep = "_") |
|
452 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
453 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
454 |
+ ext_task_list[[ext_nom]]$condition = NULL |
|
455 |
+ } |
|
456 |
+ print("Complete.") |
|
457 |
+ |
|
458 |
+ print("Extending Tasks by Batch Covariate...") |
|
459 |
+ base_task_list = ext_task_list |
|
460 |
+ # BATCH |
|
461 |
+ ext_task_list = list() |
|
462 |
+ for( task in names(base_task_list)){ |
|
463 |
+ base_nom = task |
|
464 |
+ |
|
465 |
+ if(!is.null(batch)){ |
|
466 |
+ ext_nom = paste(base_nom,"BATCH",sep = "_") |
|
467 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
468 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
469 |
+ ext_task_list[[ext_nom]]$batch = batch |
|
470 |
+ } |
|
471 |
+ ext_nom = paste(base_nom,"NOBATCH",sep = "_") |
|
472 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
473 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
474 |
+ ext_task_list[[ext_nom]]$batch = NULL |
|
475 |
+ } |
|
476 |
+ print("Complete.") |
|
477 |
+ |
|
478 |
+ print("Extending Tasks by UV Covariates...") |
|
479 |
+ base_task_list = ext_task_list |
|
480 |
+ # UV |
|
481 |
+ ext_task_list = list() |
|
482 |
+ for( task in names(base_task_list)){ |
|
483 |
+ base_nom = task |
|
484 |
+ method_class = gsub("WEIGHT_.*","WEIGHT",task) |
|
485 |
+ |
|
486 |
+ ext_nom = paste(base_nom,"NOUV",sep = "_") |
|
487 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
488 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
489 |
+ ext_task_list[[ext_nom]]$uv = NULL |
|
490 |
+ |
|
491 |
+ # HK UV |
|
492 |
+ if(!HK_default){ |
|
493 |
+ for(j in 1:dim_UV){ |
|
494 |
+ ext_nom = paste(base_nom,paste("HK",j,sep = "_"),sep = "_") |
|
495 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
496 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
497 |
+ ext_task_list[[ext_nom]]$uv = as.matrix(task_factor_list[[method_class]]$HK_scores[,1:j]) |
|
498 |
+ } |
|
499 |
+ # if(task_factor_list[[method_class]]$hk_nc > 1){ |
|
500 |
+ # ext_nom = paste(base_nom,"HKCLUST",sep = "_") |
|
501 |
+ # ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
502 |
+ # ext_task_list[[ext_nom]]$nom = ext_nom |
|
503 |
+ # ext_task_list[[ext_nom]]$uv = as.matrix(as.factor(task_factor_list[[method_class]]$pam_clustering)) |
|
504 |
+ # } |
|
505 |
+ } |
|
506 |
+ # Qual UV |
|
507 |
+ for(j in 1:dim_UV){ |
|
508 |
+ ext_nom = paste(base_nom,paste("Q",j,sep = "_"),sep = "_") |
|
509 |
+ ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
510 |
+ ext_task_list[[ext_nom]]$nom = ext_nom |
|
511 |
+ ext_task_list[[ext_nom]]$uv = as.matrix(evaluate_material$qual_scores[,1:j]) |
|
512 |
+ } |
|
513 |
+ # if(qual_nc > 1){ |
|
514 |
+ # ext_nom = paste(base_nom,"QCLUST",sep = "_") |
|
515 |
+ # ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
516 |
+ # ext_task_list[[ext_nom]]$nom = ext_nom |
|
517 |
+ # ext_task_list[[ext_nom]]$uv = as.matrix(as.factor(qual_clustering)) |
|
518 |
+ # } |
|
519 |
+ } |
|
520 |
+ print("Complete.") |
|
521 |
+ ext_task_list = ext_task_list[!grepl("NOBATCH_NOUV",names(ext_task_list))] |
|
522 |
+ if(!to_store){ |
|
523 |
+ finished_tasks = gsub(".Rdata","",list.files(evaluate_material$out_dir)) |
|
524 |
+ ext_task_list = ext_task_list[! (names(ext_task_list) %in% finished_tasks)] |
|
525 |
+ } |
|
526 |
+ print(length(ext_task_list)) |
|
527 |
+ print("Factor-Based Adjustment Normalization and Evaluation...") |
|
528 |
+ factor_based_out = bplapply(ext_task_list,FUN = runAdjustTask, evaluate_material = evaluate_material,design = design,nested_model = nested_model,BPPARAM = param) |
|
529 |
+ print("Complete.") |
|
530 |
+ |
|
531 |
+ return(list(factor_free_out = factor_free_out, factor_based_out = factor_based_out)) |
|
532 |
+} |
0 | 533 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,392 @@ |
1 |
+library(DESeq,quietly = TRUE) |
|
2 |
+library(EDASeq,quietly = TRUE) |
|
3 |
+library(edgeR,quietly = TRUE) |
|
4 |
+library(sva,quietly = TRUE) |
|
5 |
+library(aroma.light,quietly = TRUE) |
|
6 |
+library(lme4,quietly = TRUE) |
|
7 |
+ |
|
8 |
+#' Upper-quartile normalization wrapper. |
|
9 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
10 |
+#' @return Upper-quartile normalized matrix (scaled by sample UQ) |
|
11 |
+UQ_FN = function(ei){ |
|
12 |
+ eo = betweenLaneNormalization(ei, which="upper", round = FALSE) |
|
13 |
+ return(eo) |
|
14 |
+} |
|
15 |
+ |
|
16 |
+uq_f <- function(which="upper", round = FALSE){ |
|
17 |
+ function(x) { |
|
18 |
+ betweenLaneNormalization(x, which = which, round = round) |
|
19 |
+ } |
|
20 |
+} |
|
21 |
+ |
|
22 |
+# setMethod(f="UQ_FN", |
|
23 |
+# signature="SummarizedExperiment", |
|
24 |
+# definition=function(x) { |
|
25 |
+# exprs(x) <- UQ_FN(exprs(x)) |
|
26 |
+# return(x) |
|
27 |
+# }) |
|
28 |
+# |
|
29 |
+# setMethod(f="UQ_FN", |
|
30 |
+# signature="matrix", |
|
31 |
+# definition= function(ei){ |
|
32 |
+# eo = betweenLaneNormalization(ei, which="upper", round = FALSE) |
|
33 |
+# return(eo) |
|
34 |
+# }) |
|
35 |
+ |
|
36 |
+#' Upper-quartile normalization applied to positive data. |
|
37 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
38 |
+#' @return Upper-quartile (positive) normalized matrix (scaled by sample UQ) |
|
39 |
+UQ_FN_POS = function(ei){ |
|
40 |
+ zei = ei |
|
41 |
+ zei[ei == 0] = NA |
|
42 |
+ q = apply(zei, 2, quantile, 0.75, na.rm = TRUE) |
|
43 |
+ zeo = t(t(zei)/q)*mean(q) |
|
44 |
+ eo = zeo |
|
45 |
+ eo[ei == 0] = 0 |
|
46 |
+ return(eo) |
|
47 |
+} |
|
48 |
+ |
|
49 |
+#' Full-Quantile normalization wrapper. |
|
50 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
51 |
+#' @return FQ normalized matrix. |
|
52 |
+FQ_FN = function(ei){ |
|
53 |
+ eo = normalizeQuantileRank.matrix(ei) |
|
54 |
+ return(eo) |
|
55 |
+} |
|
56 |
+ |
|
57 |
+#' Full-Quantile normalization applied to positive data. |
|
58 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
59 |
+#' @return FQ (positive) normalized matrix. |
|
60 |
+FQ_FN_POS = function(ei){ |
|
61 |
+ |
|
62 |
+ # Vector of integers used for computation |
|
63 |
+ base_rank = 1:nrow(ei) |
|
64 |
+ |
|
65 |
+ # Quantile Index Matrix: Values between 0 and 1 corresponding to quantile |
|
66 |
+ quant_mat = NULL |
|
67 |
+ # Re-ordered Data Matrix |
|
68 |
+ x_mat = NULL |
|
69 |
+ print("Sorting Matrix...") |
|
70 |
+ # For each sample: |
|
71 |
+ for (i in 1:dim(ei)[2]){ |
|
72 |
+ # Sort data and replace zeroes with NA |
|
73 |
+ x_mat = cbind(x_mat,rev(sort(ei[,i]))) |
|
74 |
+ x_mat[x_mat == 0 ] = NA |
|
75 |
+ # Compute corresponding quantile indices for that sample |
|
76 |
+ quant = base_rank/sum(ei[,i]>0) |
|
77 |
+ quant[quant > 1] = NA |
|
78 |
+ quant_mat = cbind(quant_mat,quant) |
|
79 |
+ } |
|
80 |
+ |
|
81 |
+ # Vector form of quantile index matrix |
|
82 |
+ quant_out = as.numeric(as.vector(quant_mat)) |
|
83 |
+ print("Complete.") |
|
84 |
+ print("Spline Interpolation...") |
|
85 |
+ # Interpolation Matrix (Values of all quantiles) |
|
86 |
+ inter_mat = rep(0,length(quant_out)) |
|
87 |
+ ob_counts = rep(0,length(quant_out)) # Number of observations for averaging |
|
88 |
+ # For each sample |
|
89 |
+ for (i in 1:dim(ei)[2]){ |
|
90 |
+ # Produce spline interpolation for that sample, value ~ quantile index |
|
91 |
+ x1 = na.omit(quant_mat[,i]) |
|
92 |
+ y1 = na.omit(x_mat[,i]) |
|
93 |
+ # Evaluated at all possible quantile indices |
|
94 |
+ inter = approx(x1,y1,xout = quant_out, rule = 2)$y |
|
95 |
+ ob_counts = ob_counts + !is.na(inter) |
|
96 |
+ inter[is.na(inter)] = 0 |
|
97 |
+ inter_mat = inter_mat + inter |
|
98 |
+ } |
|
99 |
+ print("Complete.") |
|
100 |
+ |
|
101 |
+ # Average over the interpolated values from all samples |
|
102 |
+ inter_mean = inter_mat/ob_counts |
|
103 |
+ |
|
104 |
+ ## Substituting Mean Interpolated Values for Expression Values and Return |
|
105 |
+ eo = matrix(inter_mean,ncol = dim(ei)[2]) |
|
106 |
+ eo[is.na(eo)] = 0 |
|
107 |
+ for (i in 1:dim(ei)[2]){ |
|
108 |
+ eo[,i] = rev(eo[,i])[order(order(ei[,i]))] |
|
109 |
+ } |
|
110 |
+ print("Finished!") |
|
111 |
+ return(eo) |
|
112 |
+} |
|
113 |
+ |
|
114 |
+#' DESeq normalization wrapper. |
|
115 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
116 |
+#' @return DESeq normalized matrix (scaled by sample ) |
|
117 |
+DESEQ_FN = function(ei){ |
|
118 |
+ size_fac = estimateSizeFactorsForMatrix(ei) |
|
119 |
+ eo = t(t(ei)/size_fac) |
|
120 |
+ return(eo) |
|
121 |
+} |
|
122 |
+ |
|
123 |
+#' DESeq normalization applied to positive data. |
|
124 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
125 |
+#' @return DESeq (positive) normalized matrix (scaled by sample ) |
|
126 |
+DESEQ_FN_POS = function(ei){ |
|
127 |
+ if(any(ei < 0)){ |
|
128 |
+ stop("Negative values in input.") |
|
129 |
+ } |
|
130 |
+ if(!is.null(dim(ei))){ |
|
131 |
+ y = ei |
|
132 |
+ y[y == 0] = NA # Matrix with zeroes replaced w/ NA |
|
133 |
+ geom_mean = exp(apply(log(y),1,sum,na.rm = TRUE)/rowSums(!is.na(y))) # Compute Geometric Mean of Expression for Each Gene (Use positive data only) |
|
134 |
+ }else{ |
|
135 |
+ stop("Null imput matrix dimension.") |
|
136 |
+ } |
|
137 |
+ if(!any(geom_mean > 0)){ |
|
138 |
+ stop("Geometric mean non-positive for all genes.") |
|
139 |
+ } |
|
140 |
+ ratios = ei / geom_mean # Divide each Expression Value by Geometric Mean of Corresponding Gene |
|
141 |
+ if(any(is.infinite(geom_mean))){ |
|
142 |
+ stop("Infinite mean! This should never happen :-<") |
|
143 |
+ } |
|
144 |
+ ratios = ratios[geom_mean > 0,] # Ignore Genes with Zero Mean |
|
145 |
+ y = ratios |
|
146 |
+ y[y == 0] = NA |
|
147 |
+ size = apply(y,2,median,na.rm = TRUE) # Size factor taken as median of ratios (positive data only) |
|
148 |
+ if(any(size == 0)){ |
|
149 |
+ stop("Zero library size. This should never happen :-(") |
|
150 |
+ } |
|
151 |
+ eo = t(t(ei)/size)*mean(size) |
|
152 |
+ return(eo) |
|
153 |
+} |
|
154 |
+ |
|
155 |
+#' TMM normalization wrapper. |
|
156 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
157 |
+#' @return TMM normalized matrix (scaled by sample ) |
|
158 |
+TMM_FN = function(ei){ |
|
159 |
+ size_fac = calcNormFactors(ei,method = "TMM") |
|
160 |
+ eo = t(t(ei)/size_fac) |
|
161 |
+ return(eo) |
|
162 |
+} |
|
163 |
+ |
|
164 |
+#' Output residuals + intercept of fit to quality score matrix |
|
165 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
166 |
+#' @param default_args_list = list of default arguments |
|
167 |
+#' @param args_list = list of user-provided arguments |
|
168 |
+#' @return Matrix adjusted for K quality scores |
|
169 |
+ |
|
170 |
+QUALREG_FN = function(ei, default_args_list, args_list){ |
|
171 |
+ EPSILON = default_args_list$EPSILON |
|
172 |
+ scores = prcomp(default_args_list$q,center = TRUE,scale = TRUE)$x[,1:default_args_list$K] |
|
173 |
+ re = log(ei + EPSILON) |
|
174 |
+ for (g in 1:dim(ei)[1]){ |
|
175 |
+ lin.mod = lm(re[g,] ~ scores) |
|
176 |
+ re[g,] = lin.mod$coefficients[1] + lin.mod$residuals |
|
177 |
+ } |
|
178 |
+ re = exp(re) |
|
179 |
+ re[e == 0] = 0 |
|
180 |
+ return(re) |
|
181 |
+} |
|
182 |
+ |
|
183 |
+#' Output residuals + intercept of fit to control-gene derived factors |
|
184 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
185 |
+#' @param default_args_list = list of default arguments |
|
186 |
+#' @param args_list = list of user-provided arguments |
|
187 |
+#' @return Matrix adjusted for K unwanted factors |
|
188 |
+ |
|
189 |
+RUVG_FN = function(ei, default_args_list, args_list){ |
|
190 |
+ EPSILON = default_args_list$EPSILON |
|
191 |
+ scores = prcomp(t(log(ei[default_args_list$control_genes,]+default_args_list$EPSILON)),center = TRUE,scale. = FALSE)$x[,1:default_args_list$K] |
|
192 |
+ re = log(ei + default_args_list$EPSILON) |
|
193 |
+ for (g in 1:dim(ei)[1]){ |
|
194 |
+ lin.mod = lm(re[g,] ~ scores) |
|
195 |
+ re[g,] = lin.mod$coefficients[1] + lin.mod$residuals |
|
196 |
+ } |
|
197 |
+ re = exp(re) |
|
198 |
+ re[e == 0] = 0 |
|
199 |
+ return(re) |
|
200 |
+} |
|
201 |
+ |
|
202 |
+#' ComBat Wrapper |
|
203 |
+#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
204 |
+#' @param default_args_list = list of default arguments |
|
205 |
+#' @param args_list = list of user-provided arguments |
|
206 |
+#' @return Matrix adjusted for batch |
|
207 |
+ |
|
208 |
+COMBAT_FN = function(ei, default_args_list, args_list){ |
|
209 |
+ EPSILON = default_args_list$EPSILON |
|
210 |
+ SD_EPSILON = default_args_list$SD_EPSILON |
|
211 |
+ ei = log(ei + default_args_list$EPSILON) |
|
212 |
+ pheno = as.data.frame(cbind(default_args_list$tech_batch,default_args_list$bio_cond)) |
|
213 |
+ if(is.null(default_args_list$bio_cond) || any(is.na(default_args_list$bio_cond))){ |
|
214 |
+ colnames(pheno) = c("batch") |
|
215 |
+ is_var = apply(ei,1,sd) > SD_EPSILON |
|
216 |
+ mod = model.matrix(~ 1, data = pheno) |
|
217 |
+ }else{ |
|
218 |
+ colnames(pheno) = c("batch","phenotype") |
|
219 |
+ is_var = T |
|
220 |
+ for (p in unique(default_args_list$bio_cond)){ |
|
221 |
+ is_var = is_var & (apply(ei[,default_args_list$bio_cond == p],1,sd) > SD_EPSILON) |
|
222 |
+ } |
|
223 |
+ mod = model.matrix(~ as.factor(phenotype), data = pheno) |
|
224 |
+ } |
|
225 |
+ eo = ei |
|
226 |
+ eo[is_var,] = ComBat(dat=ei[is_var,], batch=default_args_list$tech_batch, mod=mod, par.prior=TRUE, prior.plots=FALSE) |
|
227 |
+ eo = exp(eo) |
|
228 |
+ eo[ei == 0] = 0 |
|
229 |
+ return(eo) |
|
230 |
+} |
|
231 |
+ |
|
232 |
+NESTED_BATCH_FIXED_FN = function(ei,bio,batch,uv = NULL,w = NULL){ |
|
233 |
+ if(!is.null(bio)){ |
|
234 |
+ bio = factor(bio, levels = sort(levels(bio))) |
|
235 |
+ batch = factor(batch, levels = unique(batch[order(bio)])) |
|
236 |
+ |
|
237 |
+ n_vec <- tapply(batch, bio, function(x) nlevels(droplevels(x))) |
|
238 |
+ }else{ |
|
239 |
+ n_vec = nlevels(droplevels(batch)) |
|
240 |
+ } |
|
241 |
+ |
|
242 |
+ mat = matrix(0,nrow = sum(n_vec),ncol = sum(n_vec - 1)) |
|
243 |
+ xi = 1 |
|
244 |
+ yi = 1 |
|
245 |
+ for(i in 1:length(n_vec)){ |
|
246 |
+ if(n_vec[i] > 1){ |
|
247 |
+ cs = contr.sum(n_vec[i]) |
|
248 |
+ dd = dim(cs) |
|
249 |
+ mat[xi:(xi + dd[1] - 1),yi:(yi + dd[2] - 1)] = cs |
|
250 |
+ xi = xi + dd[1] |
|
251 |
+ yi = yi + dd[2] |
|
252 |
+ }else{ |
|
253 |
+ xi = xi + 1 |
|
254 |
+ } |
|
255 |
+ } |
|
256 |
+ |
|
257 |
+ leo = log(ei+1) |
|
258 |
+ if(is.null(uv)){ |
|
259 |
+ if(!is.null(bio)) { |
|
260 |
+ design_mat <- model.matrix(~ bio + batch, contrasts=list(bio=contr.sum, batch=mat)) |
|
261 |
+ } else { |
|
262 |
+ design_mat <- model.matrix(~ batch, contrasts=list(batch=mat)) |
|
263 |
+ } |
|
264 |
+ lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
265 |
+ |
|
266 |
+ bind <- (ncol(lm_object$coefficients) - (sum(n_vec - 1) - 1)):(ncol(lm_object$coefficients)) |
|
267 |
+ leo = leo - t(attr(design_mat,"contrasts")$batch %*% t(lm_object$coefficients[,bind]))[,batch] |
|
268 |
+ } else { |
|
269 |
+ if(!is.null(bio)) { |
|
270 |
+ design_mat <- model.matrix(~ bio + batch + uv, contrasts=list(bio=contr.sum, batch=mat)) |
|
271 |
+ } else { |
|
272 |
+ design_mat <- model.matrix(~ batch + uv, contrasts=list(batch=mat)) |
|
273 |
+ } |
|
274 |
+ lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
275 |
+ |
|
276 |
+ uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
277 |
+ bind = (ncol(lm_object$coefficients) - dim(uv)[2] - (sum(n_vec - 1) - 1)):(ncol(lm_object$coefficients) - dim(uv)[2]) |
|
278 |
+ leo = leo - t(attr(design_mat,"contrasts")$batch %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
279 |
+ } |
|
280 |
+ return(leo) |
|
281 |
+} |
|
282 |
+ |
|
283 |
+NESTED_BATCH_RANDOM_FN = function(ei,bio,batch,uv = NULL,w = NULL){ |
|
284 |
+ design <- model.matrix(~batch - 1) |
|
285 |
+ leo = log(ei+1) |
|
286 |
+ if(is.null(uv)){ |
|
287 |
+ for(i in 1:dim(ei)[1]){ |
|
288 |
+ lm_object = lmer(leo[i,] ~ 1 + bio + (1 | batch),weights = w[i,]) |
|
289 |
+ betaj <- ranef(lm_object)[[1]][,1] |
|
290 |
+ leo[i,] = leo[i,] - as.vector(design %*% betaj) |
|
291 |
+ } |
|
292 |
+ }else{ |
|
293 |
+ for(i in 1:dim(ei)[1]){ |
|
294 |
+ lm_object = lmer(leo[i,] ~ 1 + bio + uv + (1 | batch), |
|
295 |
+ weights = w[i,]) |
|
296 |
+ betaj <- ranef(lm_object)[[1]][,1] |
|
297 |
+ uvind = (length(coef(lm_object)[[1]][1,]) - dim(uv)[2] + 1):(length(coef(lm_object)[[1]][1,])) |
|
298 |
+ leo[i,] = leo[i,] - as.vector(design %*% betaj) - as.vector(uv %*% t(coef(lm_object)[[1]][1,][uvind])) |
|
299 |
+ } |
|
300 |
+ } |
|
301 |
+ return(leo) |
|
302 |
+} |
|
303 |
+ |
|
304 |
+FACTORIAL_BATCH_FN = function(ei,bio,batch,uv = NULL,w = NULL){ |
|
305 |
+ contrasts <- contr.treatment(length(levels(batch))) |
|
306 |
+ rownames(contrasts) = levels(batch) |
|
307 |
+ leo = log(ei+1) |
|
308 |
+ |
|
309 |
+ if(is.null(bio)){ |
|
310 |
+ if(is.null(uv)){ |
|
311 |
+ |
|
312 |
+ # Just Batch |
|
313 |
+ design_mat <- model.matrix(~batch) |
|
314 |
+ lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
315 |
+ bind = (ncol(lm_object$coefficients) + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients)) |
|
316 |
+ leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] |
|
317 |
+ } else { |
|
318 |
+ |
|
319 |
+ # Batch + UV |
|
320 |
+ design_mat <- model.matrix(~batch + uv) |
|
321 |
+ lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
322 |
+ uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
323 |
+ bind = (ncol(lm_object$coefficients) - dim(uv)[2] + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients) - dim(uv)[2]) |
|
324 |
+ leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
325 |
+ } |
|
326 |
+ } else { |
|
327 |
+ if(is.null(uv)){ |
|
328 |
+ |
|
329 |
+ # Batch + BIO |
|
330 |
+ design_mat <- model.matrix(~bio + batch) |
|
331 |
+ lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
332 |
+ bind = (ncol(lm_object$coefficients) + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients)) |
|
333 |
+ leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] |
|
334 |
+ |
|
335 |
+ } else { |
|
336 |
+ |
|
337 |
+ # Batch + UV + BIO |
|
338 |
+ design_mat <- model.matrix(~bio + batch + uv) |
|
339 |
+ lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
340 |
+ uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
341 |
+ bind = (ncol(lm_object$coefficients) - dim(uv)[2] + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients) - dim(uv)[2]) |
|
342 |
+ leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
343 |
+ } |
|
344 |
+ } |
|
345 |
+ return(leo) |
|
346 |
+} |
|
347 |
+ |
|
348 |
+BATCHFREE_FN = function(ei,bio,uv = NULL,w = NULL){ |
|
349 |
+ leo = log(ei+1) |
|
350 |
+ |
|
351 |
+ if(is.null(bio)){ |
|
352 |
+ # Just UV |
|
353 |
+ design_mat <- model.matrix(~uv) |
|
354 |
+ lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
355 |
+ uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
356 |
+ leo = leo - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
357 |
+ } else { |
|
358 |
+ # Bio + UV |
|
359 |
+ design_mat <- model.matrix(~bio + uv) |
|
360 |
+ lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
361 |
+ uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
362 |
+ leo2 = leo - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
363 |
+ } |
|
364 |
+ return(leo) |
|
365 |
+} |
|
366 |
+ |
|
367 |
+ADJUST_FN = function(ei,batch = NULL, |
|
368 |
+ bio = NULL, |
|
369 |
+ uv = NULL, |
|
370 |
+ w = NULL, |
|
371 |
+ design = c("factorial","nested"), |
|
372 |
+ nested_model = c("fixed","random")){ |
|
373 |
+ |
|
374 |
+ if(!is.null(batch)){ |
|
375 |
+ design <- match.arg(design) |
|
376 |
+ |
|
377 |
+ if(design == "factorial"){ |
|
378 |
+ leo = FACTORIAL_BATCH_FN(ei,bio,batch,uv,w) |
|
379 |
+ }else{ |
|
380 |
+ nested_model <- match.arg(nested_model) |
|
381 |
+ |
|
382 |
+ if(nested_model == "fixed"){ |
|
383 |
+ leo = NESTED_BATCH_FIXED_FN(ei,bio,batch,uv,w) |
|
384 |
+ }else{ |
|
385 |
+ leo = NESTED_BATCH_RANDOM_FN(ei,bio,batch,uv,w) |
|
386 |
+ } |
|
387 |
+ } |
|
388 |
+ }else{ |
|
389 |
+ leo = BATCHFREE_FN(ei,bio,uv,w) |
|
390 |
+ } |
|
391 |
+ return(leo) |
|
392 |
+} |
|
0 | 393 |
\ No newline at end of file |
1 | 394 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,69 @@ |
1 |
+out_dir = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data" |
|
2 |
+flist = paste(out_dir,list.files(out_dir),sep = "/") |
|
3 |
+ |
|
4 |
+# Scoring |
|
5 |
+scores = NULL |
|
6 |
+snames = NULL |
|
7 |
+for(fil in flist){ |
|
8 |
+ load(fil) |
|
9 |
+ nom = scone_out$method |
|
10 |
+ sc = scone_out$evaluation |
|
11 |
+ if(!is.na(sc)[1]){ |
|
12 |
+ print(nom) |
|
13 |
+ snames = c(snames,nom) |
|
14 |
+ scores = rbind(scores,sc$scores) |
|
15 |
+ }else{ |
|
16 |
+ # print(nom) |
|
17 |
+ } |
|
18 |
+} |
|
19 |
+rownames(scores) = snames |
|
20 |
+escores = t(t(scores)*c(-1,-1,1,1,1,-1,1)) |
|
21 |
+colnames(escores) = paste(colnames(scores),c("_INV","")[1.5 + c(-1,-1,1,1,1,-1,1)/2],sep = "") |
|
22 |
+ |
|
23 |
+scores = t(na.omit(t(scores))) |
|
24 |
+escores = t(na.omit(t(escores))) |
|
25 |
+ |
|
26 |
+# Score Heatmaps |
|
27 |
+require(gplots) |
|
28 |
+heatmap.2(t(apply(scores ,2,rank)[order(-apply((apply(escores ,2,rank)),1,min)),][1:20,]),density.info = 'n',trace = 'n',key.xlab = "Score",key.title = NA, |
|
29 |
+ Colv = NA,col = colorRampPalette(c("purple","black","yellow"))(100), |
|
30 |
+ margins = c(15,10),cexRow = .75,cexCol = .5) |
|
31 |
+title(main = "Top 20 SCONE",cex.lab=0.5) |
|
32 |
+ |
|
33 |
+my_rank = (1 + max(rank(apply((apply(escores ,2,rank)),1,min))) - rank(apply((apply(escores ,2,rank)),1,min))) |
|
34 |
+ |
|
35 |
+names(which(my_rank == 1)) |
|
36 |
+load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data/NOIMPUTE_NONE.Rdata") |
|
37 |
+is_zero = scone_out$exp_mat == 0 |
|
38 |
+load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data/NOIMPUTE_FQ_NOWEIGHT_NOBIO_NOBATCH_HK_1.Rdata") |
|
39 |
+myma = scone_out$exp_mat |
|
40 |
+hk = read.table("~/YosefCode/packages/RNASeq/summary/EXAMPLE/reference_files/house_keeping_human_names.txt")$V1 |
|
41 |
+myma[is_zero] = 0 |
|
42 |
+fnr_out = estimateFNR(myma,bulk_model = T,is.expressed = rownames(myma) %in% as.character(hk) ) |
|
43 |
+w = 0 + !is_zero |
|
44 |
+w[is_zero] = 1-fnr_out$Z[is_zero] |
|
45 |
+wpc = wPCA(log(scone_out$exp_mat + 1),w,filt = T) |
|
46 |
+plot(wpc$x[,1:2], col = cols) |
|
47 |
+ |
|
48 |
+rownames(wpc$rotation) = rownames(scone_out$exp_mat)[wpc$to.pass] |
|
49 |
+sort(-abs(wpc$rotation[,2]))[1:20] |
|
50 |
+ |
|
51 |
+load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/evaluate_material.Rdata") |
|
52 |
+pchs = 16 |
|
53 |
+cols = rainbow(length(levels(evaluate_material$batch)))[evaluate_material$batch] |
|
54 |
+plot(scone_out$evaluation$pc_val[,c(1,2)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells") |
|
55 |
+legend("bottomleft",legend = (levels(evaluate_material$batch)),pch = 16, col = rainbow(length(levels(evaluate_material$batch))), cex = .5) |
|
56 |
+outy = log(scone_out$exp_mat+1) |
|
57 |
+cols = rainbow(2)[scone_out$evaluation$clusters] |
|
58 |
+plot(scone_out$evaluation$pc_val[,c(1,3)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells") |
|
59 |
+ |
|
60 |
+write.table(outy,file = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/NOIMPUTE_FQ_NOWEIGHT_NOBIO_NOBATCH_HK_1_log_counts.txt",quote = F,sep = "\t",col.names = T,row.names = T) |
|
61 |
+ evaluate_material$ |
|
62 |
+evaluate_material$dim_eval |
|
63 |
+plot(evaluate_material$DE_scores[,c(1,2)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells") |
|
64 |
+ |
|
65 |
+ |
|
66 |
+my_rank = (1 + max(rank(apply((apply(escores ,2,rank)),1,min))) - rank(apply((apply(escores ,2,rank)),1,min))) |
|
67 |
+write.table(cbind(scores,my_rank),file = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/scores.txt",quote = F,sep = "\t") |
|
68 |
+ |
|
69 |
+ |
0 | 70 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,244 @@ |
1 |
+library(shiny) |
|
2 |
+library(EDASeq) |
|
3 |
+library(clusterCells) |
|
4 |
+load("../shiny/tmp.rda") |
|
5 |
+ |
|
6 |
+library(RColorBrewer) |
|
7 |
+cc <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"),brewer.pal(12,"Set3"), |
|
8 |
+ brewer.pal(8, "Accent")) |
|
9 |
+ |
|
10 |
+choose_color <- function(input) { |
|
11 |
+ if(input$color_code=="type") { |
|
12 |
+ cols <- cc[type] |
|
13 |
+ } else { |
|
14 |
+ cols <- cc[batch] |
|
15 |
+ } |
|
16 |
+ return(cols) |
|
17 |
+} |
|
18 |
+ |
|
19 |
+server <- function(input, output) { |
|
20 |
+ output$boxplot <- renderPlot({ |
|
21 |
+ set.seed(19883) |
|
22 |
+ if(input$downsample) { |
|
23 |
+ idx <- sample(1:100) |
|
24 |
+ } else { |
|
25 |
+ idx <- 1:ncol(norm) |
|
26 |
+ } |
|
27 |
+ boxplot(log_norm[,idx], boxcol=cc[get(input$color_code)[idx]], medlwd=5, col=cc[get(input$color_code)[idx]], |
|
28 |
+ outline=FALSE, names=FALSE, ylab="log(count+1)", main="Log normalized counts") |
|
29 |
+ lines(apply(log_norm[,idx], 2, median)) |
|
30 |
+ } |
|
31 |
+ ) |
|
32 |
+ |
|
33 |
+ output$rle <- renderPlot({ |
|
34 |
+ set.seed(19883) |
|
35 |
+ if(input$downsample & ncol(norm)>100) { |
|
36 |
+ idx <- sample(1:100) |
|
37 |
+ } else { |
|
38 |
+ idx <- 1:ncol(norm) |
|
39 |
+ } |
|
40 |
+ boxplot(rle[,idx], boxcol=cc[get(input$color_code)[idx]], medlwd=5, col=cc[get(input$color_code)[idx]], |
|
41 |
+ outline=FALSE, names=FALSE, ylab="Relative Log Expression", main="Relative Log Expression (RLE)") |
|
42 |
+ abline(h=0, lty=2) |
|
43 |
+ }) |
|
44 |
+ |
|
45 |
+ output$pca <- renderPlot( |
|
46 |
+ plot(pca$x[,1:2], col=choose_color(input), pch=19, main="All genes") |
|
47 |
+ ) |
|
48 |
+ |
|
49 |
+ output$pca_pc <- renderPlot( |
|
50 |
+ plot(pca_pc$x[,1:2], col=choose_color(input), pch=19, main="Positive controls") |
|
51 |
+ ) |
|
52 |
+ |
|
53 |
+ output$pca_nc <- renderPlot( |
|
54 |
+ plot(pca_nc$x[,1:2], col=choose_color(input), pch=19, main="Negative controls") |
|
55 |
+ ) |
|
56 |
+ |
|
57 |
+ output$pca_mv <- renderPlot( |
|
58 |
+ if(input$variance=="variance") { |
|
59 |
+ plot(pca_var$x[,1:2], col=choose_color(input), pch=19, main="Top 100 most variable genes") |
|
60 |
+ } else { |
|
61 |
+ plot(pca_cv$x[,1:2], col=choose_color(input), pch=19, main="Top 100 most variable genes") |
|
62 |
+ } |
|
63 |
+ ) |
|
64 |
+ |
|
65 |
+ output$qc_plot <- renderPlot({ |
|
66 |
+ barplot(abs(cors), beside = TRUE, col=cc[1:ncol(qc)], border=cc[1:ncol(qc)], ylim=c(0, 1), |
|
67 |
+ space=c(0, 4), main="Absolute correlation with QC scores") |
|
68 |
+ legend("topright", colnames(qc), fill=cc, border=cc, cex=.5) |
|
69 |
+ } |
|
70 |
+ ) |
|
71 |
+ |
|
72 |
+ output$qc_pca <- renderPlot({ |
|
73 |
+ plot(qc_pca$x[,1:2], pch=19, col=choose_color(input), main="Quality PCA") |
|
74 |
+ } |
|
75 |
+ ) |
|
76 |
+ |
|
77 |
+ output$qc_corr <- renderTable(abs(cors[,1:5])) |
|
78 |
+ |
|
79 |
+ output$pam_cc <- renderPlot({ |
|
80 |
+ dualHeatmap(heatData = sub[[input$k]], main="Heatmap of co-clustering, based on 100 subsamplings", |
|
81 |
+ clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, |
|
82 |
+ annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
83 |
+ annColor=list(Batch=cc, Type=cc)) |
|
84 |
+ }) |
|
85 |
+ |
|
86 |
+ output$pam_type <- renderTable({ |
|
87 |
+ table(pam[[input$k-1]]$clustering, type) |
|
88 |
+ }) |
|
89 |
+ |
|
90 |
+ output$pam_batch <- renderTable({ |
|
91 |
+ table(pam[[input$k-1]]$clustering, batch) |
|
92 |
+ }) |
|
93 |
+ |
|
94 |
+ output$pam_pca <- renderPlot({ |
|
95 |
+ plot(pca$x[,1:2], pch=19, col=cc[pam[[input$k-1]]$clustering], main="Expression PCA") |
|
96 |
+ legend("topright", as.character(1:input$k), fill=cc, cex=.5) |
|
97 |
+ }) |
|
98 |
+ |
|
99 |
+ output$pam_qc <- renderPlot({ |
|
100 |
+ plot(qc_pca$x[,1:2], pch=19, col=cc[pam[[input$k-1]]$clustering], main="Quality PCA") |
|
101 |
+ legend("topright", as.character(1:input$k), fill=cc, cex=.5) |
|
102 |
+ }) |
|
103 |
+ |
|
104 |
+ output$heatmap_pc <- renderPlot({ |
|
105 |
+ dualHeatmap(heatData = t(norm), main="Positive Control genes", |
|
106 |
+ clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=poscon, |
|
107 |
+ annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
108 |
+ annColor=list(Batch=cc, Type=cc)) |
|
109 |
+ }) |
|
110 |
+ |
|
111 |
+ output$heatmap_nc <- renderPlot({ |
|
112 |
+ dualHeatmap(heatData = t(norm), main="Negative Control genes", |
|
113 |
+ clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=negcon, |
|
114 |
+ annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
115 |
+ annColor=list(Batch=cc, Type=cc)) |
|
116 |
+ }) |
|
117 |
+ |
|
118 |
+ output$heatmap_mv <- renderPlot({ |
|
119 |
+ if(input$variance=="variance") { |
|
120 |
+ genes <- mv_var |
|
121 |
+ } else { |
|
122 |
+ genes <- mv_cv |
|
123 |
+ } |
|
124 |
+ |
|
125 |
+ dualHeatmap(heatData = t(norm), main="Top 100 most variable genes", |
|
126 |
+ clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=genes, |
|
127 |
+ annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
128 |
+ annColor=list(Batch=cc, Type=cc)) |
|
129 |
+ }) |
|
130 |
+ |
|
131 |
+ output$heatmap_pc1 <- renderPlot({ |
|
132 |
+ genes <- names(sort(pca$rotation[,1], decreasing = TRUE)[1:100]) |
|
133 |
+ dualHeatmap(heatData = t(norm), main="Top 100 genes by PC1 loadings", |
|
134 |
+ clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=genes, |
|
135 |
+ annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
136 |
+ annColor=list(Batch=cc, Type=cc)) |
|
137 |
+ }) |
|
138 |
+ |
|
139 |
+ output$box_pc1_pc <- renderPlot({ |
|
140 |
+ boxplot(tapply(pca_pc$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of Positive Controls") |
|
141 |
+ }) |
|
142 |
+ |
|
143 |
+ output$box_pc1_nc <- renderPlot({ |
|
144 |
+ boxplot(tapply(pca_nc$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of Negative Controls") |
|
145 |
+ }) |
|
146 |
+ |
|
147 |
+ output$box_pc1_qc <- renderPlot({ |
|
148 |
+ boxplot(tapply(qc_pca$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of quality scores") |
|
149 |
+ }) |
|
150 |
+ |
|
151 |
+ output$curve_dropout <- renderPlot({ |
|
152 |
+ plot(logistic(fitted[order(fnr$Alpha[1,]),1])~sort(fnr$Alpha[1,]), type='l', ylim=c(0, 1), col=rgb(0, 0, 0, 0.2), xlab="log average expression", ylab="Probability of drop-out", main="Drop-out Curves") |
|
153 |
+ for(i in 2:ncol(fitted)) { |
|
154 |
+ lines(logistic(fitted[order(fnr$Alpha[1,]),i])~sort(fnr$Alpha[1,]), type='l', col=rgb(0, 0, 0, 0.2)) |
|
155 |
+ } |
|
156 |
+ lines(logistic(rowMeans(fitted)[order(fnr$Alpha[1,])])~sort(fnr$Alpha[1,]), col=2, lwd=2) |
|
157 |
+ }) |
|
158 |
+ |
|
159 |
+ output$box_detection <- renderPlot({ |
|
160 |
+ boxplot(tapply(colSums(norm>5), get(input$color_code), function(x) x), las=2, col=cc, ylab="Number of detected genes (> 5 reads)", main="Detected Genes") |
|
161 |
+ }) |
|
162 |
+ |
|
163 |
+ output$box_dropout <- renderPlot({ |
|
164 |
+ boxplot(tapply(colMeans(1-w), get(input$color_code), function(x) x), las=2, col=cc, ylab="Drop-out rate", main="Drop-out Rate") |
|
165 |
+ }) |
|
166 |
+ |
|
167 |
+ } |
|
168 |
+ |
|
169 |
+ui <- fluidPage( |
|
170 |
+ titlePanel("In-depth Normalization Report"), |
|
171 |
+ |
|
172 |
+ sidebarLayout( |
|
173 |
+ sidebarPanel( |
|
174 |
+ selectInput("color_code", label = "Color code / stratify by", |
|
175 |
+ choices = list("Batch"="batch", "Cell Type"="type"), |
|
176 |
+ selected = "batch"), |
|
177 |
+ selectInput("variance", label = "Select most variable genes by", |
|
178 |
+ choices = list("Variance"="variance", "Coefficient of variation"="cv"), |
|
179 |
+ selected = "cv"), |
|
180 |
+ sliderInput("k", label = "Number of clusters", |
|
181 |
+ min=2, max=10, |
|
182 |
+ value = 7), |
|
183 |
+ checkboxInput("downsample", label="Downsample to 100 cells for boxplot panel", value=TRUE), |
|
184 |
+ helpText("RLE: relative log expression."), |
|
185 |
+ helpText("PCA: principal component analysis."), |
|
186 |
+ helpText("QC: quality control."), |
|
187 |
+ helpText("The 'Boxplot' panel shows the read count distribution and the RLE plot."), |
|
188 |
+ helpText("The 'PCA' panel shows the PCA of different subset of genes."), |
|
189 |
+ helpText("The 'QC' panel shows the absolute correlation between the first 10 principal components of the expression matrix and the QC measures. It also shows the PCA of the QC measures. Note that the PCA score signs are not meaningful (e.g., 'good' cells could have a low value of PC1 and 'bad' cells a high value)."), |
|
190 |
+ helpText("The 'Cluster stability' panel shows a heatmap of the PAM co-clustering based on 100 subsamples of the data. It shows the relation between the clusters obtained by PAM (on the original data) to cell type, batch, expression PCA, and QC PCA. Note that these results are reported for normalization evaluation only. A proper cluster analysis should be performed to find sub-populations in the data."), |
|
191 |
+ helpText("The 'Heatmaps' panel shows the log expression of the positive controls, negative controls, genes with highest PC1 loadings, and most variable genes."), |
|
192 |
+ helpText("The 'Confounding' panel shows the first PC of positive controls, negative controls, and QC stratified by either cell type or batch. This is useful to identify lower quality batches and potential confounding between quality and cell type."), |
|
193 |
+ helpText("The 'Drop-out' panel shows the number of detected genes and the drop-out rate stratified by either cell type or batch. It also shows the probability of drop-out as a function of the average expression"), |
|
194 |
+ helpText("The 'Number of clusters' slider controls the PAM clustering in the 'Cluster stability' tab."), |
|
195 |
+ helpText("Please note that with many cells (>500), some plots may take several seconds (sometimes minutes) to appear."), |
|
196 |
+ helpText("By default, we downsample the data to 100 cells for the boxplot visualization (only for the 'Boxplot' tab). Please uncheck it to see the full dataset (it may take some time).") |
|
197 |
+ ), |
|
198 |
+ |
|
199 |
+ mainPanel( |
|
200 |
+ tabsetPanel(type = "tabs", |
|
201 |
+ ## Add a "summary" tab (wait for Michael's text) |
|
202 |
+ tabPanel("Boxplot", |
|
203 |
+ plotOutput("boxplot"), |
|
204 |
+ plotOutput("rle")), |
|
205 |
+ tabPanel("PCA", |
|
206 |
+ plotOutput("pca"), |
|
207 |
+ plotOutput("pca_pc"), |
|
208 |
+ plotOutput("pca_nc"), |
|
209 |
+ plotOutput("pca_mv")), |
|
210 |
+ tabPanel("QC", |
|
211 |
+ plotOutput("qc_plot"), |
|
212 |
+ tableOutput("qc_corr"), |
|
213 |
+ plotOutput("qc_pca")), |
|
214 |
+ tabPanel("Cluster stability", |
|
215 |
+ plotOutput("pam_cc"), |
|
216 |
+ fluidRow( |
|
217 |
+ column(3, |
|
218 |
+ h3("Type"), |
|
219 |
+ br(), |
|
220 |
+ tableOutput("pam_type")), |
|
221 |
+ column(3, |
|
222 |
+ h3("Batch"), |
|
223 |
+ br(), |
|
224 |
+ tableOutput("pam_batch"))), |
|
225 |
+ plotOutput("pam_pca"), |
|
226 |
+ plotOutput("pam_qc")), |
|
227 |
+ tabPanel("Heatmaps", |
|
228 |
+ plotOutput("heatmap_pc"), |
|
229 |
+ plotOutput("heatmap_nc"), |
|
230 |
+ plotOutput("heatmap_pc1"), |
|
231 |
+ plotOutput("heatmap_mv")), |
|
232 |
+ tabPanel("Confounding", |
|
233 |
+ plotOutput("box_pc1_pc"), |
|
234 |
+ plotOutput("box_pc1_nc"), |
|
235 |
+ plotOutput("box_pc1_qc")), |
|
236 |
+ tabPanel("Drop-out", |
|
237 |
+ plotOutput("box_detection"), |
|
238 |
+ plotOutput("box_dropout"), |
|
239 |
+ plotOutput("curve_dropout")) |
|
240 |
+ ) |
|
241 |
+ |
|
242 |
+ ))) |
|
243 |
+ |
|
244 |
+shinyApp(ui = ui, server = server) |
0 | 245 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,93 @@ |
1 |
+Lik_Fun = function(Z,X,Beta){ |
|
2 |
+ return((1-Z) + (2*Z - 1)/(1+exp(-X %*% Beta))) |
|
3 |
+} |
|
4 |
+ |
|
5 |
+Post_Z = function(Y,W,Alpha,X,Beta){ |
|
6 |
+ return(Lik_Fun(Y,W,Alpha)*Lik_Fun(1,X,Beta)/(Lik_Fun(Y,W,Alpha)*Lik_Fun(1,X,Beta) + (Y == 0)*Lik_Fun(0,X,Beta))) |
|
7 |
+} |
|
8 |
+ |
|
9 |
+estimateFNR = function(x, bulk_model = FALSE, is.control = TRUE, K = 0, X = NULL, Alpha = NULL, |
|
10 |
+ is.expressed = NULL, niter = 100, ntresh = 0){ |
|
11 |
+ |
|
12 |
+ require(matrixStats) |
|
13 |
+ Y = t(matrix(as.numeric(x > ntresh ),nrow = dim(x)[1])) |
|
14 |
+ |
|
15 |
+ if(bulk_model){ |
|
16 |
+ x2 = x |
|
17 |
+ x2[x2 <= ntresh] = NA |
|
18 |
+ Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE)))) |
|
19 |
+ } |
|
20 |
+ |
|
21 |
+ # Prepare Wanted Variation and Gene-Intrinsic Intercept |
|
22 |
+ X = cbind(X,rep(1,dim(x)[2])) |
|
23 |
+ is.intercept = dim(X)[2] == 1 |
|
24 |
+ Beta = matrix(0,dim(X)[2],dim(x)[1]) |
|
25 |
+ |
|
26 |
+ # Prepare Unwanted Variation (K factors) and Sample-Intrinsic Intercept |
|
27 |
+ if( K == 0 || !is.null(Alpha) ){ |
|
28 |
+ Alpha = rbind(Alpha,rep(1,dim(x)[1])) |
|
29 |
+ K = dim(Alpha)[1] - 1 |
|
30 |
+ }else{ |
|
31 |
+ M = 2*Y[is.control,] - 1 |
|
32 |
+ M = t(t(M - rowMeans(M)) - colMeans(M)) + mean(M) |
|
33 |
+ Alpha = rbind(t(svd(M,nv = K)$v[,1:K]),rep(1,dim(x)[1])) |
|
34 |
+ } |
|
35 |
+ W = matrix(0,dim(x)[2],K+1) |
|
36 |
+ |
|
37 |
+ # Initialize Z |
|
38 |
+ Z = Post_Z(Y,W,Alpha,X,Beta) |
|
39 |
+ |
|
40 |
+ # If control genes are supplied, W fit once, according to control genes |
|
41 |
+ if(!is.null(is.expressed)){ |
|
42 |
+ cY = Y[,is.expressed] |
|
43 |
+ cAlpha = Alpha[,is.expressed] |
|
44 |
+ for (i in 1:dim(Z)[1]){ |
|
45 |
+ W[i,] = glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),quasibinomial)$coefficients |
|
46 |
+ } |
|
47 |
+ } |
|
48 |
+ |
|
49 |
+ ## DR: it seems strange to me that this is a for loop and not a while !converged |
|
50 |
+ for(n in 1:niter){ |
|
51 |
+ # Update Beta |
|
52 |
+ if(!is.intercept){ |
|
53 |
+ |
|
54 |
+ if(is.null(is.expressed)){ |
|
55 |
+ for(i in 1:dim(Z)[2]){ |
|
56 |
+ Beta[,i] = glm(cbind(Z[,i],1-Z[,i]) ~ 0 + X,quasibinomial)$coefficients |
|
57 |
+ } |
|
58 |
+ |
|
59 |
+ }else{ # Only over non-control genes |
|
60 |
+ for(i in (1:dim(Z)[2])[!is.expressed]){ |
|
61 |
+ Beta[,i] = glm(cbind(Z[,i],1-Z[,i]) ~ 0 + X,quasibinomial)$coefficients |
|
62 |
+ } |
|
63 |
+ } |
|
64 |
+ |
|
65 |
+ |
|
66 |
+ }else{ |
|
67 |
+ # Short-cut! |
|
68 |
+ Beta = t(matrix(log(colMeans(Z)/(1-colMeans(Z))))) |
|
69 |
+ } |
|
70 |
+ |
|
71 |
+ # Update Z |
|
72 |
+ Z = Post_Z(Y,W,Alpha,X,Beta) |
|
73 |
+ |
|
74 |
+ if(is.null(is.expressed)){ |
|
75 |
+ # Update W |
|
76 |
+ for (i in 1:dim(Z)[1]){ |
|
77 |
+ W[i,] = glm(cbind(Y[i,],(1-Y[i,])*Z[i,]) ~ 0 + t(Alpha),quasibinomial)$coefficients |
|
78 |
+ } |
|
79 |
+ |
|
80 |
+ # Update Z |
|
81 |
+ Z = Post_Z(Y,W,Alpha,X,Beta) |
|
82 |
+ } |
|
83 |
+ } |
|
84 |
+ |
|
85 |
+ # Stick to our initial assumptions |
|
86 |
+ if(!is.null(is.expressed)){ |
|
87 |
+ Beta[,is.expressed] = NA |
|
88 |
+ Z[,is.expressed] = 1 |
|
89 |
+ } |
|
90 |
+ EL = sum(na.omit(as.vector(Z*log(Lik_Fun(Y,W,Alpha))))) + sum(na.omit(as.vector(Z*log(Lik_Fun(1,X,Beta))))) + sum(na.omit(as.vector((1-Z)*log(Lik_Fun(0,X,Beta))))) |
|
91 |
+ |
|
92 |
+ return(list(Z = t(Z), EL = EL, P = t(Lik_Fun(1,W,Alpha)), W = W, X = X, Alpha = Alpha, Beta = Beta)) |
|
93 |
+} |
|
0 | 94 |
\ No newline at end of file |
1 | 95 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,142 @@ |
1 |
+#' Internal Function |
|
2 |
+#' |
|
3 |
+#' This function is used internally in scone to parse the variables used to generate the design matrices. |
|
4 |
+#' |
|
5 |
+#' |
|
6 |
+#' @param pars character. A vector of parameters corresponding to a row of params. |
|
7 |
+#' @param bio factor. The biological factor of interest. |
|
8 |
+#' @param batch factor. The known batch effects. |
|
9 |
+#' @param ruv_factors list. A list containing the factors of unwanted variation. |
|
10 |
+#' @param qc matrix. The principal components of the QC metrics. |
|
11 |
+#' |
|
12 |
+#' @return A list with the variables to be passed to make_design. |
|
13 |
+parse_row <- function(pars, bio, batch, ruv_factors, qc) { |
|
14 |
+ sc_name <- paste(pars[1:2], collapse="_") |
|
15 |
+ |
|
16 |
+ W <- out_bio <- out_batch <- NULL |
|
17 |
+ |
|
18 |
+ if(pars[3]!="no_uv") { |
|
19 |
+ parsed <- strsplit(as.character(pars[3]), "=")[[1]] |
|
20 |
+ if(grepl("ruv", parsed[1])) { |
|
21 |
+ W <- ruv_factors[[sc_name]][,1:as.numeric(parsed[2])] |
|
22 |
+ } else { |
|
23 |
+ W <- qc[,1:as.numeric(parsed[2])] |
|
24 |
+ } |
|