... | ... |
@@ -1,4 +1,4 @@ |
1 |
-Package: SCONE |
|
1 |
+Package: scone |
|
2 | 2 |
Version: 0.0.1 |
3 | 3 |
Title: Single Cell Overview of Normalized Expression data |
4 | 4 |
Description: SCONE. |
... | ... |
@@ -8,7 +8,9 @@ Authors@R: c(person("Michael", "Cole", email = "mbeloc@gmail.com", |
8 | 8 |
role = c("aut"))) |
9 | 9 |
Author: Michael Cole [aut, cre, cph] and Davide Risso [aut] |
10 | 10 |
Maintainer: Michael Cole <mbeloc@gmail.com> |
11 |
-Date: 01-22-2016 |
|
11 |
+Imports: BiocParallel, DESeq, EDASeq, MASS, RUVSeq, aroma.light, class, |
|
12 |
+ edgeR, fpc, limma, matrixStats, scde |
|
13 |
+Date: 02-06-2016 |
|
12 | 14 |
License: Artistic-2.0 |
13 | 15 |
LazyLoad: yes |
14 | 16 |
RoxygenNote: 5.0.1 |
... | ... |
@@ -0,0 +1,27 @@ |
1 |
+# Generated by roxygen2: do not edit by hand |
|
2 |
+ |
|
3 |
+export(DESEQ_FN) |
|
4 |
+export(DESEQ_FN_POS) |
|
5 |
+export(FQ_FN) |
|
6 |
+export(FQ_FN_POS) |
|
7 |
+export(TMM_FN) |
|
8 |
+export(UQ_FN) |
|
9 |
+export(UQ_FN_POS) |
|
10 |
+export(estimate_zinb) |
|
11 |
+export(impute_zinb) |
|
12 |
+export(lm_adjust) |
|
13 |
+export(make_design) |
|
14 |
+export(scone) |
|
15 |
+export(score_matrix) |
|
16 |
+import(BiocParallel) |
|
17 |
+importFrom(DESeq,estimateSizeFactorsForMatrix) |
|
18 |
+importFrom(EDASeq,betweenLaneNormalization) |
|
19 |
+importFrom(MASS,glm.nb) |
|
20 |
+importFrom(RUVSeq,RUVg) |
|
21 |
+importFrom(aroma.light,normalizeQuantileRank.matrix) |
|
22 |
+importFrom(class,knn) |
|
23 |
+importFrom(edgeR,calcNormFactors) |
|
24 |
+importFrom(fpc,pamk) |
|
25 |
+importFrom(limma,lmFit) |
|
26 |
+importFrom(matrixStats,rowMedians) |
|
27 |
+importFrom(scde,bwpca) |
0 | 28 |
deleted file mode 100644 |
... | ... |
@@ -1,532 +0,0 @@ |
1 |
-## DR: if I understand correctly "nom" is used only for the name of the file to store the data. |
|
2 |
-## There should be a better way to do this, using the name of the function (the opposite of get()?). |
|
3 |
- |
|
4 |
-source("/data/yosef/users/daviderisso/YosefCode/packages/RNASeq/OFBIT/SCONE/estimateFNR.R") |
|
5 |
-source("/data/yosef/users/daviderisso/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE_DEFAULTS.R") |
|
6 |
-library(BiocParallel,quietly = TRUE) |
|
7 |
-library(cluster,quietly = TRUE) |
|
8 |
-library(fpc,quietly = TRUE) |
|
9 |
- |
|
10 |
-runAdjustTask = function(task,evaluate_material,design,nested_model, to_store = FALSE){ |
|
11 |
- |
|
12 |
-#??? source("~/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE.R") # library(SCONE) |
|
13 |
- |
|
14 |
- result = try({ |
|
15 |
- |
|
16 |
- if(file.exists(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))){ |
|
17 |
- if(to_store){ |
|
18 |
- load(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) |
|
19 |
- score_out = scone_out$evaluation |
|
20 |
- |
|
21 |
- # Return values |
|
22 |
- if(!is.na(score_out)[1]){ |
|
23 |
- scores = score_out$scores |
|
24 |
- }else{ |
|
25 |
- scores = NA |
|
26 |
- } |
|
27 |
- |
|
28 |
- print(task$nom) |
|
29 |
- return(list(scores = scores)) |
|
30 |
- }else{ |
|
31 |
- print(task$nom) |
|
32 |
- return() |
|
33 |
- } |
|
34 |
- |
|
35 |
- }else{ |
|
36 |
- |
|
37 |
- norm_lout = ADJUST_FN(task$ei,batch = task$batch, |
|
38 |
- bio = task$condition, |
|
39 |
- uv = task$uv, |
|
40 |
- w = task$w, |
|
41 |
- design = design, |
|
42 |
- nested_model = nested_model) |
|
43 |
- norm_out = exp(norm_lout) - 1 |
|
44 |
- score_out = scoreMethod(e = norm_out ,condition = evaluate_material$condition, batch = evaluate_material$batch, |
|
45 |
- qual_scores = evaluate_material$qual_scores, HK_scores = evaluate_material$HK_scores, |
|
46 |
- DE_scores = evaluate_material$DE_scores, CC_scores = evaluate_material$CC_scores, |
|
47 |
- dim_eval = evaluate_material$dim_eval, K_clust = evaluate_material$K_clust, K_NN = evaluate_material$K_NN, |
|
48 |
- fnr_pi = evaluate_material$fnr_pi, nom = task$nom) |
|
49 |
- |
|
50 |
- scone_out = list(exp_mat = norm_out,evaluation = score_out,method = task$nom) |
|
51 |
- save(scone_out,file = paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) |
|
52 |
- if(to_store){ |
|
53 |
- |
|
54 |
- # Return values |
|
55 |
- if(!is.na(score_out)[1]){ |
|
56 |
- scores = score_out$scores |
|
57 |
- }else{ |
|
58 |
- scores = NA |
|
59 |
- } |
|
60 |
- |
|
61 |
- print(task$nom) |
|
62 |
- return(list(scores = scores)) |
|
63 |
- }else{ |
|
64 |
- print(task$nom) |
|
65 |
- return() |
|
66 |
- } |
|
67 |
- |
|
68 |
- } |
|
69 |
- |
|
70 |
- }) |
|
71 |
- |
|
72 |
- return(NA) |
|
73 |
- |
|
74 |
-} |
|
75 |
- |
|
76 |
-runFactorFreeTask = function(task,evaluate_material,emat){ |
|
77 |
- |
|
78 |
- ## ??? source("~/YosefCode/packages/RNASeq/OFBIT/SCONE/SCONE.R") # library(SCONE) |
|
79 |
- |
|
80 |
- result = try({ |
|
81 |
- if(file.exists(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = ""))){ |
|
82 |
- load(paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) |
|
83 |
- norm_out = scone_out$exp_mat |
|
84 |
- score_out = scone_out$evaluation |
|
85 |
- |
|
86 |
- # Return values |
|
87 |
- if(!is.na(score_out)[1]){ |
|
88 |
- scores = score_out$scores |
|
89 |
- }else{ |
|
90 |
- scores = NA |
|
91 |
- } |
|
92 |
- |
|
93 |
- print(task$nom) |
|
94 |
- return(list(eo = norm_out, scores = scores)) |
|
95 |
- |
|
96 |
- }else{ |
|
97 |
- norm_out = task$FN(emat) |
|
98 |
- |
|
99 |
- score_out = scoreMethod(e = norm_out ,condition = evaluate_material$condition, batch = evaluate_material$batch, |
|
100 |
- qual_scores = evaluate_material$qual_scores, HK_scores = evaluate_material$HK_scores, |
|
101 |
- DE_scores = evaluate_material$DE_scores, CC_scores = evaluate_material$CC_scores, |
|
102 |
- dim_eval = evaluate_material$dim_eval, K_clust = evaluate_material$K_clust, K_NN = evaluate_material$K_NN, |
|
103 |
- fnr_pi = evaluate_material$fnr_pi, nom = task$nom) |
|
104 |
- |
|
105 |
- scone_out = list(exp_mat = norm_out,evaluation = score_out,method = task$nom) |
|
106 |
- save(scone_out,file = paste(evaluate_material$out_dir,"/",task$nom,".Rdata",sep = "")) |
|
107 |
- |
|
108 |
- # Return values |
|
109 |
- if(!is.na(score_out)[1]){ |
|
110 |
- scores = score_out$scores |
|
111 |
- }else{ |
|
112 |
- scores = NA |
|
113 |
- } |
|
114 |
- |
|
115 |
- print(task$nom) |
|
116 |
- return(list(eo = norm_out, scores = scores)) |
|
117 |
- } |
|
118 |
- |
|
119 |
- }) |
|
120 |
- |
|
121 |
- return(NA) |
|
122 |
- |
|
123 |
-} |
|
124 |
- |
|
125 |
-scoreMethod = function(e,condition = NULL, batch = NULL, |
|
126 |
- qual_scores = NULL, HK_scores = NULL, |
|
127 |
- DE_scores = NULL, CC_scores = NULL, |
|
128 |
- dim_eval = NULL, K_clust = NULL, K_NN = NULL, |
|
129 |
- fnr_pi = NULL, nom = NULL){ |
|
130 |
- if(any(is.na(e) | is.infinite(e) | is.nan(e))){ |
|
131 |
- warning("NA/Inf/NaN Expression Values. Returning NA.") |
|
132 |
- return(NA) |
|
133 |
- } |
|
134 |
- # Project the data: PCA or wPCA |
|
135 |
- if(grepl("^IMPUTE",nom)){ |
|
136 |
- pc_val = prcomp(t(log(e + 1)),center = TRUE,scale. = TRUE)$x[,1:dim_eval] |
|
137 |
- }else{ |
|
138 |
- pc_val = wPCA(log(e + 1),1 - fnr_pi,nu = dim_eval,filt = TRUE)$x |
|
139 |
- } |
|
140 |
- |
|
141 |
- # Max cor with quality scores |
|
142 |
- EXP_QPC_COR = max(cor(pc_val,qual_scores,method = "spearman")^2) |
|
143 |
- |
|
144 |
- # Max cor with housekeeping scores |
|
145 |
- if(!is.null(HK_scores)){ |
|
146 |
- EXP_HK_COR = max(cor(pc_val,HK_scores,method = "spearman")^2) |
|
147 |
- }else{ |
|
148 |
- EXP_HK_COR = NA |
|
149 |
- } |
|
150 |
- |
|
151 |
- # Max cor with differentially expressed scores |
|
152 |
- if(!is.null(DE_scores)){ |
|
153 |
- EXP_DE_COR = max(cor(pc_val,DE_scores,method = "spearman")^2) |
|
154 |
- }else{ |
|
155 |
- EXP_DE_COR = NA |
|
156 |
- } |
|
157 |
- |
|
158 |
- # Max cor with cell cycle scores |
|
159 |
- if(!is.null(CC_scores)){ |
|
160 |
- EXP_CC_COR = max(cor(pc_val,CC_scores,method = "spearman")^2) |
|
161 |
- }else{ |
|
162 |
- EXP_CC_COR = NA |
|
163 |
- } |
|
164 |
- |
|
165 |
- d = as.matrix(dist(pc_val,method = "euclidean")) |
|
166 |
- |
|
167 |
- # K-NN Condition |
|
168 |
- if(!is.null(condition)){ |
|
169 |
- m <- sapply(seq_len(nrow(d)), function(s) { |
|
170 |
- ds <- d[,s] |
|
171 |
- ra <- rank(ds) |
|
172 |
- return(mean(condition[(ra <= K_NN+1) & (ra != 1)] == condition[s])) |
|
173 |
- }) |
|
174 |
- KNN_BIO = mean(m) |
|
175 |
- }else{ |
|
176 |
- KNN_BIO = NA |
|
177 |
- } |
|
178 |
- |
|
179 |
- # K-NN Batch |
|
180 |
- if(!is.null(batch)){ |
|
181 |
- m <- sapply(seq_len(nrow(d)), function(s) { |
|
182 |
- ds <- d[,s] |
|
183 |
- ra <- rank(ds) |
|
184 |
- return(mean(batch[(ra <= K_NN+1) & (ra != 1)] == batch[s])) |
|
185 |
- }) |
|
186 |
- KNN_BATCH = mean(m) |
|
187 |
- }else{ |
|
188 |
- KNN_BATCH = NA |
|
189 |
- } |
|
190 |
- |
|
191 |
- # PAM Silh |
|
192 |
- if(K_clust > 1){ |
|
193 |
- pam_object = pam(pc_val,k = K_clust) |
|
194 |
- PAM_SIL = pam_object$silinfo$avg.width |
|
195 |
- clusters = pam_object$clustering |
|
196 |
- }else{ |
|
197 |
- PAM_SIL = NA |
|
198 |
- clusters = NA |
|
199 |
- } |
|
200 |
- scores = c(EXP_QPC_COR,EXP_HK_COR,EXP_DE_COR,EXP_CC_COR,KNN_BIO,KNN_BATCH,PAM_SIL) |
|
201 |
- names(scores) = c("EXP_QPC_COR","EXP_HK_COR","EXP_DE_COR","EXP_CC_COR","KNN_BIO","KNN_BATCH","PAM_SIL") |
|
202 |
- return(list(scores = scores, pc_val = pc_val, clusters = clusters)) |
|
203 |
-} |
|
204 |
- |
|
205 |
-## ===== SCONE: Single-Cell Overview of Normalized Expression data ===== |
|
206 |
-# Intermediate wrapper is missing" |
|
207 |
-# Decide model for the user - describe reason -" |
|
208 |
-# One batch one condition - user friendly |
|
209 |
-# Check rank of design matrix - if nested - we're good - IFF? |
|
210 |
-# Positive (evaluate) and negative controls (modeled + evaluate) |
|
211 |
-# show correlation between q and hk - why we don't try all |
|
212 |
-# Default number of dim_UV |
|
213 |
-# Factor free norms and adjust norms as lists = impute or not, positive data or not, only q , only hk |
|
214 |
-# function 1, function 2, and k |
|
215 |
-SCONE = function(e,condition = NULL, batch = NULL, |
|
216 |
- design = c("factorial","nested"), |
|
217 |
- nested_model = c("fixed","random"), |
|
218 |
- qual = NULL, is_HK = NULL, |
|
219 |
- is_DE = NULL,is_CC = NULL, |
|
220 |
- dim_UV = 5, |
|
221 |
- dim_eval = 3, K_clust = NULL, K_NN = 10, |
|
222 |
- out_dir = NULL, |
|
223 |
- K_clust_MAX = 10,K_pseudobatch_MAX = 10, |
|
224 |
- factor_free_only = FALSE, to_store = FALSE, n_workers = 10){ |
|
225 |
- param = MulticoreParam(workers = n_workers) |
|
226 |
- ## ----- Set up output directory ---- |
|
227 |
- if (file.exists(out_dir)){ |
|
228 |
- #stop("Designated output directory already exists.") |
|
229 |
- } else { |
|
230 |
- dir.create(out_dir) |
|
231 |
- } |
|
232 |
- |
|
233 |
- ## ----- Imputation Step ---- |
|
234 |
- print("Imputation Step...") |
|
235 |
- HK_default = FALSE |
|
236 |
- if(is.null(is_HK)){ |
|
237 |
- warning("No control genes have been specified, using all genes for FNR estimation.") |
|
238 |
- HK_default = TRUE |
|
239 |
- is_HK = rep(TRUE ,nrow(e)) |
|
240 |
- } |
|
241 |
- if(file.exists(paste0(out_dir,"/fnr_out.Rdata"))){ |
|
242 |
- load(paste0(out_dir,"/fnr_out.Rdata")) |
|
243 |
- }else{ |
|
244 |
- fnr_out = estimateFNR(e, bulk_model = TRUE, is.expressed = is_HK) |
|
245 |
- save(fnr_out,file = paste0(out_dir,"/fnr_out.Rdata")) |
|
246 |
- } |
|
247 |
- fnr_mu = exp(fnr_out$Alpha[1,]) |
|
248 |
- fnr_pi = (e == 0) * fnr_out$Z |
|
249 |
- imputed_e = e + (fnr_mu * fnr_pi) |
|
250 |
- print("Complete.") |
|
251 |
- |
|
252 |
- ## ----- Generate Unwanted Factors ---- |
|
253 |
- print("Generating Unwanted Factors for Evaluation...") |
|
254 |
- if(file.exists(paste0(out_dir,"/evaluate_material.Rdata"))){ |
|
255 |
- |
|
256 |
- load(paste0(out_dir,"/evaluate_material.Rdata")) |
|
257 |
- |
|
258 |
- }else{ |
|
259 |
- |
|
260 |
- # Factors of alignment quality metrics |
|
261 |
- if(is.null(qual)){ |
|
262 |
- warning("No quality metrics specified. Total counts and efficiency used.") |
|
263 |
- numba = colSums(e == 0) |
|
264 |
- beta = colMeans(e == 0) |
|
265 |
- qual = cbind(numba, log(beta/(1-beta))) |
|
266 |
- } |
|
267 |
- |
|
268 |
- if(is.null(dim_UV)){ |
|
269 |
- dim_UV = min(ncol(qual), sum(is_HK)) |
|
270 |
- warning(paste("Number of unwanted factors not specified. Selecting minimum of number of quality features and number of control genes:",dim_UV)) |
|
271 |
- if(dim_UV >= ncol(e)){ |
|
272 |
- stop("Number of unwanted factors >= number of samples!") |
|
273 |
- } |
|
274 |
- } |
|
275 |
- |
|
276 |
- qual_scores = prcomp(qual, center = TRUE, scale = TRUE)$x[,1:dim_UV] |
|
277 |
- |
|
278 |
- if(!HK_default){ |
|
279 |
- HK_scores = wPCA(log(e[is_HK,] + 1), 1 - fnr_pi[is_HK,], nu = dim_UV, filt = TRUE)$x |
|
280 |
- }else{ |
|
281 |
- HK_scores = NULL |
|
282 |
- } |
|
283 |
- |
|
284 |
- if(!is.null(is_DE)){ |
|
285 |
- DE_scores = wPCA(log(e[is_DE,] + 1), 1 - fnr_pi[is_DE,], nu = dim_eval, filt = TRUE)$x |
|
286 |
- }else{ |
|
287 |
- DE_scores = NULL |
|
288 |
- } |
|
289 |
- |
|
290 |
- if(!is.null(is_CC)){ |
|
291 |
- CC_scores = wPCA(log(e[is_CC,] + 1), 1 - fnr_pi[is_CC,], nu = dim_eval, filt = TRUE)$x |
|
292 |
- }else{ |
|
293 |
- CC_scores = NULL |
|
294 |
- } |
|
295 |
- |
|
296 |
- ## ----- Evaluation Criteria ---- |
|
297 |
- if(is.null(K_clust)){ |
|
298 |
- if(is.null(DE_scores)){ |
|
299 |
- warning("No clustering K specified. Selecting K using PAM average silhouette width on raw wPCA.") |
|
300 |
- pc_val = wPCA(log(e + 1), 1 - fnr_pi, nu = dim_eval, filt = TRUE)$x |
|
301 |
- }else{ |
|
302 |
- warning("No clustering K specified. Selecting K using PAM average silhouette width on DE scores.") |
|
303 |
- pc_val = DE_scores |
|
304 |
- } |
|
305 |
- |
|
306 |
- pamk.best <- pamk(data = pc_val, krange = 1:K_clust_MAX) |
|
307 |
- K_clust = pamk.best$nc |
|
308 |
- } |
|
309 |
- |
|
310 |
- data_dir = paste0(out_dir,"/data/") |
|
311 |
- if (file.exists(data_dir)){ |
|
312 |
- #stop("Designated data directory already exists.") |
|
313 |
- } else { |
|
314 |
- dir.create(data_dir) |
|
315 |
- } |
|
316 |
- |
|
317 |
- evaluate_material = list(condition = condition, batch = batch, |
|
318 |
- qual_scores = qual_scores, HK_scores = HK_scores, |
|
319 |
- DE_scores = DE_scores, CC_scores = CC_scores, |
|
320 |
- dim_eval = dim_eval, K_clust = K_clust, K_NN = K_NN, |
|
321 |
- fnr_pi = fnr_pi, out_dir = data_dir) |
|
322 |
- |
|
323 |
- save(evaluate_material,file = paste0(out_dir,"/evaluate_material.Rdata")) |
|
324 |
- |
|
325 |
- } |
|
326 |
- |
|
327 |
- print("Complete.") |
|
328 |
- |
|
329 |
- |
|
330 |
- ## ----- Factor-Free Normalization |
|
331 |
- ## DR: major change: the current code will copy over and over the same matrices (e and imputed_e). |
|
332 |
- ## There's no need to do so as we can infer what matrix to use from the name. |
|
333 |
- |
|
334 |
- task_list = list() |
|
335 |
- |
|
336 |
- # Generate Task List for Non-imputed |
|
337 |
- task_names = paste("NOIMPUTE",c("NONE","UQ","UQP","DESEQ","DESEQP","FQ","FQP","TMM"),sep = "_") |
|
338 |
- |
|
339 |
- ##DR: why not TMM__FN_POS? |
|
340 |
- task_FN = c(identity,UQ_FN,UQ_FN_POS,DESEQ_FN,DESEQ_FN_POS,FQ_FN,FQ_FN_POS,TMM_FN) |
|
341 |
- names(task_FN) = task_names |
|
342 |
- for (task in task_names){ |
|
343 |
- task_list[[task]] = list(FN = task_FN[[task]], nom = task) |
|
344 |
- } |
|
345 |
- # Generate Task List for Imputed |
|
346 |
- task_names = paste("IMPUTE",c("NONE","UQ","DESEQ","FQ","TMM"),sep = "_") |
|
347 |
- task_FN = c(identity,UQ_FN,DESEQ_FN,FQ_FN,TMM_FN) |
|
348 |
- names(task_FN) = task_names |
|
349 |
- for (task in task_names){ |
|
350 |
- task_list[[task]] = list(FN = task_FN[[task]], nom = task) |
|
351 |
- } |
|
352 |
- |
|
353 |
- # Parallel Normalization + Evaluation |
|
354 |
- print("Factor-Free Normalization and Evaluation...") |
|
355 |
- factor_free_out_noimpute = bplapply(task_list[grep("^NOIMPUTE", names(task_list))], FUN = runFactorFreeTask, evaluate_material = evaluate_material, emat=e, BPPARAM = param) |
|
356 |
- factor_free_out_impute = bplapply(task_list[grep("^IMPUTE", names(task_list))], FUN = runFactorFreeTask, evaluate_material = evaluate_material, emat=imputed_e, BPPARAM = param) |
|
357 |
- factor_free_out <- c(factor_free_out_noimpute, factor_free_out_impute) |
|
358 |
- print("Complete.") |
|
359 |
- |
|
360 |
- ## ----- Selection Step (Optional) |
|
361 |
- if(factor_free_only){ |
|
362 |
- return(list(factor_free_out = factor_free_out, factor_based_out = NA)) |
|
363 |
- } |
|
364 |
- ## ----- Factor-Based Normalization |
|
365 |
- |
|
366 |
- print("Selecting Base Tasks...") |
|
367 |
- base_task_list <- lapply(1:length(factor_free_out), function(i) { |
|
368 |
- if(!any(is.na(factor_free_out[[i]]$eo) | is.nan(factor_free_out[[i]]$eo) | is.infinite(factor_free_out[[i]]$eo))) { |
|
369 |
- return(list(ei = factor_free_out[[i]]$eo, nom = names(factor_free_out)[i])) |
|
370 |
- } |
|
371 |
- }) |
|
372 |
- names(base_task_list) <- names(factor_free_out) |
|
373 |
- base_task_list <- base_task_list[!sapply(base_task_list, is.null)] |
|
374 |
- print("Complete.") |
|
375 |
- |
|
376 |
- |
|
377 |
- # WEIGHT |
|
378 |
- print("Extending Tasks by Weighting Scheme...") |
|
379 |
- ext_task_list = list() |
|
380 |
- for( task in names(base_task_list)){ |
|
381 |
- base_nom = task |
|
382 |
- if(!grepl("^IMPUTE",task)){ |
|
383 |
- ext_nom = paste(base_nom,"ZEROWEIGHT",sep = "_") |
|
384 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
385 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
386 |
- ext_task_list[[ext_nom]]$w = 0 + (ext_task_list[[ext_nom]]$ei > 0) |
|
387 |
- } |
|
388 |
- ext_nom = paste(base_nom,"NOWEIGHT",sep = "_") |
|
389 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
390 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
391 |
- ext_task_list[[ext_nom]]$w = NULL |
|
392 |
- } |
|
393 |
- print("Complete.") |
|
394 |
- |
|
395 |
- base_task_list = ext_task_list |
|
396 |
- |
|
397 |
- # Generate Method-Specific Factors (Should be parallel) |
|
398 |
- print("Computing Method-Specific Factors...") |
|
399 |
- |
|
400 |
- task_factor_list = list() |
|
401 |
- for( task in names(base_task_list)){ |
|
402 |
- |
|
403 |
- # HK |
|
404 |
- if(!HK_default){ |
|
405 |
- if(!is.null(base_task_list[[task]]$w)){ |
|
406 |
- task_factor_list[[task]]$HK_scores = wPCA(x = log(base_task_list[[task]]$ei[is_HK,]+1),w = base_task_list[[task]]$w[is_HK,],nu = dim_UV,filt = TRUE)$x |
|
407 |
- }else{ |
|
408 |
- task_factor_list[[task]]$HK_scores = prcomp(t(log(base_task_list[[task]]$ei[is_HK,]+1)))$x[,1:dim_UV] |
|
409 |
- } |
|
410 |
- |
|
411 |
- if(is.null(K_pseudobatch)){ |
|
412 |
- pamk_object = pamk(task_factor_list[[task]]$HK_scores,krange = 1:K_pseudobatch_MAX) |
|
413 |
- task_factor_list[[task]]$hk_nc = pamk_object$nc |
|
414 |
- task_factor_list[[task]]$pam_clustering = pamk_object$pamobject$clustering |
|
415 |
- }else{ |
|
416 |
- task_factor_list[[task]]$hk_nc = K_pseudobatch |
|
417 |
- task_factor_list[[task]]$pam_clustering = pam(task_factor_list[[task]]$HK_scores,k = K_pseudobatch)$clustering |
|
418 |
- } |
|
419 |
- }else{ |
|
420 |
- task_factor_list[[task]]$HK_scores = NULL |
|
421 |
- task_factor_list[[task]]$hk_nc = NULL |
|
422 |
- task_factor_list[[task]]$pam_clustering = NULL |
|
423 |
- } |
|
424 |
- } |
|
425 |
- print("Complete.") |
|
426 |
- |
|
427 |
- # Generate General Factor Clusterings |
|
428 |
- print("Generating General Factor-Based Clusters...") |
|
429 |
- if(is.null(K_pseudobatch)){ |
|
430 |
- pamk_object = pamk(evaluate_material$qual_scores,krange = 1:K_pseudobatch_MAX) |
|
431 |
- qual_nc = pamk_object$nc |
|
432 |
- qual_clustering = as.factor(pamk_object$pamobject$clustering) |
|
433 |
- }else{ |
|
434 |
- qual_nc = K_pseudobatch |
|
435 |
- qual_clustering = as.factor(pam(evaluate_material$qual_scores,k = K_pseudobatch)$clustering) |
|
436 |
- } |
|
437 |
- print("Complete.") |
|
438 |
- |
|
439 |
- # BIO |
|
440 |
- print("Extending Tasks by Biological Covariate...") |
|
441 |
- ext_task_list = list() |
|
442 |
- for( task in names(base_task_list)){ |
|
443 |
- base_nom = task |
|
444 |
- |
|
445 |
- if(!is.null(condition)){ |
|
446 |
- ext_nom = paste(base_nom,"BIO",sep = "_") |
|
447 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
448 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
449 |
- ext_task_list[[ext_nom]]$condition = condition |
|
450 |
- } |
|
451 |
- ext_nom = paste(base_nom,"NOBIO",sep = "_") |
|
452 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
453 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
454 |
- ext_task_list[[ext_nom]]$condition = NULL |
|
455 |
- } |
|
456 |
- print("Complete.") |
|
457 |
- |
|
458 |
- print("Extending Tasks by Batch Covariate...") |
|
459 |
- base_task_list = ext_task_list |
|
460 |
- # BATCH |
|
461 |
- ext_task_list = list() |
|
462 |
- for( task in names(base_task_list)){ |
|
463 |
- base_nom = task |
|
464 |
- |
|
465 |
- if(!is.null(batch)){ |
|
466 |
- ext_nom = paste(base_nom,"BATCH",sep = "_") |
|
467 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
468 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
469 |
- ext_task_list[[ext_nom]]$batch = batch |
|
470 |
- } |
|
471 |
- ext_nom = paste(base_nom,"NOBATCH",sep = "_") |
|
472 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
473 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
474 |
- ext_task_list[[ext_nom]]$batch = NULL |
|
475 |
- } |
|
476 |
- print("Complete.") |
|
477 |
- |
|
478 |
- print("Extending Tasks by UV Covariates...") |
|
479 |
- base_task_list = ext_task_list |
|
480 |
- # UV |
|
481 |
- ext_task_list = list() |
|
482 |
- for( task in names(base_task_list)){ |
|
483 |
- base_nom = task |
|
484 |
- method_class = gsub("WEIGHT_.*","WEIGHT",task) |
|
485 |
- |
|
486 |
- ext_nom = paste(base_nom,"NOUV",sep = "_") |
|
487 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
488 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
489 |
- ext_task_list[[ext_nom]]$uv = NULL |
|
490 |
- |
|
491 |
- # HK UV |
|
492 |
- if(!HK_default){ |
|
493 |
- for(j in 1:dim_UV){ |
|
494 |
- ext_nom = paste(base_nom,paste("HK",j,sep = "_"),sep = "_") |
|
495 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
496 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
497 |
- ext_task_list[[ext_nom]]$uv = as.matrix(task_factor_list[[method_class]]$HK_scores[,1:j]) |
|
498 |
- } |
|
499 |
- # if(task_factor_list[[method_class]]$hk_nc > 1){ |
|
500 |
- # ext_nom = paste(base_nom,"HKCLUST",sep = "_") |
|
501 |
- # ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
502 |
- # ext_task_list[[ext_nom]]$nom = ext_nom |
|
503 |
- # ext_task_list[[ext_nom]]$uv = as.matrix(as.factor(task_factor_list[[method_class]]$pam_clustering)) |
|
504 |
- # } |
|
505 |
- } |
|
506 |
- # Qual UV |
|
507 |
- for(j in 1:dim_UV){ |
|
508 |
- ext_nom = paste(base_nom,paste("Q",j,sep = "_"),sep = "_") |
|
509 |
- ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
510 |
- ext_task_list[[ext_nom]]$nom = ext_nom |
|
511 |
- ext_task_list[[ext_nom]]$uv = as.matrix(evaluate_material$qual_scores[,1:j]) |
|
512 |
- } |
|
513 |
- # if(qual_nc > 1){ |
|
514 |
- # ext_nom = paste(base_nom,"QCLUST",sep = "_") |
|
515 |
- # ext_task_list[[ext_nom]] = base_task_list[[base_nom]] |
|
516 |
- # ext_task_list[[ext_nom]]$nom = ext_nom |
|
517 |
- # ext_task_list[[ext_nom]]$uv = as.matrix(as.factor(qual_clustering)) |
|
518 |
- # } |
|
519 |
- } |
|
520 |
- print("Complete.") |
|
521 |
- ext_task_list = ext_task_list[!grepl("NOBATCH_NOUV",names(ext_task_list))] |
|
522 |
- if(!to_store){ |
|
523 |
- finished_tasks = gsub(".Rdata","",list.files(evaluate_material$out_dir)) |
|
524 |
- ext_task_list = ext_task_list[! (names(ext_task_list) %in% finished_tasks)] |
|
525 |
- } |
|
526 |
- print(length(ext_task_list)) |
|
527 |
- print("Factor-Based Adjustment Normalization and Evaluation...") |
|
528 |
- factor_based_out = bplapply(ext_task_list,FUN = runAdjustTask, evaluate_material = evaluate_material,design = design,nested_model = nested_model,BPPARAM = param) |
|
529 |
- print("Complete.") |
|
530 |
- |
|
531 |
- return(list(factor_free_out = factor_free_out, factor_based_out = factor_based_out)) |
|
532 |
-} |
... | ... |
@@ -1,11 +1,6 @@ |
1 |
-library(DESeq,quietly = TRUE) |
|
2 |
-library(EDASeq,quietly = TRUE) |
|
3 |
-library(edgeR,quietly = TRUE) |
|
4 |
-library(sva,quietly = TRUE) |
|
5 |
-library(aroma.light,quietly = TRUE) |
|
6 |
-library(lme4,quietly = TRUE) |
|
7 |
- |
|
8 | 1 |
#' Upper-quartile normalization wrapper. |
2 |
+#' @importFrom EDASeq betweenLaneNormalization |
|
3 |
+#' @export |
|
9 | 4 |
#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
10 | 5 |
#' @return Upper-quartile normalized matrix (scaled by sample UQ) |
11 | 6 |
UQ_FN = function(ei){ |
... | ... |
@@ -34,6 +29,7 @@ uq_f <- function(which="upper", round = FALSE){ |
34 | 29 |
# }) |
35 | 30 |
|
36 | 31 |
#' Upper-quartile normalization applied to positive data. |
32 |
+#' @export |
|
37 | 33 |
#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
38 | 34 |
#' @return Upper-quartile (positive) normalized matrix (scaled by sample UQ) |
39 | 35 |
UQ_FN_POS = function(ei){ |
... | ... |
@@ -47,6 +43,8 @@ UQ_FN_POS = function(ei){ |
47 | 43 |
} |
48 | 44 |
|
49 | 45 |
#' Full-Quantile normalization wrapper. |
46 |
+#' @importFrom aroma.light normalizeQuantileRank.matrix |
|
47 |
+#' @export |
|
50 | 48 |
#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
51 | 49 |
#' @return FQ normalized matrix. |
52 | 50 |
FQ_FN = function(ei){ |
... | ... |
@@ -55,6 +53,7 @@ FQ_FN = function(ei){ |
55 | 53 |
} |
56 | 54 |
|
57 | 55 |
#' Full-Quantile normalization applied to positive data. |
56 |
+#' @export |
|
58 | 57 |
#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
59 | 58 |
#' @return FQ (positive) normalized matrix. |
60 | 59 |
FQ_FN_POS = function(ei){ |
... | ... |
@@ -112,6 +111,8 @@ FQ_FN_POS = function(ei){ |
112 | 111 |
} |
113 | 112 |
|
114 | 113 |
#' DESeq normalization wrapper. |
114 |
+#' @importFrom DESeq estimateSizeFactorsForMatrix |
|
115 |
+#' @export |
|
115 | 116 |
#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
116 | 117 |
#' @return DESeq normalized matrix (scaled by sample ) |
117 | 118 |
DESEQ_FN = function(ei){ |
... | ... |
@@ -121,6 +122,7 @@ DESEQ_FN = function(ei){ |
121 | 122 |
} |
122 | 123 |
|
123 | 124 |
#' DESeq normalization applied to positive data. |
125 |
+#' @export |
|
124 | 126 |
#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
125 | 127 |
#' @return DESeq (positive) normalized matrix (scaled by sample ) |
126 | 128 |
DESEQ_FN_POS = function(ei){ |
... | ... |
@@ -153,6 +155,8 @@ DESEQ_FN_POS = function(ei){ |
153 | 155 |
} |
154 | 156 |
|
155 | 157 |
#' TMM normalization wrapper. |
158 |
+#' @importFrom edgeR calcNormFactors |
|
159 |
+#' @export |
|
156 | 160 |
#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
157 | 161 |
#' @return TMM normalized matrix (scaled by sample ) |
158 | 162 |
TMM_FN = function(ei){ |
... | ... |
@@ -161,232 +165,3 @@ TMM_FN = function(ei){ |
161 | 165 |
return(eo) |
162 | 166 |
} |
163 | 167 |
|
164 |
-#' Output residuals + intercept of fit to quality score matrix |
|
165 |
-#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
166 |
-#' @param default_args_list = list of default arguments |
|
167 |
-#' @param args_list = list of user-provided arguments |
|
168 |
-#' @return Matrix adjusted for K quality scores |
|
169 |
- |
|
170 |
-QUALREG_FN = function(ei, default_args_list, args_list){ |
|
171 |
- EPSILON = default_args_list$EPSILON |
|
172 |
- scores = prcomp(default_args_list$q,center = TRUE,scale = TRUE)$x[,1:default_args_list$K] |
|
173 |
- re = log(ei + EPSILON) |
|
174 |
- for (g in 1:dim(ei)[1]){ |
|
175 |
- lin.mod = lm(re[g,] ~ scores) |
|
176 |
- re[g,] = lin.mod$coefficients[1] + lin.mod$residuals |
|
177 |
- } |
|
178 |
- re = exp(re) |
|
179 |
- re[e == 0] = 0 |
|
180 |
- return(re) |
|
181 |
-} |
|
182 |
- |
|
183 |
-#' Output residuals + intercept of fit to control-gene derived factors |
|
184 |
-#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
185 |
-#' @param default_args_list = list of default arguments |
|
186 |
-#' @param args_list = list of user-provided arguments |
|
187 |
-#' @return Matrix adjusted for K unwanted factors |
|
188 |
- |
|
189 |
-RUVG_FN = function(ei, default_args_list, args_list){ |
|
190 |
- EPSILON = default_args_list$EPSILON |
|
191 |
- scores = prcomp(t(log(ei[default_args_list$control_genes,]+default_args_list$EPSILON)),center = TRUE,scale. = FALSE)$x[,1:default_args_list$K] |
|
192 |
- re = log(ei + default_args_list$EPSILON) |
|
193 |
- for (g in 1:dim(ei)[1]){ |
|
194 |
- lin.mod = lm(re[g,] ~ scores) |
|
195 |
- re[g,] = lin.mod$coefficients[1] + lin.mod$residuals |
|
196 |
- } |
|
197 |
- re = exp(re) |
|
198 |
- re[e == 0] = 0 |
|
199 |
- return(re) |
|
200 |
-} |
|
201 |
- |
|
202 |
-#' ComBat Wrapper |
|
203 |
-#' @param ei = Numerical matrix. (rows = genes, cols = samples). Unique row.names are required. |
|
204 |
-#' @param default_args_list = list of default arguments |
|
205 |
-#' @param args_list = list of user-provided arguments |
|
206 |
-#' @return Matrix adjusted for batch |
|
207 |
- |
|
208 |
-COMBAT_FN = function(ei, default_args_list, args_list){ |
|
209 |
- EPSILON = default_args_list$EPSILON |
|
210 |
- SD_EPSILON = default_args_list$SD_EPSILON |
|
211 |
- ei = log(ei + default_args_list$EPSILON) |
|
212 |
- pheno = as.data.frame(cbind(default_args_list$tech_batch,default_args_list$bio_cond)) |
|
213 |
- if(is.null(default_args_list$bio_cond) || any(is.na(default_args_list$bio_cond))){ |
|
214 |
- colnames(pheno) = c("batch") |
|
215 |
- is_var = apply(ei,1,sd) > SD_EPSILON |
|
216 |
- mod = model.matrix(~ 1, data = pheno) |
|
217 |
- }else{ |
|
218 |
- colnames(pheno) = c("batch","phenotype") |
|
219 |
- is_var = T |
|
220 |
- for (p in unique(default_args_list$bio_cond)){ |
|
221 |
- is_var = is_var & (apply(ei[,default_args_list$bio_cond == p],1,sd) > SD_EPSILON) |
|
222 |
- } |
|
223 |
- mod = model.matrix(~ as.factor(phenotype), data = pheno) |
|
224 |
- } |
|
225 |
- eo = ei |
|
226 |
- eo[is_var,] = ComBat(dat=ei[is_var,], batch=default_args_list$tech_batch, mod=mod, par.prior=TRUE, prior.plots=FALSE) |
|
227 |
- eo = exp(eo) |
|
228 |
- eo[ei == 0] = 0 |
|
229 |
- return(eo) |
|
230 |
-} |
|
231 |
- |
|
232 |
-NESTED_BATCH_FIXED_FN = function(ei,bio,batch,uv = NULL,w = NULL){ |
|
233 |
- if(!is.null(bio)){ |
|
234 |
- bio = factor(bio, levels = sort(levels(bio))) |
|
235 |
- batch = factor(batch, levels = unique(batch[order(bio)])) |
|
236 |
- |
|
237 |
- n_vec <- tapply(batch, bio, function(x) nlevels(droplevels(x))) |
|
238 |
- }else{ |
|
239 |
- n_vec = nlevels(droplevels(batch)) |
|
240 |
- } |
|
241 |
- |
|
242 |
- mat = matrix(0,nrow = sum(n_vec),ncol = sum(n_vec - 1)) |
|
243 |
- xi = 1 |
|
244 |
- yi = 1 |
|
245 |
- for(i in 1:length(n_vec)){ |
|
246 |
- if(n_vec[i] > 1){ |
|
247 |
- cs = contr.sum(n_vec[i]) |
|
248 |
- dd = dim(cs) |
|
249 |
- mat[xi:(xi + dd[1] - 1),yi:(yi + dd[2] - 1)] = cs |
|
250 |
- xi = xi + dd[1] |
|
251 |
- yi = yi + dd[2] |
|
252 |
- }else{ |
|
253 |
- xi = xi + 1 |
|
254 |
- } |
|
255 |
- } |
|
256 |
- |
|
257 |
- leo = log(ei+1) |
|
258 |
- if(is.null(uv)){ |
|
259 |
- if(!is.null(bio)) { |
|
260 |
- design_mat <- model.matrix(~ bio + batch, contrasts=list(bio=contr.sum, batch=mat)) |
|
261 |
- } else { |
|
262 |
- design_mat <- model.matrix(~ batch, contrasts=list(batch=mat)) |
|
263 |
- } |
|
264 |
- lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
265 |
- |
|
266 |
- bind <- (ncol(lm_object$coefficients) - (sum(n_vec - 1) - 1)):(ncol(lm_object$coefficients)) |
|
267 |
- leo = leo - t(attr(design_mat,"contrasts")$batch %*% t(lm_object$coefficients[,bind]))[,batch] |
|
268 |
- } else { |
|
269 |
- if(!is.null(bio)) { |
|
270 |
- design_mat <- model.matrix(~ bio + batch + uv, contrasts=list(bio=contr.sum, batch=mat)) |
|
271 |
- } else { |
|
272 |
- design_mat <- model.matrix(~ batch + uv, contrasts=list(batch=mat)) |
|
273 |
- } |
|
274 |
- lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
275 |
- |
|
276 |
- uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
277 |
- bind = (ncol(lm_object$coefficients) - dim(uv)[2] - (sum(n_vec - 1) - 1)):(ncol(lm_object$coefficients) - dim(uv)[2]) |
|
278 |
- leo = leo - t(attr(design_mat,"contrasts")$batch %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
279 |
- } |
|
280 |
- return(leo) |
|
281 |
-} |
|
282 |
- |
|
283 |
-NESTED_BATCH_RANDOM_FN = function(ei,bio,batch,uv = NULL,w = NULL){ |
|
284 |
- design <- model.matrix(~batch - 1) |
|
285 |
- leo = log(ei+1) |
|
286 |
- if(is.null(uv)){ |
|
287 |
- for(i in 1:dim(ei)[1]){ |
|
288 |
- lm_object = lmer(leo[i,] ~ 1 + bio + (1 | batch),weights = w[i,]) |
|
289 |
- betaj <- ranef(lm_object)[[1]][,1] |
|
290 |
- leo[i,] = leo[i,] - as.vector(design %*% betaj) |
|
291 |
- } |
|
292 |
- }else{ |
|
293 |
- for(i in 1:dim(ei)[1]){ |
|
294 |
- lm_object = lmer(leo[i,] ~ 1 + bio + uv + (1 | batch), |
|
295 |
- weights = w[i,]) |
|
296 |
- betaj <- ranef(lm_object)[[1]][,1] |
|
297 |
- uvind = (length(coef(lm_object)[[1]][1,]) - dim(uv)[2] + 1):(length(coef(lm_object)[[1]][1,])) |
|
298 |
- leo[i,] = leo[i,] - as.vector(design %*% betaj) - as.vector(uv %*% t(coef(lm_object)[[1]][1,][uvind])) |
|
299 |
- } |
|
300 |
- } |
|
301 |
- return(leo) |
|
302 |
-} |
|
303 |
- |
|
304 |
-FACTORIAL_BATCH_FN = function(ei,bio,batch,uv = NULL,w = NULL){ |
|
305 |
- contrasts <- contr.treatment(length(levels(batch))) |
|
306 |
- rownames(contrasts) = levels(batch) |
|
307 |
- leo = log(ei+1) |
|
308 |
- |
|
309 |
- if(is.null(bio)){ |
|
310 |
- if(is.null(uv)){ |
|
311 |
- |
|
312 |
- # Just Batch |
|
313 |
- design_mat <- model.matrix(~batch) |
|
314 |
- lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
315 |
- bind = (ncol(lm_object$coefficients) + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients)) |
|
316 |
- leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] |
|
317 |
- } else { |
|
318 |
- |
|
319 |
- # Batch + UV |
|
320 |
- design_mat <- model.matrix(~batch + uv) |
|
321 |
- lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
322 |
- uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
323 |
- bind = (ncol(lm_object$coefficients) - dim(uv)[2] + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients) - dim(uv)[2]) |
|
324 |
- leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
325 |
- } |
|
326 |
- } else { |
|
327 |
- if(is.null(uv)){ |
|
328 |
- |
|
329 |
- # Batch + BIO |
|
330 |
- design_mat <- model.matrix(~bio + batch) |
|
331 |
- lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
332 |
- bind = (ncol(lm_object$coefficients) + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients)) |
|
333 |
- leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] |
|
334 |
- |
|
335 |
- } else { |
|
336 |
- |
|
337 |
- # Batch + UV + BIO |
|
338 |
- design_mat <- model.matrix(~bio + batch + uv) |
|
339 |
- lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
340 |
- uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
341 |
- bind = (ncol(lm_object$coefficients) - dim(uv)[2] + (2 - (length(levels(batch))))):(ncol(lm_object$coefficients) - dim(uv)[2]) |
|
342 |
- leo = leo - t(contrasts %*% t(lm_object$coefficients[,bind]))[,batch] - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
343 |
- } |
|
344 |
- } |
|
345 |
- return(leo) |
|
346 |
-} |
|
347 |
- |
|
348 |
-BATCHFREE_FN = function(ei,bio,uv = NULL,w = NULL){ |
|
349 |
- leo = log(ei+1) |
|
350 |
- |
|
351 |
- if(is.null(bio)){ |
|
352 |
- # Just UV |
|
353 |
- design_mat <- model.matrix(~uv) |
|
354 |
- lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
355 |
- uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
356 |
- leo = leo - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
357 |
- } else { |
|
358 |
- # Bio + UV |
|
359 |
- design_mat <- model.matrix(~bio + uv) |
|
360 |
- lm_object <- lmFit(leo, design = design_mat, weights = w) |
|
361 |
- uvind = (ncol(lm_object$coefficients) - dim(uv)[2] + 1):(ncol(lm_object$coefficients)) |
|
362 |
- leo2 = leo - t(uv %*% t(lm_object$coefficients[,uvind])) |
|
363 |
- } |
|
364 |
- return(leo) |
|
365 |
-} |
|
366 |
- |
|
367 |
-ADJUST_FN = function(ei,batch = NULL, |
|
368 |
- bio = NULL, |
|
369 |
- uv = NULL, |
|
370 |
- w = NULL, |
|
371 |
- design = c("factorial","nested"), |
|
372 |
- nested_model = c("fixed","random")){ |
|
373 |
- |
|
374 |
- if(!is.null(batch)){ |
|
375 |
- design <- match.arg(design) |
|
376 |
- |
|
377 |
- if(design == "factorial"){ |
|
378 |
- leo = FACTORIAL_BATCH_FN(ei,bio,batch,uv,w) |
|
379 |
- }else{ |
|
380 |
- nested_model <- match.arg(nested_model) |
|
381 |
- |
|
382 |
- if(nested_model == "fixed"){ |
|
383 |
- leo = NESTED_BATCH_FIXED_FN(ei,bio,batch,uv,w) |
|
384 |
- }else{ |
|
385 |
- leo = NESTED_BATCH_RANDOM_FN(ei,bio,batch,uv,w) |
|
386 |
- } |
|
387 |
- } |
|
388 |
- }else{ |
|
389 |
- leo = BATCHFREE_FN(ei,bio,uv,w) |
|
390 |
- } |
|
391 |
- return(leo) |
|
392 |
-} |
|
393 | 168 |
\ No newline at end of file |
394 | 169 |
deleted file mode 100644 |
... | ... |
@@ -1,69 +0,0 @@ |
1 |
-out_dir = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data" |
|
2 |
-flist = paste(out_dir,list.files(out_dir),sep = "/") |
|
3 |
- |
|
4 |
-# Scoring |
|
5 |
-scores = NULL |
|
6 |
-snames = NULL |
|
7 |
-for(fil in flist){ |
|
8 |
- load(fil) |
|
9 |
- nom = scone_out$method |
|
10 |
- sc = scone_out$evaluation |
|
11 |
- if(!is.na(sc)[1]){ |
|
12 |
- print(nom) |
|
13 |
- snames = c(snames,nom) |
|
14 |
- scores = rbind(scores,sc$scores) |
|
15 |
- }else{ |
|
16 |
- # print(nom) |
|
17 |
- } |
|
18 |
-} |
|
19 |
-rownames(scores) = snames |
|
20 |
-escores = t(t(scores)*c(-1,-1,1,1,1,-1,1)) |
|
21 |
-colnames(escores) = paste(colnames(scores),c("_INV","")[1.5 + c(-1,-1,1,1,1,-1,1)/2],sep = "") |
|
22 |
- |
|
23 |
-scores = t(na.omit(t(scores))) |
|
24 |
-escores = t(na.omit(t(escores))) |
|
25 |
- |
|
26 |
-# Score Heatmaps |
|
27 |
-require(gplots) |
|
28 |
-heatmap.2(t(apply(scores ,2,rank)[order(-apply((apply(escores ,2,rank)),1,min)),][1:20,]),density.info = 'n',trace = 'n',key.xlab = "Score",key.title = NA, |
|
29 |
- Colv = NA,col = colorRampPalette(c("purple","black","yellow"))(100), |
|
30 |
- margins = c(15,10),cexRow = .75,cexCol = .5) |
|
31 |
-title(main = "Top 20 SCONE",cex.lab=0.5) |
|
32 |
- |
|
33 |
-my_rank = (1 + max(rank(apply((apply(escores ,2,rank)),1,min))) - rank(apply((apply(escores ,2,rank)),1,min))) |
|
34 |
- |
|
35 |
-names(which(my_rank == 1)) |
|
36 |
-load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data/NOIMPUTE_NONE.Rdata") |
|
37 |
-is_zero = scone_out$exp_mat == 0 |
|
38 |
-load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/data/NOIMPUTE_FQ_NOWEIGHT_NOBIO_NOBATCH_HK_1.Rdata") |
|
39 |
-myma = scone_out$exp_mat |
|
40 |
-hk = read.table("~/YosefCode/packages/RNASeq/summary/EXAMPLE/reference_files/house_keeping_human_names.txt")$V1 |
|
41 |
-myma[is_zero] = 0 |
|
42 |
-fnr_out = estimateFNR(myma,bulk_model = T,is.expressed = rownames(myma) %in% as.character(hk) ) |
|
43 |
-w = 0 + !is_zero |
|
44 |
-w[is_zero] = 1-fnr_out$Z[is_zero] |
|
45 |
-wpc = wPCA(log(scone_out$exp_mat + 1),w,filt = T) |
|
46 |
-plot(wpc$x[,1:2], col = cols) |
|
47 |
- |
|
48 |
-rownames(wpc$rotation) = rownames(scone_out$exp_mat)[wpc$to.pass] |
|
49 |
-sort(-abs(wpc$rotation[,2]))[1:20] |
|
50 |
- |
|
51 |
-load("/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/evaluate_material.Rdata") |
|
52 |
-pchs = 16 |
|
53 |
-cols = rainbow(length(levels(evaluate_material$batch)))[evaluate_material$batch] |
|
54 |
-plot(scone_out$evaluation$pc_val[,c(1,2)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells") |
|
55 |
-legend("bottomleft",legend = (levels(evaluate_material$batch)),pch = 16, col = rainbow(length(levels(evaluate_material$batch))), cex = .5) |
|
56 |
-outy = log(scone_out$exp_mat+1) |
|
57 |
-cols = rainbow(2)[scone_out$evaluation$clusters] |
|
58 |
-plot(scone_out$evaluation$pc_val[,c(1,3)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells") |
|
59 |
- |
|
60 |
-write.table(outy,file = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/NOIMPUTE_FQ_NOWEIGHT_NOBIO_NOBATCH_HK_1_log_counts.txt",quote = F,sep = "\t",col.names = T,row.names = T) |
|
61 |
- evaluate_material$ |
|
62 |
-evaluate_material$dim_eval |
|
63 |
-plot(evaluate_material$DE_scores[,c(1,2)], xlab = "wPC1", ylab = "wPC2",col = cols,pch = pchs,main = "Top SCONE (Filtered): T Cells") |
|
64 |
- |
|
65 |
- |
|
66 |
-my_rank = (1 + max(rank(apply((apply(escores ,2,rank)),1,min))) - rank(apply((apply(escores ,2,rank)),1,min))) |
|
67 |
-write.table(cbind(scores,my_rank),file = "/data/yosef2/HIV/dat/CMV_HIV_150wk/SCONE_out_rerun/scores.txt",quote = F,sep = "\t") |
|
68 |
- |
|
69 |
- |
70 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,244 +0,0 @@ |
1 |
-library(shiny) |
|
2 |
-library(EDASeq) |
|
3 |
-library(clusterCells) |
|
4 |
-load("../shiny/tmp.rda") |
|
5 |
- |
|
6 |
-library(RColorBrewer) |
|
7 |
-cc <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"),brewer.pal(12,"Set3"), |
|
8 |
- brewer.pal(8, "Accent")) |
|
9 |
- |
|
10 |
-choose_color <- function(input) { |
|
11 |
- if(input$color_code=="type") { |
|
12 |
- cols <- cc[type] |
|
13 |
- } else { |
|
14 |
- cols <- cc[batch] |
|
15 |
- } |
|
16 |
- return(cols) |
|
17 |
-} |
|
18 |
- |
|
19 |
-server <- function(input, output) { |
|
20 |
- output$boxplot <- renderPlot({ |
|
21 |
- set.seed(19883) |
|
22 |
- if(input$downsample) { |
|
23 |
- idx <- sample(1:100) |
|
24 |
- } else { |
|
25 |
- idx <- 1:ncol(norm) |
|
26 |
- } |
|
27 |
- boxplot(log_norm[,idx], boxcol=cc[get(input$color_code)[idx]], medlwd=5, col=cc[get(input$color_code)[idx]], |
|
28 |
- outline=FALSE, names=FALSE, ylab="log(count+1)", main="Log normalized counts") |
|
29 |
- lines(apply(log_norm[,idx], 2, median)) |
|
30 |
- } |
|
31 |
- ) |
|
32 |
- |
|
33 |
- output$rle <- renderPlot({ |
|
34 |
- set.seed(19883) |
|
35 |
- if(input$downsample & ncol(norm)>100) { |
|
36 |
- idx <- sample(1:100) |
|
37 |
- } else { |
|
38 |
- idx <- 1:ncol(norm) |
|
39 |
- } |
|
40 |
- boxplot(rle[,idx], boxcol=cc[get(input$color_code)[idx]], medlwd=5, col=cc[get(input$color_code)[idx]], |
|
41 |
- outline=FALSE, names=FALSE, ylab="Relative Log Expression", main="Relative Log Expression (RLE)") |
|
42 |
- abline(h=0, lty=2) |
|
43 |
- }) |
|
44 |
- |
|
45 |
- output$pca <- renderPlot( |
|
46 |
- plot(pca$x[,1:2], col=choose_color(input), pch=19, main="All genes") |
|
47 |
- ) |
|
48 |
- |
|
49 |
- output$pca_pc <- renderPlot( |
|
50 |
- plot(pca_pc$x[,1:2], col=choose_color(input), pch=19, main="Positive controls") |
|
51 |
- ) |
|
52 |
- |
|
53 |
- output$pca_nc <- renderPlot( |
|
54 |
- plot(pca_nc$x[,1:2], col=choose_color(input), pch=19, main="Negative controls") |
|
55 |
- ) |
|
56 |
- |
|
57 |
- output$pca_mv <- renderPlot( |
|
58 |
- if(input$variance=="variance") { |
|
59 |
- plot(pca_var$x[,1:2], col=choose_color(input), pch=19, main="Top 100 most variable genes") |
|
60 |
- } else { |
|
61 |
- plot(pca_cv$x[,1:2], col=choose_color(input), pch=19, main="Top 100 most variable genes") |
|
62 |
- } |
|
63 |
- ) |
|
64 |
- |
|
65 |
- output$qc_plot <- renderPlot({ |
|
66 |
- barplot(abs(cors), beside = TRUE, col=cc[1:ncol(qc)], border=cc[1:ncol(qc)], ylim=c(0, 1), |
|
67 |
- space=c(0, 4), main="Absolute correlation with QC scores") |
|
68 |
- legend("topright", colnames(qc), fill=cc, border=cc, cex=.5) |
|
69 |
- } |
|
70 |
- ) |
|
71 |
- |
|
72 |
- output$qc_pca <- renderPlot({ |
|
73 |
- plot(qc_pca$x[,1:2], pch=19, col=choose_color(input), main="Quality PCA") |
|
74 |
- } |
|
75 |
- ) |
|
76 |
- |
|
77 |
- output$qc_corr <- renderTable(abs(cors[,1:5])) |
|
78 |
- |
|
79 |
- output$pam_cc <- renderPlot({ |
|
80 |
- dualHeatmap(heatData = sub[[input$k]], main="Heatmap of co-clustering, based on 100 subsamplings", |
|
81 |
- clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, |
|
82 |
- annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
83 |
- annColor=list(Batch=cc, Type=cc)) |
|
84 |
- }) |
|
85 |
- |
|
86 |
- output$pam_type <- renderTable({ |
|
87 |
- table(pam[[input$k-1]]$clustering, type) |
|
88 |
- }) |
|
89 |
- |
|
90 |
- output$pam_batch <- renderTable({ |
|
91 |
- table(pam[[input$k-1]]$clustering, batch) |
|
92 |
- }) |
|
93 |
- |
|
94 |
- output$pam_pca <- renderPlot({ |
|
95 |
- plot(pca$x[,1:2], pch=19, col=cc[pam[[input$k-1]]$clustering], main="Expression PCA") |
|
96 |
- legend("topright", as.character(1:input$k), fill=cc, cex=.5) |
|
97 |
- }) |
|
98 |
- |
|
99 |
- output$pam_qc <- renderPlot({ |
|
100 |
- plot(qc_pca$x[,1:2], pch=19, col=cc[pam[[input$k-1]]$clustering], main="Quality PCA") |
|
101 |
- legend("topright", as.character(1:input$k), fill=cc, cex=.5) |
|
102 |
- }) |
|
103 |
- |
|
104 |
- output$heatmap_pc <- renderPlot({ |
|
105 |
- dualHeatmap(heatData = t(norm), main="Positive Control genes", |
|
106 |
- clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=poscon, |
|
107 |
- annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
108 |
- annColor=list(Batch=cc, Type=cc)) |
|
109 |
- }) |
|
110 |
- |
|
111 |
- output$heatmap_nc <- renderPlot({ |
|
112 |
- dualHeatmap(heatData = t(norm), main="Negative Control genes", |
|
113 |
- clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=negcon, |
|
114 |
- annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
115 |
- annColor=list(Batch=cc, Type=cc)) |
|
116 |
- }) |
|
117 |
- |
|
118 |
- output$heatmap_mv <- renderPlot({ |
|
119 |
- if(input$variance=="variance") { |
|
120 |
- genes <- mv_var |
|
121 |
- } else { |
|
122 |
- genes <- mv_cv |
|
123 |
- } |
|
124 |
- |
|
125 |
- dualHeatmap(heatData = t(norm), main="Top 100 most variable genes", |
|
126 |
- clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=genes, |
|
127 |
- annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
128 |
- annColor=list(Batch=cc, Type=cc)) |
|
129 |
- }) |
|
130 |
- |
|
131 |
- output$heatmap_pc1 <- renderPlot({ |
|
132 |
- genes <- names(sort(pca$rotation[,1], decreasing = TRUE)[1:100]) |
|
133 |
- dualHeatmap(heatData = t(norm), main="Top 100 genes by PC1 loadings", |
|
134 |
- clusterSamples=TRUE, clusterVars=TRUE, dual=FALSE, whVars=genes, |
|
135 |
- annLegend=FALSE, annCol = data.frame(Batch=batch, Type=type), |
|
136 |
- annColor=list(Batch=cc, Type=cc)) |
|
137 |
- }) |
|
138 |
- |
|
139 |
- output$box_pc1_pc <- renderPlot({ |
|
140 |
- boxplot(tapply(pca_pc$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of Positive Controls") |
|
141 |
- }) |
|
142 |
- |
|
143 |
- output$box_pc1_nc <- renderPlot({ |
|
144 |
- boxplot(tapply(pca_nc$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of Negative Controls") |
|
145 |
- }) |
|
146 |
- |
|
147 |
- output$box_pc1_qc <- renderPlot({ |
|
148 |
- boxplot(tapply(qc_pca$x[,1], get(input$color_code), function(x) x), las=2, col=cc, ylab="PC1 score", main="PCA of quality scores") |
|
149 |
- }) |
|
150 |
- |
|
151 |
- output$curve_dropout <- renderPlot({ |
|
152 |
- plot(logistic(fitted[order(fnr$Alpha[1,]),1])~sort(fnr$Alpha[1,]), type='l', ylim=c(0, 1), col=rgb(0, 0, 0, 0.2), xlab="log average expression", ylab="Probability of drop-out", main="Drop-out Curves") |
|
153 |
- for(i in 2:ncol(fitted)) { |
|
154 |
- lines(logistic(fitted[order(fnr$Alpha[1,]),i])~sort(fnr$Alpha[1,]), type='l', col=rgb(0, 0, 0, 0.2)) |
|
155 |
- } |
|
156 |
- lines(logistic(rowMeans(fitted)[order(fnr$Alpha[1,])])~sort(fnr$Alpha[1,]), col=2, lwd=2) |
|
157 |
- }) |
|
158 |
- |
|
159 |
- output$box_detection <- renderPlot({ |
|
160 |
- boxplot(tapply(colSums(norm>5), get(input$color_code), function(x) x), las=2, col=cc, ylab="Number of detected genes (> 5 reads)", main="Detected Genes") |
|
161 |
- }) |
|
162 |
- |
|
163 |
- output$box_dropout <- renderPlot({ |
|
164 |
- boxplot(tapply(colMeans(1-w), get(input$color_code), function(x) x), las=2, col=cc, ylab="Drop-out rate", main="Drop-out Rate") |
|
165 |
- }) |
|
166 |
- |
|
167 |
- } |
|
168 |
- |
|
169 |
-ui <- fluidPage( |
|
170 |
- titlePanel("In-depth Normalization Report"), |
|
171 |
- |
|
172 |
- sidebarLayout( |
|
173 |
- sidebarPanel( |
|
174 |
- selectInput("color_code", label = "Color code / stratify by", |
|
175 |
- choices = list("Batch"="batch", "Cell Type"="type"), |
|
176 |
- selected = "batch"), |
|
177 |
- selectInput("variance", label = "Select most variable genes by", |
|
178 |
- choices = list("Variance"="variance", "Coefficient of variation"="cv"), |
|
179 |
- selected = "cv"), |
|
180 |
- sliderInput("k", label = "Number of clusters", |
|
181 |
- min=2, max=10, |
|
182 |
- value = 7), |
|
183 |
- checkboxInput("downsample", label="Downsample to 100 cells for boxplot panel", value=TRUE), |
|
184 |
- helpText("RLE: relative log expression."), |
|
185 |
- helpText("PCA: principal component analysis."), |
|
186 |
- helpText("QC: quality control."), |
|
187 |
- helpText("The 'Boxplot' panel shows the read count distribution and the RLE plot."), |
|
188 |
- helpText("The 'PCA' panel shows the PCA of different subset of genes."), |
|
189 |
- helpText("The 'QC' panel shows the absolute correlation between the first 10 principal components of the expression matrix and the QC measures. It also shows the PCA of the QC measures. Note that the PCA score signs are not meaningful (e.g., 'good' cells could have a low value of PC1 and 'bad' cells a high value)."), |
|
190 |
- helpText("The 'Cluster stability' panel shows a heatmap of the PAM co-clustering based on 100 subsamples of the data. It shows the relation between the clusters obtained by PAM (on the original data) to cell type, batch, expression PCA, and QC PCA. Note that these results are reported for normalization evaluation only. A proper cluster analysis should be performed to find sub-populations in the data."), |
|
191 |
- helpText("The 'Heatmaps' panel shows the log expression of the positive controls, negative controls, genes with highest PC1 loadings, and most variable genes."), |
|
192 |
- helpText("The 'Confounding' panel shows the first PC of positive controls, negative controls, and QC stratified by either cell type or batch. This is useful to identify lower quality batches and potential confounding between quality and cell type."), |
|
193 |
- helpText("The 'Drop-out' panel shows the number of detected genes and the drop-out rate stratified by either cell type or batch. It also shows the probability of drop-out as a function of the average expression"), |
|
194 |
- helpText("The 'Number of clusters' slider controls the PAM clustering in the 'Cluster stability' tab."), |
|
195 |
- helpText("Please note that with many cells (>500), some plots may take several seconds (sometimes minutes) to appear."), |
|
196 |
- helpText("By default, we downsample the data to 100 cells for the boxplot visualization (only for the 'Boxplot' tab). Please uncheck it to see the full dataset (it may take some time).") |
|
197 |
- ), |
|
198 |
- |
|
199 |
- mainPanel( |
|
200 |
- tabsetPanel(type = "tabs", |
|
201 |
- ## Add a "summary" tab (wait for Michael's text) |
|
202 |
- tabPanel("Boxplot", |
|
203 |
- plotOutput("boxplot"), |
|
204 |
- plotOutput("rle")), |
|
205 |
- tabPanel("PCA", |
|
206 |
- plotOutput("pca"), |
|
207 |
- plotOutput("pca_pc"), |
|
208 |
- plotOutput("pca_nc"), |
|
209 |
- plotOutput("pca_mv")), |
|
210 |
- tabPanel("QC", |
|
211 |
- plotOutput("qc_plot"), |
|
212 |
- tableOutput("qc_corr"), |
|
213 |
- plotOutput("qc_pca")), |
|
214 |
- tabPanel("Cluster stability", |
|
215 |
- plotOutput("pam_cc"), |
|
216 |
- fluidRow( |
|
217 |
- column(3, |
|
218 |
- h3("Type"), |
|
219 |
- br(), |
|
220 |
- tableOutput("pam_type")), |
|
221 |
- column(3, |
|
222 |
- h3("Batch"), |
|
223 |
- br(), |
|
224 |
- tableOutput("pam_batch"))), |
|
225 |
- plotOutput("pam_pca"), |
|
226 |
- plotOutput("pam_qc")), |
|
227 |
- tabPanel("Heatmaps", |
|
228 |
- plotOutput("heatmap_pc"), |
|
229 |
- plotOutput("heatmap_nc"), |
|
230 |
- plotOutput("heatmap_pc1"), |
|
231 |
- plotOutput("heatmap_mv")), |
|
232 |
- tabPanel("Confounding", |
|
233 |
- plotOutput("box_pc1_pc"), |
|
234 |
- plotOutput("box_pc1_nc"), |
|
235 |
- plotOutput("box_pc1_qc")), |
|
236 |
- tabPanel("Drop-out", |
|
237 |
- plotOutput("box_detection"), |
|
238 |
- plotOutput("box_dropout"), |
|
239 |
- plotOutput("curve_dropout")) |
|
240 |
- ) |
|
241 |
- |
|
242 |
- ))) |
|
243 |
- |
|
244 |
-shinyApp(ui = ui, server = server) |
245 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,93 +0,0 @@ |
1 |
-Lik_Fun = function(Z,X,Beta){ |
|
2 |
- return((1-Z) + (2*Z - 1)/(1+exp(-X %*% Beta))) |
|
3 |
-} |
|
4 |
- |
|
5 |
-Post_Z = function(Y,W,Alpha,X,Beta){ |
|
6 |
- return(Lik_Fun(Y,W,Alpha)*Lik_Fun(1,X,Beta)/(Lik_Fun(Y,W,Alpha)*Lik_Fun(1,X,Beta) + (Y == 0)*Lik_Fun(0,X,Beta))) |
|
7 |
-} |
|
8 |
- |
|
9 |
-estimateFNR = function(x, bulk_model = FALSE, is.control = TRUE, K = 0, X = NULL, Alpha = NULL, |
|
10 |
- is.expressed = NULL, niter = 100, ntresh = 0){ |
|
11 |
- |
|
12 |
- require(matrixStats) |
|
13 |
- Y = t(matrix(as.numeric(x > ntresh ),nrow = dim(x)[1])) |
|
14 |
- |
|
15 |
- if(bulk_model){ |
|
16 |
- x2 = x |
|
17 |
- x2[x2 <= ntresh] = NA |
|
18 |
- Alpha = t(matrix(log(rowMedians(x2, na.rm=TRUE)))) |
|
19 |
- } |
|
20 |
- |
|
21 |
- # Prepare Wanted Variation and Gene-Intrinsic Intercept |
|
22 |
- X = cbind(X,rep(1,dim(x)[2])) |
|
23 |
- is.intercept = dim(X)[2] == 1 |
|
24 |
- Beta = matrix(0,dim(X)[2],dim(x)[1]) |
|
25 |
- |
|
26 |
- # Prepare Unwanted Variation (K factors) and Sample-Intrinsic Intercept |
|
27 |
- if( K == 0 || !is.null(Alpha) ){ |
|
28 |
- Alpha = rbind(Alpha,rep(1,dim(x)[1])) |
|
29 |
- K = dim(Alpha)[1] - 1 |
|
30 |
- }else{ |
|
31 |
- M = 2*Y[is.control,] - 1 |
|
32 |
- M = t(t(M - rowMeans(M)) - colMeans(M)) + mean(M) |
|
33 |
- Alpha = rbind(t(svd(M,nv = K)$v[,1:K]),rep(1,dim(x)[1])) |
|
34 |
- } |
|
35 |
- W = matrix(0,dim(x)[2],K+1) |
|
36 |
- |
|
37 |
- # Initialize Z |
|
38 |
- Z = Post_Z(Y,W,Alpha,X,Beta) |
|
39 |
- |
|
40 |
- # If control genes are supplied, W fit once, according to control genes |
|
41 |
- if(!is.null(is.expressed)){ |
|
42 |
- cY = Y[,is.expressed] |
|
43 |
- cAlpha = Alpha[,is.expressed] |
|
44 |
- for (i in 1:dim(Z)[1]){ |
|
45 |
- W[i,] = glm(cbind(cY[i,],1-cY[i,]) ~ 0 + t(cAlpha),quasibinomial)$coefficients |
|
46 |
- } |
|
47 |
- } |
|
48 |
- |
|
49 |
- ## DR: it seems strange to me that this is a for loop and not a while !converged |
|
50 |
- for(n in 1:niter){ |
|
51 |
- # Update Beta |
|
52 |
- if(!is.intercept){ |
|
53 |
- |
|
54 |
- if(is.null(is.expressed)){ |
|
55 |
- for(i in 1:dim(Z)[2]){ |
|
56 |
- Beta[,i] = glm(cbind(Z[,i],1-Z[,i]) ~ 0 + X,quasibinomial)$coefficients |
|
57 |
- } |
|
58 |
- |
|
59 |
- }else{ # Only over non-control genes |
|
60 |
- for(i in (1:dim(Z)[2])[!is.expressed]){ |
|
61 |
- Beta[,i] = glm(cbind(Z[,i],1-Z[,i]) ~ 0 + X,quasibinomial)$coefficients |
|
62 |
- } |
|
63 |
- } |
|
64 |
- |
|
65 |
- |
|
66 |
- }else{ |
|
67 |
- # Short-cut! |
|
68 |
- Beta = t(matrix(log(colMeans(Z)/(1-colMeans(Z))))) |
|
69 |
- } |
|
70 |
- |
|
71 |
- # Update Z |
|
72 |
- Z = Post_Z(Y,W,Alpha,X,Beta) |
|
73 |
- |
|
74 |
- if(is.null(is.expressed)){ |
|
75 |
- # Update W |
|
76 |
- for (i in 1:dim(Z)[1]){ |
|
77 |
- W[i,] = glm(cbind(Y[i,],(1-Y[i,])*Z[i,]) ~ 0 + t(Alpha),quasibinomial)$coefficients |
|
78 |
- } |
|
79 |
- |
|
80 |
- # Update Z |
|
81 |
- Z = Post_Z(Y,W,Alpha,X,Beta) |
|
82 |
- } |
|
83 |
- } |
|
84 |
- |
|
85 |
- # Stick to our initial assumptions |
|
86 |
- if(!is.null(is.expressed)){ |
|
87 |
- Beta[,is.expressed] = NA |
|
88 |
- Z[,is.expressed] = 1 |
|
89 |
- } |
|
90 |
- EL = sum(na.omit(as.vector(Z*log(Lik_Fun(Y,W,Alpha))))) + sum(na.omit(as.vector(Z*log(Lik_Fun(1,X,Beta))))) + sum(na.omit(as.vector((1-Z)*log(Lik_Fun(0,X,Beta))))) |
|
91 |
- |
|
92 |
- return(list(Z = t(Z), EL = EL, P = t(Lik_Fun(1,W,Alpha)), W = W, X = X, Alpha = Alpha, Beta = Beta)) |
|
93 |
-} |
|
94 | 0 |
\ No newline at end of file |
... | ... |
@@ -2,7 +2,6 @@ |
2 | 2 |
#' |
3 | 3 |
#' This function is used internally in scone to parse the variables used to generate the design matrices. |
4 | 4 |
#' |
5 |
-#' |
|
6 | 5 |
#' @param pars character. A vector of parameters corresponding to a row of params. |
7 | 6 |
#' @param bio factor. The biological factor of interest. |
8 | 7 |
#' @param batch factor. The known batch effects. |
... | ... |
@@ -44,6 +43,8 @@ parse_row <- function(pars, bio, batch, ruv_factors, qc) { |
44 | 43 |
#' the bio variable. Here, nested means that each batch is made of observations from only one level of bio, |
45 | 44 |
#' while each level of bio may contain multiple batches. |
46 | 45 |
#' |
46 |
+#' @export |
|
47 |
+#' |
|
47 | 48 |
#' @param bio factor. The biological factor of interest. |
48 | 49 |
#' @param batch factor. The known batch effects. |
49 | 50 |
#' @param W numeric. Either a vector or matrix containing one or more continuous covariates (e.g. RUV factors). |
... | ... |
@@ -111,11 +112,15 @@ make_design <- function(bio, batch, W, nested=FALSE) { |
111 | 112 |
#' for which expression needs to be adjusted, start with either the word "batch" or the letter "W" (case sensitive). |
112 | 113 |
#' Any other covariate (including the intercept) is kept. |
113 | 114 |
#' |
115 |
+#' @importFrom limma lmFit |
|
116 |
+#' @export |
|
117 |
+#' |
|
114 | 118 |
#' @param log_expr matrix. The log gene expression (genes in row, samples in columns). |
115 | 119 |
#' @param design_mat matrix. The design matrix (usually the result of make_design). |
116 |
-#' |
|
120 |
+#' @param batch factor. A factor with the batch information. |
|
121 |
+#' @param weights matrix. A matrix of weights. |
|
117 | 122 |
#' @return The corrected log gene expression. |
118 |
-lm_adjust <- function(log_expr, design_mat, weights=NULL) { |
|
123 |
+lm_adjust <- function(log_expr, design_mat, batch=NULL, weights=NULL) { |
|
119 | 124 |
lm_object <- lmFit(log_expr, design = design_mat, weights = weights) |
120 | 125 |
|
121 | 126 |
uvind <- grep("^W", colnames(design_mat)) |
... | ... |
@@ -28,18 +28,24 @@ library(class) |
28 | 28 |
#' If NULL, condition KNN concordance will be returned NA. |
29 | 29 |
#' @param batch factor. The known batch variable (variation to be removed), NA is allowed. |
30 | 30 |
#' If NULL, batch KNN concordance will be returned NA. |
31 |
-#' @param qc_factors. Factors of unwanted variation derived from quality metrics. |
|
31 |
+#' @param qc_factors Factors of unwanted variation derived from quality metrics. |
|
32 | 32 |
#' If NULL, qc correlations will be returned NA. |
33 |
-#' @param ruv_factors. Factors of unwanted variation derived from negative control genes (adjustable set). |
|
33 |
+#' @param ruv_factors Factors of unwanted variation derived from negative control genes (adjustable set). |
|
34 | 34 |
#' If NULL, ruv correlations will be returned NA. |
35 |
-#' @param uv_factors. Factors of unwanted variation derived from negative control genes (un-adjustable set). |
|
35 |
+#' @param uv_factors Factors of unwanted variation derived from negative control genes (un-adjustable set). |
|
36 | 36 |
#' If NULL, uv correlations will be returned NA. |
37 |
-#' @param wv_factors. Factors of wanted variation derived from positive control genes (un-adjustable set). |
|
37 |
+#' @param wv_factors Factors of wanted variation derived from positive control genes (un-adjustable set). |
|
38 | 38 |
#' If NULL, wv correlations will be returned NA. |
39 | 39 |
#' @param is_log logical. If TRUE the expr matrix is already logged and log transformation will not be carried out. |
40 | 40 |
#' @param conditional_pam logical. If TRUE then maximum ASW is separately computed for each biological condition (including NA), |
41 | 41 |
#' and a weighted average is returned. |
42 | 42 |
#' |
43 |
+#' @importFrom scde bwpca |
|
44 |
+#' @importFrom class knn |
|
45 |
+#' @importFrom fpc pamk |
|
46 |
+#' |
|
47 |
+#' @export |
|
48 |
+#' |
|
43 | 49 |
#' @return A list with the following elements: |
44 | 50 |
#' \itemize{ |
45 | 51 |
#' \item{KNN_BIO}{K-NN concordance rate by biological condition.} |
... | ... |
@@ -72,7 +78,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, |
72 | 78 |
if(is.null(weights)){ |
73 | 79 |
proj = prcomp(t(expr),center = TRUE,scale. = TRUE)$x[,1:eval_pcs] |
74 | 80 |
}else{ |
75 |
- proj = scde::bwpca(mat = t(expr),matw = t(weights),npcs = eval_pcs, seed = seed, em.maxiter = em.maxiter)$scores |
|
81 |
+ proj = bwpca(mat = t(expr),matw = t(weights),npcs = eval_pcs, seed = seed, em.maxiter = em.maxiter)$scores |
|
76 | 82 |
} |
77 | 83 |
}else{ |
78 | 84 |
eval_pcs = dim(proj)[2] |
... | ... |
@@ -83,7 +89,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, |
83 | 89 |
if( !is.null(eval_knn) ){ |
84 | 90 |
|
85 | 91 |
if( !is.null(bio) | !any(!is.na(bio)) ){ |
86 |
- KNN_BIO = mean(attributes(class::knn(train = proj[!is.na(bio),],test = proj[!is.na(bio),],cl = bio[!is.na(bio)], k = eval_knn,prob = TRUE))$prob) |
|
92 |
+ KNN_BIO = mean(attributes(knn(train = proj[!is.na(bio),],test = proj[!is.na(bio),],cl = bio[!is.na(bio)], k = eval_knn,prob = TRUE))$prob) |
|
87 | 93 |
}else{ |
88 | 94 |
KNN_BIO = NA |
89 | 95 |
} |
... | ... |
@@ -91,7 +97,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, |
91 | 97 |
|
92 | 98 |
# K-NN Batch |
93 | 99 |
if( !is.null(batch) | !any(!is.na(batch)) ){ |
94 |
- KNN_BATCH = mean(attributes(class::knn(train = proj[!is.na(batch),],test = proj[!is.na(batch),],cl = batch[!is.na(batch)], k = eval_knn,prob = TRUE))$prob) |
|
100 |
+ KNN_BATCH = mean(attributes(knn(train = proj[!is.na(batch),],test = proj[!is.na(batch),],cl = batch[!is.na(batch)], k = eval_knn,prob = TRUE))$prob) |
|
95 | 101 |
}else{ |
96 | 102 |
KNN_BATCH = NA |
97 | 103 |
} |
... | ... |
@@ -116,7 +122,7 @@ score_matrix <- function(expr, eval_pcs = 3, proj = NULL, |
116 | 122 |
is_cond = which(bio == cond) |
117 | 123 |
cond_w = length(is_cond) |
118 | 124 |
if(cond_w > max(eval_kclust)){ |
119 |
- pamk_object = fpc::pamk(proj[is_cond,],krange = eval_kclust) |
|
125 |
+ pamk_object = pamk(proj[is_cond,],krange = eval_kclust) |
|
120 | 126 |
PAM_SIL = PAM_SIL + cond_w*pamk_object$pamobject$silinfo$avg.width |
121 | 127 |
}else{ |
122 | 128 |
stop("Number of clusters 'k' must be smaller than bio class size") |
... | ... |
@@ -128,7 +134,7 @@ score_matri |