... | ... |
@@ -44,8 +44,9 @@ Imports: |
44 | 44 |
readxl, |
45 | 45 |
tools, |
46 | 46 |
Rcapture, |
47 |
- grDevices, |
|
48 |
- zip |
|
47 |
+ grDevices, |
|
48 |
+ forcats, |
|
49 |
+ glue |
|
49 | 50 |
Encoding: UTF-8 |
50 | 51 |
LazyData: false |
51 | 52 |
Roxygen: list(markdown = TRUE) |
... | ... |
@@ -71,7 +72,8 @@ Suggests: |
71 | 72 |
circlize, |
72 | 73 |
plotly, |
73 | 74 |
gtools, |
74 |
- eulerr |
|
75 |
+ eulerr, |
|
76 |
+ openxlsx |
|
75 | 77 |
VignetteBuilder: knitr |
76 | 78 |
RdMacros: |
77 | 79 |
lifecycle |
... | ... |
@@ -20,12 +20,15 @@ export(compute_abundance) |
20 | 20 |
export(compute_near_integrations) |
21 | 21 |
export(cumulative_count_union) |
22 | 22 |
export(cumulative_is) |
23 |
-export(date_columns_coll) |
|
24 | 23 |
export(date_formats) |
24 |
+export(default_af_transform) |
|
25 | 25 |
export(default_iss_file_prefixes) |
26 | 26 |
export(default_meta_agg) |
27 |
+export(default_rec_agg_lambdas) |
|
27 | 28 |
export(default_report_path) |
28 | 29 |
export(default_stats) |
30 |
+export(fisher_scatterplot) |
|
31 |
+export(gene_frequency_fisher) |
|
29 | 32 |
export(generate_Vispa2_launch_AF) |
30 | 33 |
export(generate_blank_association_file) |
31 | 34 |
export(generate_default_folder_structure) |
... | ... |
@@ -55,6 +58,7 @@ export(refGene_table_cols) |
55 | 58 |
export(remove_collisions) |
56 | 59 |
export(reset_af_columns_def) |
57 | 60 |
export(reset_annotation_IS_vars) |
61 |
+export(reset_dyn_vars_config) |
|
58 | 62 |
export(reset_iss_stats_specs) |
59 | 63 |
export(reset_mandatory_IS_vars) |
60 | 64 |
export(reset_matrix_file_suffixes) |
... | ... |
@@ -70,6 +74,7 @@ export(sharing_venn) |
70 | 74 |
export(threshold_filter) |
71 | 75 |
export(top_abund_tableGrob) |
72 | 76 |
export(top_integrations) |
77 |
+export(top_targeted_genes) |
|
73 | 78 |
export(transform_columns) |
74 | 79 |
export(unzip_file_system) |
75 | 80 |
import(ggplot2) |
... | ... |
@@ -87,39 +92,28 @@ importFrom(data.table,.N) |
87 | 92 |
importFrom(data.table,.SD) |
88 | 93 |
importFrom(data.table,`%chin%`) |
89 | 94 |
importFrom(data.table,fread) |
90 |
-importFrom(data.table,melt.data.table) |
|
91 |
-importFrom(data.table,rbindlist) |
|
92 | 95 |
importFrom(data.table,setDT) |
93 | 96 |
importFrom(data.table,setnames) |
94 | 97 |
importFrom(dplyr,across) |
95 | 98 |
importFrom(dplyr,all_of) |
96 |
-importFrom(dplyr,anti_join) |
|
97 | 99 |
importFrom(dplyr,arrange) |
98 | 100 |
importFrom(dplyr,bind_cols) |
99 | 101 |
importFrom(dplyr,bind_rows) |
100 |
-importFrom(dplyr,contains) |
|
101 |
-importFrom(dplyr,cur_column) |
|
102 | 102 |
importFrom(dplyr,desc) |
103 | 103 |
importFrom(dplyr,distinct) |
104 | 104 |
importFrom(dplyr,everything) |
105 | 105 |
importFrom(dplyr,filter) |
106 |
-importFrom(dplyr,first) |
|
107 |
-importFrom(dplyr,full_join) |
|
108 | 106 |
importFrom(dplyr,group_by) |
109 | 107 |
importFrom(dplyr,group_keys) |
110 | 108 |
importFrom(dplyr,group_modify) |
111 | 109 |
importFrom(dplyr,group_split) |
112 |
-importFrom(dplyr,if_else) |
|
113 |
-importFrom(dplyr,intersect) |
|
114 | 110 |
importFrom(dplyr,left_join) |
115 | 111 |
importFrom(dplyr,mutate) |
116 | 112 |
importFrom(dplyr,n) |
117 |
-importFrom(dplyr,n_distinct) |
|
118 | 113 |
importFrom(dplyr,pull) |
119 | 114 |
importFrom(dplyr,rename) |
120 | 115 |
importFrom(dplyr,rename_with) |
121 | 116 |
importFrom(dplyr,select) |
122 |
-importFrom(dplyr,semi_join) |
|
123 | 117 |
importFrom(dplyr,slice) |
124 | 118 |
importFrom(dplyr,slice_head) |
125 | 119 |
importFrom(dplyr,starts_with) |
... | ... |
@@ -129,10 +123,8 @@ importFrom(fs,as_fs_path) |
129 | 123 |
importFrom(fs,dir_create) |
130 | 124 |
importFrom(fs,dir_exists) |
131 | 125 |
importFrom(fs,dir_ls) |
132 |
-importFrom(fs,file_exists) |
|
133 | 126 |
importFrom(fs,is_dir) |
134 | 127 |
importFrom(fs,path) |
135 |
-importFrom(fs,path_dir) |
|
136 | 128 |
importFrom(fs,path_ext) |
137 | 129 |
importFrom(fs,path_ext_set) |
138 | 130 |
importFrom(fs,path_home) |
... | ... |
@@ -141,8 +133,6 @@ importFrom(ggplot2,aes) |
141 | 133 |
importFrom(ggplot2,aes_) |
142 | 134 |
importFrom(ggplot2,element_blank) |
143 | 135 |
importFrom(ggplot2,element_text) |
144 |
-importFrom(ggplot2,geom_hline) |
|
145 |
-importFrom(ggplot2,geom_point) |
|
146 | 136 |
importFrom(ggplot2,geom_raster) |
147 | 137 |
importFrom(ggplot2,geom_text) |
148 | 138 |
importFrom(ggplot2,ggplot) |
... | ... |
@@ -152,12 +142,8 @@ importFrom(ggplot2,rel) |
152 | 142 |
importFrom(ggplot2,scale_alpha_continuous) |
153 | 143 |
importFrom(ggplot2,scale_fill_gradientn) |
154 | 144 |
importFrom(ggplot2,scale_fill_viridis_d) |
155 |
-importFrom(ggplot2,scale_x_continuous) |
|
156 |
-importFrom(ggplot2,scale_y_continuous) |
|
157 | 145 |
importFrom(ggplot2,sym) |
158 | 146 |
importFrom(ggplot2,theme) |
159 |
-importFrom(ggplot2,unit) |
|
160 |
-importFrom(ggrepel,geom_label_repel) |
|
161 | 147 |
importFrom(lifecycle,deprecate_warn) |
162 | 148 |
importFrom(lubridate,today) |
163 | 149 |
importFrom(magrittr,`%>%`) |
... | ... |
@@ -166,8 +152,6 @@ importFrom(purrr,detect_index) |
166 | 152 |
importFrom(purrr,flatten) |
167 | 153 |
importFrom(purrr,flatten_chr) |
168 | 154 |
importFrom(purrr,is_empty) |
169 |
-importFrom(purrr,is_formula) |
|
170 |
-importFrom(purrr,is_function) |
|
171 | 155 |
importFrom(purrr,map) |
172 | 156 |
importFrom(purrr,map2) |
173 | 157 |
importFrom(purrr,map2_dfr) |
... | ... |
@@ -206,27 +190,19 @@ importFrom(rlang,list2) |
206 | 190 |
importFrom(rlang,parse_expr) |
207 | 191 |
importFrom(rlang,sym) |
208 | 192 |
importFrom(stats,dt) |
209 |
-importFrom(stats,na.omit) |
|
210 | 193 |
importFrom(stats,setNames) |
211 | 194 |
importFrom(stats,shapiro.test) |
212 | 195 |
importFrom(stringr,str_detect) |
213 |
-importFrom(stringr,str_pad) |
|
214 |
-importFrom(stringr,str_replace) |
|
215 | 196 |
importFrom(stringr,str_replace_all) |
216 | 197 |
importFrom(stringr,str_split) |
217 |
-importFrom(stringr,str_to_lower) |
|
218 | 198 |
importFrom(stringr,str_to_upper) |
219 | 199 |
importFrom(tibble,add_column) |
220 | 200 |
importFrom(tibble,as_tibble) |
221 | 201 |
importFrom(tibble,tibble) |
222 | 202 |
importFrom(tibble,tribble) |
223 | 203 |
importFrom(tidyr,nest) |
224 |
-importFrom(tidyr,pivot_longer) |
|
225 | 204 |
importFrom(tidyr,pivot_wider) |
226 |
-importFrom(tidyr,separate) |
|
227 | 205 |
importFrom(tidyr,unite) |
228 | 206 |
importFrom(tidyr,unnest) |
229 | 207 |
importFrom(tools,file_path_sans_ext) |
230 |
-importFrom(utils,read.csv) |
|
231 | 208 |
importFrom(utils,read.delim) |
232 |
-importFrom(zip,unzip) |
... | ... |
@@ -11,5 +11,9 @@ |
11 | 11 |
#' \code{\link{import_parallel_Vispa2Matrices}} |
12 | 12 |
#' * import_parallel_Vispa2Matrices_interactive: |
13 | 13 |
#' \code{\link{import_parallel_Vispa2Matrices}} |
14 |
+#' * unzip_file_system: |
|
15 |
+#' \code{\link{generate_default_folder_structure}} |
|
16 |
+#' * cumulative_count_union: |
|
17 |
+#' \code{\link{cumulative_is}} |
|
14 | 18 |
#' @keywords internal |
15 | 19 |
NULL |
... | ... |
@@ -3,7 +3,8 @@ |
3 | 3 |
#------------------------------------------------------------------------------# |
4 | 4 |
#' Performs aggregation on metadata contained in the association file. |
5 | 5 |
#' |
6 |
-#' \lifecycle{stable} |
|
6 |
+#' @description |
|
7 |
+#' `r lifecycle::badge("stable")` |
|
7 | 8 |
#' Groups metadata by the specified grouping keys and returns a |
8 | 9 |
#' summary of info for each group. For more details on how to use this function: |
9 | 10 |
#' \code{vignette("aggregate_function_usage", package = "ISAnalytics")} |
... | ... |
@@ -20,10 +21,8 @@ |
20 | 21 |
#' @param import_stats `r lifecycle::badge("deprecated")` The import |
21 | 22 |
#' of VISPA2 stats has been moved to its dedicated function, |
22 | 23 |
#' see \link{import_Vispa2_stats}. |
23 |
-#' @family Aggregate functions |
|
24 |
-#' @importFrom rlang abort inform |
|
25 |
-#' @importFrom purrr is_empty |
|
26 |
-#' @import lifecycle |
|
24 |
+#' |
|
25 |
+#' @family Data cleaning and pre-processing |
|
27 | 26 |
#' |
28 | 27 |
#' @return An aggregated data frame |
29 | 28 |
#' @export |
... | ... |
@@ -101,9 +100,8 @@ aggregate_metadata <- function(association_file, |
101 | 100 |
#' * `Output_colname`: a `glue` specification that will be used to determine |
102 | 101 |
#' a unique output column name. See \link[glue]{glue} for more details. |
103 | 102 |
#' |
104 |
-#' @importFrom tibble tribble |
|
105 | 103 |
#' @return A data frame |
106 |
-#' @family Aggregate functions |
|
104 |
+#' @family Data cleaning and pre-processing |
|
107 | 105 |
#' @export |
108 | 106 |
#' |
109 | 107 |
#' @examples |
... | ... |
@@ -146,7 +144,8 @@ default_meta_agg <- function() { |
146 | 144 |
|
147 | 145 |
#' Aggregates matrices values based on specified key. |
148 | 146 |
#' |
149 |
-#' \lifecycle{stable} |
|
147 |
+#' @description |
|
148 |
+#' `r lifecycle::badge("stable")` |
|
150 | 149 |
#' Performs aggregation on values contained in the integration matrices based |
151 | 150 |
#' on the key and the specified lambda. For more details on how to use this |
152 | 151 |
#' function: |
... | ... |
@@ -206,12 +205,13 @@ default_meta_agg <- function() { |
206 | 205 |
#' @param join_af_by A character vector representing the joining key |
207 | 206 |
#' between the matrix and the metadata. Useful to re-aggregate already |
208 | 207 |
#' aggregated matrices. |
209 |
-#' @family Aggregate functions |
|
208 |
+#' |
|
209 |
+#' @family Data cleaning and pre-processing |
|
210 | 210 |
#' |
211 | 211 |
#' @importFrom purrr walk set_names map_lgl |
212 | 212 |
#' @importFrom rlang expr eval_tidy abort |
213 | 213 |
#' |
214 |
-#' @return A list of tibbles or a single tibble aggregated according to |
|
214 |
+#' @return A list of data frames or a single data frame aggregated according to |
|
215 | 215 |
#' the specified arguments |
216 | 216 |
#' @export |
217 | 217 |
#' |
... | ... |
@@ -248,43 +248,46 @@ aggregate_values_by_key <- function(x, |
248 | 248 |
data.table::setDT(association_file) |
249 | 249 |
stopifnot(is.list(lambda) && !is.null(names(lambda))) |
250 | 250 |
join_key_err <- c("Join key not present in in both data frames", |
251 |
- x = paste("Fields specified in the argument", |
|
252 |
- "`join_af_by` must appear in both", |
|
253 |
- "the association file and the matrix(es)")) |
|
251 |
+ x = paste( |
|
252 |
+ "Fields specified in the argument", |
|
253 |
+ "`join_af_by` must appear in both", |
|
254 |
+ "the association file and the matrix(es)" |
|
255 |
+ ) |
|
256 |
+ ) |
|
254 | 257 |
check_single_matrix <- function(df) { |
255 |
- stopifnot(is.data.frame(df)) |
|
256 |
- is_numeric_col <- purrr::map_lgl( |
|
257 |
- value_cols, |
|
258 |
- ~ is.numeric(df[[.x]]) || |
|
259 |
- is.double(df[[.x]]) || |
|
260 |
- is.integer(df[[.x]]) |
|
261 |
- ) %>% purrr::set_names(value_cols) |
|
262 |
- if (any(!is_numeric_col)) { |
|
263 |
- rlang::abort(.non_num_user_cols_error( |
|
264 |
- names(is_numeric_col)[!is_numeric_col] |
|
265 |
- )) |
|
266 |
- } |
|
267 |
- if (!all(join_af_by %in% colnames(df))) { |
|
268 |
- rlang::abort(join_key_err, class = "join_key_err_agg") |
|
269 |
- } |
|
258 |
+ stopifnot(is.data.frame(df)) |
|
259 |
+ is_numeric_col <- purrr::map_lgl( |
|
260 |
+ value_cols, |
|
261 |
+ ~ is.numeric(df[[.x]]) || |
|
262 |
+ is.double(df[[.x]]) || |
|
263 |
+ is.integer(df[[.x]]) |
|
264 |
+ ) %>% purrr::set_names(value_cols) |
|
265 |
+ if (any(!is_numeric_col)) { |
|
266 |
+ rlang::abort(.non_num_user_cols_error( |
|
267 |
+ names(is_numeric_col)[!is_numeric_col] |
|
268 |
+ )) |
|
269 |
+ } |
|
270 |
+ if (!all(join_af_by %in% colnames(df))) { |
|
271 |
+ rlang::abort(join_key_err, class = "join_key_err_agg") |
|
272 |
+ } |
|
270 | 273 |
} |
271 | 274 |
if (!is.data.frame(x)) { |
272 |
- purrr::walk(x, check_single_matrix) |
|
275 |
+ purrr::walk(x, check_single_matrix) |
|
273 | 276 |
} else { |
274 |
- check_single_matrix(x) |
|
277 |
+ check_single_matrix(x) |
|
275 | 278 |
} |
276 |
- join_key_in_af <- all(join_af_by %in% colnames(association_file)) |
|
279 |
+ join_key_in_af <- all(join_af_by %in% colnames(association_file)) |
|
277 | 280 |
if (!join_key_in_af) { |
278 |
- rlang::abort(join_key_err, class = "join_key_err_agg") |
|
281 |
+ rlang::abort(join_key_err, class = "join_key_err_agg") |
|
279 | 282 |
} |
280 | 283 |
agg_matrix <- if (is.data.frame(x)) { |
281 | 284 |
.aggregate_lambda( |
282 | 285 |
x, association_file, key, value_cols, lambda, group, join_af_by |
283 | 286 |
) |
284 | 287 |
} else { |
285 |
- agg_matrix <- purrr::map(x, ~ .aggregate_lambda( |
|
286 |
- .x, association_file, key, value_cols, lambda, group, join_af_by |
|
287 |
- )) |
|
288 |
+ agg_matrix <- purrr::map(x, ~ .aggregate_lambda( |
|
289 |
+ .x, association_file, key, value_cols, lambda, group, join_af_by |
|
290 |
+ )) |
|
288 | 291 |
} |
289 | 292 |
return(agg_matrix) |
290 | 293 |
} |
... | ... |
@@ -4,7 +4,8 @@ |
4 | 4 |
|
5 | 5 |
#' Computes the abundance for every integration event in the input data frame. |
6 | 6 |
#' |
7 |
-#' \lifecycle{stable} |
|
7 |
+#' @description |
|
8 |
+#' `r lifecycle::badge("stable")` |
|
8 | 9 |
#' Abundance is obtained for every integration event by calculating the ratio |
9 | 10 |
#' between the single value and the total value for the given group. |
10 | 11 |
#' |
... | ... |
@@ -13,6 +14,11 @@ |
13 | 14 |
#' relative abundance column (and optionally a percentage abundance |
14 | 15 |
#' column) will be produced. |
15 | 16 |
#' |
17 |
+#' @section Required tags: |
|
18 |
+#' The function will explicitly check for the presence of these tags: |
|
19 |
+#' |
|
20 |
+#' * All columns declared in `mandatory_IS_vars()` |
|
21 |
+#' |
|
16 | 22 |
#' @param x An integration matrix - aka a data frame that includes |
17 | 23 |
#' the `mandatory_IS_vars()` as columns. The matrix can either be aggregated |
18 | 24 |
#' (via `aggregate_values_by_key()`) or not. |
... | ... |
@@ -29,12 +35,6 @@ |
29 | 35 |
#' |
30 | 36 |
#' @family Analysis functions |
31 | 37 |
#' |
32 |
-#' @importFrom magrittr `%>%` |
|
33 |
-#' @importFrom dplyr group_by across all_of summarise left_join mutate |
|
34 |
-#' @importFrom dplyr cur_column distinct select contains rename_with |
|
35 |
-#' @importFrom rlang .data eval_tidy parse_expr abort |
|
36 |
-#' @importFrom purrr map_lgl |
|
37 |
-#' @importFrom stringr str_replace |
|
38 | 38 |
#' @return Either a single data frame with computed abundance values or |
39 | 39 |
#' a list of 2 data frames (abundance_df, quant_totals) |
40 | 40 |
#' @export |
... | ... |
@@ -130,7 +130,7 @@ compute_abundance <- function(x, |
130 | 130 |
#' Filter data frames with custom predicates |
131 | 131 |
#' |
132 | 132 |
#' @description |
133 |
-#' \lifecycle{experimental} |
|
133 |
+#' `r lifecycle::badge("stable")` |
|
134 | 134 |
#' Filter a single data frame or a list of data frames with custom |
135 | 135 |
#' predicates assembled from the function parameters. |
136 | 136 |
#' |
... | ... |
@@ -242,7 +242,7 @@ compute_abundance <- function(x, |
242 | 242 |
#' character vectors. Must be one of the allowed values between |
243 | 243 |
#' `c("<", ">", "==", "!=", ">=", "<=")` |
244 | 244 |
#' |
245 |
-#' @family Analysis functions |
|
245 |
+#' @family Data cleaning and pre-processing |
|
246 | 246 |
#' |
247 | 247 |
#' @return A data frame or a list of data frames |
248 | 248 |
#' @export |
... | ... |
@@ -291,7 +291,8 @@ threshold_filter <- function(x, |
291 | 291 |
#' Sorts and keeps the top n integration sites based on the values |
292 | 292 |
#' in a given column. |
293 | 293 |
#' |
294 |
-#' \lifecycle{experimental} |
|
294 |
+#' @description |
|
295 |
+#' `r lifecycle::badge("stable")` |
|
295 | 296 |
#' The input data frame will be sorted by the highest values in |
296 | 297 |
#' the columns specified and the top n rows will be returned as output. |
297 | 298 |
#' The user can choose to keep additional columns in the output |
... | ... |
@@ -300,6 +301,11 @@ threshold_filter <- function(x, |
300 | 301 |
#' * `keep = "nothing"` only keeps the mandatory columns |
301 | 302 |
#' (`mandatory_IS_vars()`) plus the columns in the `columns` parameter. |
302 | 303 |
#' |
304 |
+#' @section Required tags: |
|
305 |
+#' The function will explicitly check for the presence of these tags: |
|
306 |
+#' |
|
307 |
+#' * All columns declared in `mandatory_IS_vars()` |
|
308 |
+#' |
|
303 | 309 |
#' @param x An integration matrix (data frame containing |
304 | 310 |
#' `mandatory_IS_vars()`) |
305 | 311 |
#' @param n How many integrations should be sliced (in total or |
... | ... |
@@ -316,8 +322,6 @@ threshold_filter <- function(x, |
316 | 322 |
#' |
317 | 323 |
#' @family Analysis functions |
318 | 324 |
#' |
319 |
-#' @importFrom magrittr `%>%` |
|
320 |
-#' @importFrom rlang abort |
|
321 | 325 |
#' |
322 | 326 |
#' @return Either a data frame with at most n rows or |
323 | 327 |
#' a data frames with at most n*(number of groups) rows. |
... | ... |
@@ -344,7 +348,7 @@ threshold_filter <- function(x, |
344 | 348 |
#' keep = "Value2", |
345 | 349 |
#' key = "CompleteAmplificationID" |
346 | 350 |
#' ) |
347 |
-#top_abundant_is |
|
351 |
+# top_abundant_is |
|
348 | 352 |
top_integrations <- function(x, |
349 | 353 |
n = 20, |
350 | 354 |
columns = "fragmentEstimate_sum_RelAbundance", |
... | ... |
@@ -408,6 +412,75 @@ top_integrations <- function(x, |
408 | 412 |
} |
409 | 413 |
|
410 | 414 |
|
415 |
+#' Top n targeted genes based on number of IS. |
|
416 |
+#' |
|
417 |
+#' @description |
|
418 |
+#' `r lifecycle::badge("experimental")` |
|
419 |
+#' Produces a summary of the number of integration events per gene, orders |
|
420 |
+#' the table in decreasing order and slices the first n rows - either on |
|
421 |
+#' all the data frame or by group. |
|
422 |
+#' |
|
423 |
+#' @details |
|
424 |
+#' ## Gene grouping |
|
425 |
+#' When producing a summary of IS by gene, there are different options that |
|
426 |
+#' can be chosen. |
|
427 |
+#' The argument `consider_chr` accounts for the fact that some genes (same |
|
428 |
+#' gene symbol) may span more than one chromosome: if set to `TRUE` |
|
429 |
+#' counts of IS will be separated for those genes that span 2 or more |
|
430 |
+#' chromosomes - in other words they will be in 2 different rows of the |
|
431 |
+#' output table. On the contrary, if the argument is set to `FALSE`, |
|
432 |
+#' counts will be produced in a single row. |
|
433 |
+#' |
|
434 |
+#' NOTE: the function counts **DISTINCT** integration events, which logically |
|
435 |
+#' corresponds to a union of sets. Be aware of the fact that counts per group |
|
436 |
+#' and counts with different arguments might be different: if for example |
|
437 |
+#' counts are performed by considering chromosome and there is one gene symbol |
|
438 |
+#' with 2 different counts, the sum of those 2 will likely not be equal to |
|
439 |
+#' the count obtained by performing the calculations without |
|
440 |
+#' considering the chromosome. |
|
441 |
+#' |
|
442 |
+#' The same reasoning can be applied for the argument `consider_gene_strand`, |
|
443 |
+#' that takes into account the strand of the gene. |
|
444 |
+#' |
|
445 |
+#' @section Required tags: |
|
446 |
+#' The function will explicitly check for the presence of these tags: |
|
447 |
+#' |
|
448 |
+#' ```{r echo=FALSE, results="asis"} |
|
449 |
+#' all_tags <- available_tags() |
|
450 |
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")), |
|
451 |
+#' ~ "top_targeted_genes" %in% .x)][["tag"]]) |
|
452 |
+#' cat(paste0("* ", needed, collapse="\n")) |
|
453 |
+#' ``` |
|
454 |
+#' |
|
455 |
+#' Note that the tags "gene_strand" and "chromosome" are explicitly required |
|
456 |
+#' only if `consider_chr = TRUE` and/or `consider_gene_strand = TRUE`. |
|
457 |
+#' |
|
458 |
+#' @param x An integration matrix - must be annotated |
|
459 |
+#' @param n Number of rows to slice |
|
460 |
+#' @param key If slice has to be performed for each group, the character |
|
461 |
+#' vector of column names that identify the groups. If `NULL` considers the |
|
462 |
+#' whole input data frame. |
|
463 |
+#' @param consider_chr Logical, should the chromosome be taken into account? |
|
464 |
+#' See details. |
|
465 |
+#' @param consider_gene_strand Logical, should the gene strand be taken into |
|
466 |
+#' account? See details. |
|
467 |
+#' @param as_df If computation is performed by group, `TRUE` returns all |
|
468 |
+#' groups merged in a single data frame with a column containing the group id. |
|
469 |
+#' If `FALSE` returns a named list. |
|
470 |
+#' |
|
471 |
+#' @importFrom rlang sym |
|
472 |
+#' |
|
473 |
+#' @return A data frame or a list of data frames |
|
474 |
+#' @export |
|
475 |
+#' @family Analysis functions |
|
476 |
+#' |
|
477 |
+#' @examples |
|
478 |
+#' data("integration_matrices", package = "ISAnalytics") |
|
479 |
+#' top_targ <- top_targeted_genes( |
|
480 |
+#' integration_matrices, |
|
481 |
+#' key = NULL |
|
482 |
+#' ) |
|
483 |
+#' top_targ |
|
411 | 484 |
top_targeted_genes <- function(x, |
412 | 485 |
n = 20, |
413 | 486 |
key = c( |
... | ... |
@@ -417,70 +490,134 @@ top_targeted_genes <- function(x, |
417 | 490 |
consider_chr = TRUE, |
418 | 491 |
consider_gene_strand = TRUE, |
419 | 492 |
as_df = TRUE) { |
420 |
- stopifnot(is.data.frame(x)) |
|
421 |
- data.table::setDT(x) |
|
422 |
- stopifnot(is.numeric(n) || is.integer(n)) |
|
423 |
- stopifnot(is.null(key) || is.character(key)) |
|
424 |
- stopifnot(is.logical(consider_chr)) |
|
425 |
- stopifnot(is.logical(consider_gene_strand)) |
|
493 |
+ stopifnot(is.data.frame(x)) |
|
494 |
+ data.table::setDT(x) |
|
495 |
+ stopifnot(is.numeric(n) || is.integer(n)) |
|
496 |
+ stopifnot(is.null(key) || is.character(key)) |
|
497 |
+ stopifnot(is.logical(consider_chr)) |
|
498 |
+ stopifnot(is.logical(consider_gene_strand)) |
|
426 | 499 |
|
427 |
- required_annot_tags <- c("gene_symbol") |
|
428 |
- if (consider_gene_strand) { |
|
429 |
- required_annot_tags <- c(required_annot_tags, "gene_strand") |
|
430 |
- } |
|
431 |
- annot_tag_cols <- .check_required_cols(required_annot_tags, |
|
432 |
- annotation_IS_vars(TRUE), |
|
433 |
- "error") |
|
434 |
- if (consider_chr) { |
|
435 |
- chr_tag_col <- .check_required_cols(c("chromosome", "locus"), |
|
436 |
- mandatory_IS_vars(TRUE), |
|
437 |
- "error") |
|
438 |
- annot_tag_cols <- annot_tag_cols %>% |
|
439 |
- dplyr::bind_rows(chr_tag_col) |
|
440 |
- } |
|
441 |
- data.table::setDT(annot_tag_cols) |
|
442 |
- cols_to_check <- c(annot_tag_cols$names, key) |
|
443 |
- if (!all(cols_to_check %in% colnames(x))) { |
|
444 |
- rlang::abort(.missing_needed_cols( |
|
445 |
- cols_to_check[!cols_to_check %in% colnames(x)])) |
|
446 |
- } |
|
500 |
+ required_annot_tags <- c("gene_symbol") |
|
501 |
+ if (consider_gene_strand) { |
|
502 |
+ required_annot_tags <- c(required_annot_tags, "gene_strand") |
|
503 |
+ } |
|
504 |
+ annot_tag_cols <- .check_required_cols( |
|
505 |
+ required_annot_tags, |
|
506 |
+ annotation_IS_vars(TRUE), |
|
507 |
+ "error" |
|
508 |
+ ) |
|
509 |
+ if (consider_chr) { |
|
510 |
+ chr_tag_col <- .check_required_cols( |
|
511 |
+ c("chromosome", "locus"), |
|
512 |
+ mandatory_IS_vars(TRUE), |
|
513 |
+ "error" |
|
514 |
+ ) |
|
515 |
+ annot_tag_cols <- annot_tag_cols %>% |
|
516 |
+ dplyr::bind_rows(chr_tag_col) |
|
517 |
+ } |
|
518 |
+ data.table::setDT(annot_tag_cols) |
|
519 |
+ cols_to_check <- c(annot_tag_cols$names, key) |
|
520 |
+ if (!all(cols_to_check %in% colnames(x))) { |
|
521 |
+ rlang::abort(.missing_needed_cols( |
|
522 |
+ cols_to_check[!cols_to_check %in% colnames(x)] |
|
523 |
+ )) |
|
524 |
+ } |
|
447 | 525 |
|
448 |
- df_with_is_counts <- if (is.null(key)) { |
|
449 |
- .count_distinct_is_per_gene( |
|
450 |
- x = x, include_chr = consider_chr, |
|
451 |
- include_gene_strand = consider_gene_strand, |
|
452 |
- gene_sym_col = annot_tag_cols[ |
|
453 |
- eval(sym("tag")) == "gene_symbol"][["names"]], |
|
454 |
- gene_strand_col = annot_tag_cols[ |
|
455 |
- eval(sym("tag")) == "gene_strand"][["names"]], |
|
456 |
- chr_col = annot_tag_cols[eval(sym("tag")) == "chromosome"][["names"]], |
|
457 |
- mand_vars_to_check = mandatory_IS_vars(TRUE) |
|
458 |
- ) %>% |
|
459 |
- dplyr::arrange(dplyr::desc(.data$n_IS)) %>% |
|
460 |
- dplyr::slice_head(n = n) |
|
461 |
- } else { |
|
462 |
- tmp <- x[, .count_distinct_is_per_gene( |
|
463 |
- x = .SD, include_chr = consider_chr, |
|
464 |
- include_gene_strand = consider_gene_strand, |
|
465 |
- gene_sym_col = annot_tag_cols[ |
|
466 |
- eval(sym("tag")) == "gene_symbol"][["names"]], |
|
467 |
- gene_strand_col = annot_tag_cols[ |
|
468 |
- eval(sym("tag")) == "gene_strand"][["names"]], |
|
469 |
- chr_col = annot_tag_cols[eval(sym("tag")) == "chromosome"][["names"]], |
|
470 |
- mand_vars_to_check = mandatory_IS_vars(TRUE) |
|
471 |
- ), by = eval(key)] |
|
472 |
- tmp[, |
|
473 |
- .SD %>% dplyr::arrange(dplyr::desc(.data$n_IS)) %>% |
|
474 |
- dplyr::slice_head(n = n), |
|
475 |
- by = eval(key)] |
|
476 |
- } |
|
477 |
- if (as_df) { |
|
478 |
- return(df_with_is_counts) |
|
479 |
- } |
|
480 |
- return(split(df_with_is_counts, by = key)) |
|
526 |
+ df_with_is_counts <- if (is.null(key)) { |
|
527 |
+ .count_distinct_is_per_gene( |
|
528 |
+ x = x, include_chr = consider_chr, |
|
529 |
+ include_gene_strand = consider_gene_strand, |
|
530 |
+ gene_sym_col = annot_tag_cols[ |
|
531 |
+ eval(sym("tag")) == "gene_symbol" |
|
532 |
+ ][["names"]], |
|
533 |
+ gene_strand_col = annot_tag_cols[ |
|
534 |
+ eval(sym("tag")) == "gene_strand" |
|
535 |
+ ][["names"]], |
|
536 |
+ chr_col = annot_tag_cols[eval(sym("tag")) == |
|
537 |
+ "chromosome"][["names"]], |
|
538 |
+ mand_vars_to_check = mandatory_IS_vars(TRUE) |
|
539 |
+ ) %>% |
|
540 |
+ dplyr::arrange(dplyr::desc(.data$n_IS)) %>% |
|
541 |
+ dplyr::slice_head(n = n) |
|
542 |
+ } else { |
|
543 |
+ tmp <- x[, .count_distinct_is_per_gene( |
|
544 |
+ x = .SD, include_chr = consider_chr, |
|
545 |
+ include_gene_strand = consider_gene_strand, |
|
546 |
+ gene_sym_col = annot_tag_cols[ |
|
547 |
+ eval(sym("tag")) == "gene_symbol" |
|
548 |
+ ][["names"]], |
|
549 |
+ gene_strand_col = annot_tag_cols[ |
|
550 |
+ eval(sym("tag")) == "gene_strand" |
|
551 |
+ ][["names"]], |
|
552 |
+ chr_col = annot_tag_cols[eval(sym("tag")) == |
|
553 |
+ "chromosome"][["names"]], |
|
554 |
+ mand_vars_to_check = mandatory_IS_vars(TRUE) |
|
555 |
+ ), by = eval(key)] |
|
556 |
+ tmp[, |
|
557 |
+ .SD %>% dplyr::arrange(dplyr::desc(.data$n_IS)) %>% |
|
558 |
+ dplyr::slice_head(n = n), |
|
559 |
+ by = eval(key) |
|
560 |
+ ] |
|
561 |
+ } |
|
562 |
+ if (as_df) { |
|
563 |
+ return(df_with_is_counts) |
|
564 |
+ } |
|
565 |
+ return(split(df_with_is_counts, by = key)) |
|
481 | 566 |
} |
482 | 567 |
|
483 | 568 |
|
569 |
+#' Compute Fisher's exact test on gene frequencies. |
|
570 |
+#' |
|
571 |
+#' @description |
|
572 |
+#' `r lifecycle::badge("experimental")` |
|
573 |
+#' Provided 2 data frames with calculations for CIS, via `CIS_grubbs()`, |
|
574 |
+#' computes Fisher's exact test. |
|
575 |
+#' Results can be plotted via `fisher_scatterplot()`. |
|
576 |
+#' |
|
577 |
+#' @param cis_x A data frame obtained via `CIS_grubbs()` |
|
578 |
+#' @param cis_y A data frame obtained via `CIS_grubbs()` |
|
579 |
+#' @param min_is_per_gene Used for pre-filtering purposes. Genes with a |
|
580 |
+#' number of distinct integration less than this number will be filtered out |
|
581 |
+#' prior calculations. Single numeric or integer. |
|
582 |
+#' @param gene_set_method One between "intersection" and "union". When merging |
|
583 |
+#' the 2 data frames, `intersection` will perform an inner join operation, |
|
584 |
+#' while `union` will perform a full join operation. |
|
585 |
+#' @param significance_threshold Significance threshold for the Fisher's |
|
586 |
+#' test p-value |
|
587 |
+#' @param remove_unbalanced_0 Remove from the final output those pairs in |
|
588 |
+#' which there are no IS for one group or the other and the number of |
|
589 |
+#' IS of the non-missing group are less than the mean number of IS for that |
|
590 |
+#' group |
|
591 |
+#' |
|
592 |
+#' @template genes_db |
|
593 |
+#' |
|
594 |
+#' @section Required tags: |
|
595 |
+#' The function will explicitly check for the presence of these tags: |
|
596 |
+#' |
|
597 |
+#' ```{r echo=FALSE, results="asis"} |
|
598 |
+#' all_tags <- available_tags() |
|
599 |
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")), |
|
600 |
+#' ~ "gene_frequency_fisher" %in% .x)][["tag"]]) |
|
601 |
+#' cat(paste0("* ", needed, collapse="\n")) |
|
602 |
+#' ``` |
|
603 |
+#' |
|
604 |
+#' @return A data frame |
|
605 |
+#' @export |
|
606 |
+#' @family Analysis functions |
|
607 |
+#' |
|
608 |
+#' @examples |
|
609 |
+#' data("integration_matrices", package = "ISAnalytics") |
|
610 |
+#' data("association_file", package = "ISAnalytics") |
|
611 |
+#' aggreg <- aggregate_values_by_key( |
|
612 |
+#' x = integration_matrices, |
|
613 |
+#' association_file = association_file, |
|
614 |
+#' value_cols = c("seqCount", "fragmentEstimate") |
|
615 |
+#' ) |
|
616 |
+#' cis <- CIS_grubbs(aggreg, by = "SubjectID") |
|
617 |
+#' fisher <- gene_frequency_fisher(cis$cis$PT001, cis$cis$PT002, |
|
618 |
+#' min_is_per_gene = 2 |
|
619 |
+#' ) |
|
620 |
+#' fisher |
|
484 | 621 |
gene_frequency_fisher <- function(cis_x, |
485 | 622 |
cis_y, |
486 | 623 |
min_is_per_gene = 3, |
... | ... |
@@ -506,142 +643,169 @@ gene_frequency_fisher <- function(cis_x, |
506 | 643 |
stopifnot(is.logical(remove_unbalanced_0)) |
507 | 644 |
## -- Fetch gene symbol column |
508 | 645 |
gene_sym_col <- .check_required_cols( |
509 |
- "gene_symbol", annotation_IS_vars(TRUE), duplicate_politic = "error" |
|
646 |
+ "gene_symbol", annotation_IS_vars(TRUE), |
|
647 |
+ duplicate_politic = "error" |
|
510 | 648 |
)[["names"]] |
511 |
- req_cis_cols <- c(gene_sym_col, "n_IS_perGene", "average_TxLen", |
|
512 |
- "raw_gene_integration_frequency") |
|
649 |
+ req_cis_cols <- c( |
|
650 |
+ gene_sym_col, "n_IS_perGene", "average_TxLen", |
|
651 |
+ "raw_gene_integration_frequency" |
|
652 |
+ ) |
|
513 | 653 |
quiet_expand <- purrr::quietly(.expand_cis_df) |
514 |
- cols_for_join <- c(gene_sym_col, |
|
515 |
- "Onco1_TS2", "ClinicalRelevance", "DOIReference", |
|
516 |
- "KnownGeneClass", "KnownClonalExpansion", |
|
517 |
- "CriticalForInsMut") |
|
654 |
+ cols_for_join <- c( |
|
655 |
+ gene_sym_col, |
|
656 |
+ "Onco1_TS2", "ClinicalRelevance", "DOIReference", |
|
657 |
+ "KnownGeneClass", "KnownClonalExpansion", |
|
658 |
+ "CriticalForInsMut" |
|
659 |
+ ) |
|
518 | 660 |
## --- Calculations to perform on each df |
519 | 661 |
append_calc <- function(df, group_n) { |
520 |
- if (!all(req_cis_cols %in% colnames(df))) { |
|
521 |
- rlang::abort( |
|
522 |
- .missing_needed_cols(req_cis_cols[!req_cis_cols %in% colnames(df)])) |
|
523 |
- } |
|
524 |
- modified <- quiet_expand(df, gene_sym_col, |
|
525 |
- onco_db_file, tumor_suppressors_db_file, |
|
526 |
- species, known_onco, suspicious_genes)$result |
|
527 |
- modified <- modified %>% |
|
528 |
- dplyr::mutate( |
|
529 |
- IS_per_kbGeneLen = .data$raw_gene_integration_frequency * 1000, |
|
530 |
- Sum_IS_per_kbGeneLen = sum(.data$IS_per_kbGeneLen, na.rm = TRUE), |
|
531 |
- IS_per_kbGeneLen_perMDepth_TPM = (.data$IS_per_kbGeneLen / |
|
532 |
- .data$Sum_IS_per_kbGeneLen) * 1e6 |
|
533 |
- ) %>% |
|
534 |
- dplyr::filter(.data$n_IS_perGene >= min_is_per_gene) %>% |
|
535 |
- dplyr::select(dplyr::all_of(c(req_cis_cols, cols_for_join, |
|
536 |
- "IS_per_kbGeneLen", |
|
537 |
- "Sum_IS_per_kbGeneLen", |
|
538 |
- "IS_per_kbGeneLen_perMDepth_TPM"))) |
|
539 |
- colnames(modified)[!colnames(modified) %in% cols_for_join] <- paste( |
|
540 |
- colnames(modified)[!colnames(modified) %in% cols_for_join], group_n, |
|
541 |
- sep = "_" |
|
542 |
- ) |
|
543 |
- return(modified) |
|
662 |
+ if (!all(req_cis_cols %in% colnames(df))) { |
|
663 |
+ rlang::abort( |
|
664 |
+ .missing_needed_cols(req_cis_cols[!req_cis_cols %in% |
|
665 |
+ colnames(df)]) |
|
666 |
+ ) |
|
667 |
+ } |
|
668 |
+ modified <- quiet_expand( |
|
669 |
+ df, gene_sym_col, |
|
670 |
+ onco_db_file, tumor_suppressors_db_file, |
|
671 |
+ species, known_onco, suspicious_genes |
|
672 |
+ )$result |
|
673 |
+ modified <- modified %>% |
|
674 |
+ dplyr::mutate( |
|
675 |
+ IS_per_kbGeneLen = .data$raw_gene_integration_frequency * 1000, |
|
676 |
+ Sum_IS_per_kbGeneLen = sum(.data$IS_per_kbGeneLen, |
|
677 |
+ na.rm = TRUE |
|
678 |
+ ), |
|
679 |
+ IS_per_kbGeneLen_perMDepth_TPM = (.data$IS_per_kbGeneLen / |
|
680 |
+ .data$Sum_IS_per_kbGeneLen) * 1e6 |
|
681 |
+ ) %>% |
|
682 |
+ dplyr::filter(.data$n_IS_perGene >= min_is_per_gene) %>% |
|
683 |
+ dplyr::select(dplyr::all_of(c( |
|
684 |
+ req_cis_cols, cols_for_join, |
|
685 |
+ "IS_per_kbGeneLen", |
|
686 |
+ "Sum_IS_per_kbGeneLen", |
|
687 |
+ "IS_per_kbGeneLen_perMDepth_TPM" |
|
688 |
+ ))) |
|
689 |
+ colnames(modified)[!colnames(modified) %in% cols_for_join] <- paste( |
|
690 |
+ colnames(modified)[!colnames(modified) %in% cols_for_join], group_n, |
|
691 |
+ sep = "_" |
|
692 |
+ ) |
|
693 |
+ return(modified) |
|
544 | 694 |
} |
545 |
- cis_mod <- purrr::map2(list(cis_x, cis_y), c(1,2), append_calc) |
|
695 |
+ cis_mod <- purrr::map2(list(cis_x, cis_y), c(1, 2), append_calc) |
|
546 | 696 |
## --- Merge the two in 1 df |
547 | 697 |
merged <- if (gene_set_method == "union") { |
548 |
- purrr::reduce(cis_mod, ~ dplyr::full_join(.x, .y, by = cols_for_join)) |
|
698 |
+ purrr::reduce(cis_mod, ~ dplyr::full_join(.x, .y, by = cols_for_join)) |
|
549 | 699 |
} else { |
550 |
- purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join)) |
|
700 |
+ purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join)) |
|
551 | 701 |
} |
552 | 702 |
if (nrow(merged) == 0) { |
553 |
- if (getOption("ISAnalytics.verbose") == TRUE) { |
|
554 |
- msg <- c("Data frame empty after filtering", |
|
555 |
- i = paste("Data frame is empty after applying filter on IS,", |
|
556 |
- "is your filter too stringent?"), |
|
557 |
- x = "Nothing to return") |
|
558 |
- rlang::inform(msg, class = "empty_df_gene_freq") |
|
559 |
- } |
|
560 |
- return(NULL) |
|
703 |
+ if (getOption("ISAnalytics.verbose") == TRUE) { |
|
704 |
+ msg <- c("Data frame empty after filtering", |
|
705 |
+ i = paste( |
|
706 |
+ "Data frame is empty after applying filter on IS,", |
|
707 |
+ "is your filter too stringent?" |
|
708 |
+ ), |
|
709 |
+ x = "Nothing to return" |
|
710 |
+ ) |
|
711 |
+ rlang::inform(msg, class = "empty_df_gene_freq") |
|
712 |
+ } |
|
713 |
+ return(NULL) |
|
561 | 714 |
} |
562 | 715 |
## --- Actual computation of fisher test: test is applied on each row |
563 | 716 |
## (each gene) |
564 | 717 |
merged <- merged %>% |
565 |
- dplyr::mutate( |
|
566 |
- tot_n_IS_perGene_1 = sum(cis_x$n_IS_perGene, na.rm = TRUE), |
|
567 |
- tot_n_IS_perGene_2 = sum(cis_y$n_IS_perGene, na.rm = TRUE) |
|
568 |
- ) |
|
718 |
+ dplyr::mutate( |
|
719 |
+ tot_n_IS_perGene_1 = sum(cis_x$n_IS_perGene, na.rm = TRUE), |
|
720 |
+ tot_n_IS_perGene_2 = sum(cis_y$n_IS_perGene, na.rm = TRUE) |
|
721 |
+ ) |
|
569 | 722 |
compute_fisher <- function(...) { |
570 |
- row <- list(...) |
|
571 |
- n_IS_perGene_1 <- row$n_IS_perGene_1 |
|
572 |
- n_IS_perGene_2 <- row$n_IS_perGene_2 |
|
573 |
- n_IS_perGene_1[which(is.na(n_IS_perGene_1))] <- 0 |
|
574 |
- n_IS_perGene_2[which(is.na(n_IS_perGene_2))] <- 0 |
|
575 |
- matrix <- matrix( |
|
576 |
- data = c(n_IS_perGene_1, |
|
577 |
- row$tot_n_IS_perGene_1 - n_IS_perGene_1, |
|
578 |
- n_IS_perGene_2, |
|
579 |
- row$tot_n_IS_perGene_2 - n_IS_perGene_2), |
|
580 |
- nrow = 2, |
|
581 |
- dimnames = list(G1 = c("IS_of_gene", "TotalIS"), |
|
582 |
- G2 = c("IS_of_gene", "TotalIS")) |
|
583 |
- ) |
|
584 |
- ft <- stats::fisher.test(matrix) |
|
585 |
- return(ft$p.value) |
|
723 |
+ row <- list(...) |
|
724 |
+ n_IS_perGene_1 <- row$n_IS_perGene_1 |
|
725 |
+ n_IS_perGene_2 <- row$n_IS_perGene_2 |
|
726 |
+ n_IS_perGene_1[which(is.na(n_IS_perGene_1))] <- 0 |
|
727 |
+ n_IS_perGene_2[which(is.na(n_IS_perGene_2))] <- 0 |
|
728 |
+ matrix <- matrix( |
|
729 |
+ data = c( |
|
730 |
+ n_IS_perGene_1, |
|
731 |
+ row$tot_n_IS_perGene_1 - n_IS_perGene_1, |
|
732 |
+ n_IS_perGene_2, |
|
733 |
+ row$tot_n_IS_perGene_2 - n_IS_perGene_2 |
|
734 |
+ ), |
|
735 |
+ nrow = 2, |
|
736 |
+ dimnames = list( |
|
737 |
+ G1 = c("IS_of_gene", "TotalIS"), |
|
738 |
+ G2 = c("IS_of_gene", "TotalIS") |
|
739 |
+ ) |
|
740 |
+ ) |
|
741 |
+ ft <- stats::fisher.test(matrix) |
|
742 |
+ return(ft$p.value) |
|
586 | 743 |
} |
587 | 744 |
merged <- merged %>% |
588 |
- dplyr::mutate( |
|
589 |
- Fisher_p_value = purrr::pmap_dbl(., compute_fisher) |
|
590 |
- ) %>% |
|
591 |
- dplyr::mutate( |
|
592 |
- Fisher_p_value_significant = dplyr::if_else( |
|
593 |
- condition = .data$Fisher_p_value < significance_threshold, |
|
594 |
- true = TRUE, false = FALSE |
|
745 |
+ dplyr::mutate( |
|
746 |
+ Fisher_p_value = purrr::pmap_dbl(., compute_fisher) |
|
747 |
+ ) %>% |
|
748 |
+ dplyr::mutate( |
|
749 |
+ Fisher_p_value_significant = dplyr::if_else( |
|
750 |
+ condition = .data$Fisher_p_value < significance_threshold, |
|
751 |
+ true = TRUE, false = FALSE |
|
752 |
+ ) |
|
595 | 753 |
) |
596 |
- ) |
|
597 | 754 |
## --- Removing unbalanced 0s if requested - this scenario applies |
598 | 755 |
## only if "union" is selected as method for join |
599 | 756 |
if (remove_unbalanced_0) { |
600 |
- mean_is_per_gene_1 <- ceiling(mean(merged$n_IS_perGene_1, na.rm = TRUE)) |
|
601 |
- mean_is_per_gene_2 <- ceiling(mean(merged$n_IS_perGene_2, na.rm = TRUE)) |
|
602 |
- test_exclude <- function(...) { |
|
603 |
- row <- list(...) |
|
604 |
- if (is.na(row$n_IS_perGene_1) || is.na(row$n_IS_perGene_2)) { |
|
605 |
- to_ex <- ifelse( |
|
606 |
- test = ((row$n_IS_perGene_1 < mean_is_per_gene_1) & |
|
607 |
- (is.na(row$n_IS_perGene_2))) | |
|
608 |
- ((is.na(row$n_IS_perGene_1)) & |
|
609 |
- (row$n_IS_perGene_2 < mean_is_per_gene_2)), |
|
610 |
- yes = TRUE, |
|
611 |
- no = FALSE |
|
612 |
- ) |
|
613 |
- return(to_ex) |
|
757 |
+ mean_is_per_gene_1 <- ceiling(mean(merged$n_IS_perGene_1, na.rm = TRUE)) |
|
758 |
+ mean_is_per_gene_2 <- ceiling(mean(merged$n_IS_perGene_2, na.rm = TRUE)) |
|
759 |
+ test_exclude <- function(...) { |
|
760 |
+ row <- list(...) |
|
761 |
+ if (is.na(row$n_IS_perGene_1) || is.na(row$n_IS_perGene_2)) { |
|
762 |
+ to_ex <- ifelse( |
|
763 |
+ test = ((row$n_IS_perGene_1 < mean_is_per_gene_1) & |
|
764 |
+ (is.na(row$n_IS_perGene_2))) | |
|
765 |
+ ((is.na(row$n_IS_perGene_1)) & |
|
766 |
+ (row$n_IS_perGene_2 < mean_is_per_gene_2)), |
|
767 |
+ yes = TRUE, |
|
768 |
+ no = FALSE |
|
769 |
+ ) |
|
770 |
+ return(to_ex) |
|
771 |
+ } |
|
772 |
+ return(FALSE) |
|
614 | 773 |
} |
615 |
- return(FALSE) |
|
616 |
- } |
|
617 |
- merged <- merged %>% |
|
618 |
- dplyr::mutate( |
|
619 |
- to_exclude_from_test = purrr::pmap(., test_exclude) |
|
620 |
- ) %>% |
|
621 |
- dplyr::filter(.data$to_exclude_from_test == FALSE) %>% |
|
622 |
- dplyr::select(-.data$to_exclude_from_test) |
|
623 |
- if (nrow(merged) == 0) { |
|
624 |
- if (getOption("ISAnalytics.verbose") == TRUE) { |
|
625 |
- msg <- c("Data frame empty after filtering", |
|
626 |
- i = paste("Data frame is after removing unbalanced IS,", |
|
627 |
- "nothing to return")) |
|
628 |
- rlang::inform(msg, class = "empty_df_gene_freq_unbal") |
|
774 |
+ merged <- merged %>% |
|
775 |
+ dplyr::mutate( |
|
776 |
+ to_exclude_from_test = purrr::pmap(., test_exclude) |
|
777 |
+ ) %>% |
|
778 |
+ dplyr::filter(.data$to_exclude_from_test == FALSE) %>% |
|
779 |
+ dplyr::select(-.data$to_exclude_from_test) |
|
780 |
+ if (nrow(merged) == 0) { |
|
781 |
+ if (getOption("ISAnalytics.verbose") == TRUE) { |
|
782 |
+ msg <- c("Data frame empty after filtering", |
|
783 |
+ i = paste( |
|
784 |
+ "Data frame is after removing unbalanced IS,", |
|
785 |
+ "nothing to return" |
|
786 |
+ ) |
|
787 |
+ ) |
|
788 |
+ rlang::inform(msg, class = "empty_df_gene_freq_unbal") |
|
789 |
+ } |
|
790 |
+ return(NULL) |
|
629 | 791 |
} |
630 |
- return(NULL) |
|
631 |
- } |
|
632 | 792 |
} |
633 | 793 |
## --- Apply statistical corrections to p-value |
634 | 794 |
merged <- merged %>% |
635 |
- dplyr::mutate( |
|
636 |
- Fisher_p_value_fdr = p.adjust(.data$Fisher_p_value, method = "fdr", |
|
637 |
- n = length(.data$Fisher_p_value)), |
|
638 |
- Fisher_p_value_benjamini = p.adjust(.data$Fisher_p_value, method = "BY", |
|
639 |
- n = length(.data$Fisher_p_value)), |
|
640 |
- minus_log10_pvalue = -log(.data$Fisher_p_value, base = 10) |
|
641 |
- ) %>% |
|
642 |
- dplyr::mutate( |
|
643 |
- minus_log10_pvalue_fdr = -log(.data$Fisher_p_value_fdr, base = 10), |
|
644 |
- ) |
|
795 |
+ dplyr::mutate( |
|
796 |
+ Fisher_p_value_fdr = stats::p.adjust(.data$Fisher_p_value, |
|
797 |
+ method = "fdr", |
|
798 |
+ n = length(.data$Fisher_p_value) |
|
799 |
+ ), |
|
800 |
+ Fisher_p_value_benjamini = stats::p.adjust(.data$Fisher_p_value, |
|
801 |
+ method = "BY", |
|
802 |
+ n = length(.data$Fisher_p_value) |
|
803 |
+ ), |
|
804 |
+ minus_log10_pvalue = -log(.data$Fisher_p_value, base = 10) |
|
805 |
+ ) %>% |
|
806 |
+ dplyr::mutate( |
|
807 |
+ minus_log10_pvalue_fdr = -log(.data$Fisher_p_value_fdr, base = 10), |
|
808 |
+ ) |
|
645 | 809 |
return(merged) |
646 | 810 |
} |
647 | 811 |
|
... | ... |
@@ -650,7 +814,7 @@ gene_frequency_fisher <- function(cis_x, |
650 | 814 |
#' the metadata data frame accordingly. |
651 | 815 |
#' |
652 | 816 |
#' @description |
653 |
-#' \lifecycle{experimental} |
|
817 |
+#' `r lifecycle::badge("stable")` |
|
654 | 818 |
#' The function operates on a data frame by grouping the content by |
655 | 819 |
#' the sample key and computing every function specified on every |
656 | 820 |
#' column in the `value_columns` parameter. After that the metadata |
... | ... |
@@ -686,14 +850,17 @@ gene_frequency_fisher <- function(cis_x, |
686 | 850 |
#' @param functions A named list of function or purrr-style lambdas |
687 | 851 |
#' @param add_integrations_count Add the count of distinct integration sites |
688 | 852 |
#' for each group? Can be computed only if `x` contains the mandatory columns |
689 |
-#' `chr`, `integration_locus`, `strand` |
|
853 |
+#' `mandatory_IS_vars()` |
|
854 |
+#' |
|
855 |
+#' @section Required tags: |
|
856 |
+#' The function will explicitly check for the presence of these tags: |
|
857 |
+#' |
|
858 |
+#' * All columns declared in `mandatory_IS_vars()` |
|
859 |
+#' |
|
860 |
+#' These are checked only if `add_integrations_count = TRUE`. |
|
690 | 861 |
#' |
691 | 862 |
#' @family Analysis functions |
692 |
-#' @importFrom rlang eval_tidy expr abort .data sym inform |
|
693 |
-#' @importFrom purrr is_function is_formula map_lgl walk map set_names |
|
694 |
-#' @importFrom dplyr group_by across all_of summarise rename_with bind_cols |
|
695 |
-#' @importFrom dplyr n_distinct left_join |
|
696 |
-#' @importFrom magrittr `%>%` |
|
863 |
+#' @importFrom rlang .data sym |
|
697 | 864 |
#' |
698 | 865 |
#' @return A list with modified x and metadata data frames |
699 | 866 |
#' @export |
... | ... |
@@ -797,7 +964,8 @@ sample_statistics <- function(x, |
797 | 964 |
|
798 | 965 |
#' Grubbs test for Common Insertion Sites (CIS). |
799 | 966 |
#' |
800 |
-#' \lifecycle{stable} |
|
967 |
+#' @description |
|
968 |
+#' `r lifecycle::badge("stable")` |
|
801 | 969 |
#' Statistical approach for the validation of common insertion sites |
802 | 970 |
#' significance based on the comparison of the integration frequency |
803 | 971 |
#' at the CIS gene with respect to other genes contained in the |
... | ... |
@@ -824,6 +992,16 @@ sample_statistics <- function(x, |
824 | 992 |
#' |
825 | 993 |
#' `r refGene_table_cols()` |
826 | 994 |
#' |
995 |
+#' @section Required tags: |
|
996 |
+#' The function will explicitly check for the presence of these tags: |
|
997 |
+#' |
|
998 |
+#' ```{r echo=FALSE, results="asis"} |
|
999 |
+#' all_tags <- available_tags() |
|
1000 |
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")), |
|
1001 |
+#' ~ "CIS_grubbs" %in% .x)][["tag"]]) |
|
1002 |
+#' cat(paste0("* ", needed, collapse="\n")) |
|
1003 |
+#' ``` |
|
1004 |
+#' |
|
827 | 1005 |
#' @param x An integration matrix, must include the `mandatory_IS_vars()` |
828 | 1006 |
#' columns and the `annotation_IS_vars()` columns |
829 | 1007 |
#' @param genomic_annotation_file Database file for gene annotation, |
... | ... |
@@ -834,17 +1012,16 @@ sample_statistics <- function(x, |
834 | 1012 |
#' NULL, the function will perform calculations for each group and return |
835 | 1013 |
#' a list of data frames with the results. E.g. for `by = "SubjectID"`, |
836 | 1014 |
#' CIS will be computed for each distinct SubjectID found in the table |
837 |
-#' (of course, "SubjectID" column must be included in the input data frame). |
|
1015 |
+#' ("SubjectID" column must be included in the input data frame). |
|
1016 |
+#' @param return_missing_as_df Returns those genes present in the input df |
|
1017 |
+#' but not in the refgenes as a data frame? |
|
1018 |
+#' @param results_as_list Relevant only if `by` is not `NULL` - if `TRUE` |
|
1019 |
+#' return the group computations as a named list, otherwise return a single |
|
1020 |
+#' df with an additional column containing the group id |
|
838 | 1021 |
#' |
839 | 1022 |
#' @family Analysis functions |
840 | 1023 |
#' |
841 |
-#' @importFrom tibble as_tibble |
|
842 |
-#' @importFrom rlang .data abort current_env eval_tidy sym |
|
843 |
-#' @importFrom magrittr `%>%` |
|
844 |
-#' @importFrom utils read.csv |
|
845 |
-#' @importFrom stringr str_replace_all |
|
846 |
-#' @importFrom tidyr unite |
|
847 |
-#' @importFrom purrr set_names map |
|
1024 |
+#' @importFrom rlang .data sym |
|
848 | 1025 |
#' |
849 | 1026 |
#' @return A data frame |
850 | 1027 |
#' @export |
... | ... |
@@ -852,7 +1029,7 @@ sample_statistics <- function(x, |
852 | 1029 |
#' @examples |
853 | 1030 |
#' data("integration_matrices", package = "ISAnalytics") |
854 | 1031 |
#' cis <- CIS_grubbs(integration_matrices) |
855 |
-#' head(cis) |
|
1032 |
+#' cis |
|
856 | 1033 |
CIS_grubbs <- function(x, |
857 | 1034 |
genomic_annotation_file = "hg19", |
858 | 1035 |
grubbs_flanking_gene_bp = 100000, |
... | ... |
@@ -996,7 +1173,7 @@ CIS_grubbs <- function(x, |
996 | 1173 |
"a mismatch in the annotation phase of", |
997 | 1174 |
"the matrix. Here is a summary: " |
998 | 1175 |
), |
999 |
- paste0(capture.output({ |
|
1176 |
+ paste0(utils::capture.output({ |
|
1000 | 1177 |
print(missing_df, n = Inf) |
1001 | 1178 |
}), collapse = "\n"), |
1002 | 1179 |
sep = "\n" |
... | ... |
@@ -1030,36 +1207,11 @@ CIS_grubbs <- function(x, |
1030 | 1207 |
|
1031 | 1208 |
#' Integrations cumulative count in time by sample |
1032 | 1209 |
#' |
1033 |
-#' \lifecycle{experimental} |
|
1034 |
-#' This function computes the cumulative number of integrations |
|
1035 |
-#' observed in each sample at different time points by assuming that |
|
1036 |
-#' if an integration is observed at time point "t" then it is also observed in |
|
1037 |
-#' time point "t+1". |
|
1210 |
+#' @description |
|
1211 |
+#' `r lifecycle::badge("deprecated")` |
|
1212 |
+#' This function was deprecated in favour of a single function, |
|
1213 |
+#' please use `cumulative_is` instead. |
|
1038 | 1214 |
#' |
1039 |
-#' @details |
|
1040 |
-#' ## Input data frame |
|
1041 |
-#' The user can provide as input for the `x` parameter both a simple |
|
1042 |
-#' integration matrix AND setting the `aggregate` parameter to TRUE, |
|
1043 |
-#' or provide an already aggregated matrix via |
|
1044 |
-#' \link{aggregate_values_by_key}. |
|
1045 |
-#' If the user supplies a matrix to be aggregated the `association_file` |
|
1046 |
-#' parameter must not be NULL: aggregation will be done by an internal |
|
1047 |
-#' call to the aggregation function. |
|
1048 |
-#' If the user supplies an already aggregated matrix, the `key` parameter |
|
1049 |
-#' is the key used for aggregation - |
|
1050 |
-#' **NOTE: for this operation is mandatory |
|
1051 |
-#' that the time point column is included in the key.** |
|
1052 |
-#' ## Assumptions on time point format |
|
1053 |
-#' By using the functions provided by this package, when imported, |
|
1054 |
-#' an association file will be correctly formatted for future usage. |
|
1055 |
-#' In the formatting process there is also a padding operation performed on |
|
1056 |
-#' time points: this means the functions expects the time point column to |
|
1057 |
-#' be of type character and to be correctly padded with 0s. If the |
|
1058 |
-#' chosen column for time point is detected as numeric the function will |
|
1059 |
-#' attempt the conversion to character and automatic padding. |
|
1060 |
-#' If you choose to import the association file not using the |
|
1061 |
-#' \link{import_association_file} function, be sure to check the format of |
|
1062 |
-#' the chosen column to avoid undesired results. |
|
1063 | 1215 |
#' |
1064 | 1216 |
#' @param x A simple integration matrix or an aggregated matrix (see details) |
1065 | 1217 |
#' @param association_file NULL or the association file for x if `aggregate` |
... | ... |
@@ -1071,20 +1223,9 @@ CIS_grubbs <- function(x, |
1071 | 1223 |
#' @param aggregate Should x be aggregated? |
1072 | 1224 |
#' @param ... Additional parameters to pass to `aggregate_values_by_key` |
1073 | 1225 |
#' |
1074 |
-#' @family Analysis functions |
|
1075 |
-#' |
|
1076 |
-#' @importFrom dplyr mutate filter across all_of select summarise group_by |
|
1077 |
-#' @importFrom dplyr arrange group_split first full_join starts_with distinct |
|
1078 |
-#' @importFrom dplyr semi_join n rename |
|
1079 |
-#' @importFrom magrittr `%>%` |
|
1080 |
-#' @importFrom rlang .data abort inform `:=` |
|
1081 |
-#' @importFrom stringr str_pad |
|
1082 |
-#' @importFrom purrr reduce is_empty |
|
1083 |
-#' @importFrom tidyr pivot_longer |
|
1084 |
-#' @importFrom stats na.omit |
|
1085 |
-#' |
|
1086 | 1226 |
#' @return A data frame |
1087 | 1227 |
#' @export |
1228 |
+#' @keywords internal |
|
1088 | 1229 |
#' |
1089 | 1230 |
#' @examples |
1090 | 1231 |
#' data("integration_matrices", package = "ISAnalytics") |
... | ... |
@@ -1114,7 +1255,8 @@ cumulative_count_union <- function(x, |
1114 | 1255 |
what = "cumulative_count_union()", |
1115 | 1256 |
with = "cumulative_is()", |
1116 | 1257 |
details = c(paste( |
1117 |
- "Use option `counts = TRUE`. Function will be likely dropped in the", |
|
1258 |
+ "Use option `counts = TRUE`.", |
|
1259 |
+ "Function will be likely dropped in the", |
|
1118 | 1260 |
"next release cycle" |
1119 | 1261 |
)) |
1120 | 1262 |
) |
... | ... |
@@ -1124,9 +1266,10 @@ cumulative_count_union <- function(x, |
1124 | 1266 |
) |
1125 | 1267 |
} |
1126 | 1268 |
|
1127 |
-#' Expands integration matrix with the cumulative is union over time. |
|
1269 |
+#' Expands integration matrix with the cumulative IS union over time. |
|
1128 | 1270 |
#' |
1129 |
-#' @description \lifecycle{experimental} |
|
1271 |
+#' @description |
|
1272 |
+#' `r lifecycle::badge("experimental")` |
|
1130 | 1273 |
#' Given an input integration matrix that can be grouped over time, |
1131 | 1274 |
#' this function adds integrations in groups assuming that |
1132 | 1275 |
#' if an integration is observed at time point "t" then it is also observed in |
... | ... |
@@ -1141,6 +1284,14 @@ cumulative_count_union <- function(x, |
1141 | 1284 |
#' @param expand If `FALSE`, for each group, the set of integration sites is |
1142 | 1285 |
#' returned in a separate column as a nested table, otherwise the resulting |
1143 | 1286 |
#' column is unnested. |
1287 |
+#' @param counts Add cumulative counts? Logical |
|
1288 |
+#' |
|
1289 |
+#' @section Required tags: |
|
1290 |
+#' The function will explicitly check for the presence of these tags: |
|
1291 |
+#' |
|
1292 |
+#' * All columns declared in `mandatory_IS_vars()` |
|
1293 |
+#' * Checks if the matrix is annotated by assessing presence of |
|
1294 |
+#' `annotation_IS_vars()` |
|
1144 | 1295 |
#' |
1145 | 1296 |
#' @family Analysis functions |
1146 | 1297 |
#' @return A data frame |
... | ... |
@@ -1210,12 +1361,12 @@ cumulative_is <- function(x, |
1210 | 1361 |
.keep_all = TRUE |
1211 | 1362 |
) |
1212 | 1363 |
data.table::setDT(temp) |
1213 |
- temp <- temp[, .(is = list(.SD)), by = key] |
|
1364 |
+ temp <- temp[, list(is = list(.SD)), by = key] |
|
1214 | 1365 |
no_tp_key <- key[key != timepoint_col] |
1215 | 1366 |
split <- split(temp, by = no_tp_key) |
1216 | 1367 |
cumulate <- purrr::map(split, function(x) { |
1217 | 1368 |
x[, cumulative_is := purrr::accumulate( |
1218 |
- is, |
|
1369 |
+ get("is"), |
|
1219 | 1370 |
~ data.table::funion(.x, .y) |
1220 | 1371 |
)] |
1221 | 1372 |
}) |
... | ... |
@@ -1246,13 +1397,14 @@ cumulative_is <- function(x, |
1246 | 1397 |
|
1247 | 1398 |
#' Sharing of integration sites between given groups. |
1248 | 1399 |
#' |
1249 |
-#' \lifecycle{experimental} |
|
1400 |
+#' @description |
|
1401 |
+#' `r lifecycle::badge("stable")` |
|
1250 | 1402 |
#' Computes the amount of integration sites shared between the groups identified |
1251 | 1403 |
#' in the input data. |
1252 | 1404 |
#' |
1253 | 1405 |
#' @details |
1254 |
-#' An integration site is always identified by the triple |
|
1255 |
-#' `(chr, integration_locus, strand)`, thus these columns must be present |
|
1406 |
+#' An integration site is always identified by the combination of fields in |
|
1407 |
+#' `mandatory_IS_vars()`, thus these columns must be present |
|
1256 | 1408 |
#' in the input(s). |
1257 | 1409 |
#' |
1258 | 1410 |
#' The function accepts multiple inputs for different scenarios, please refer |
... | ... |
@@ -1270,6 +1422,11 @@ cumulative_is <- function(x, |
1270 | 1422 |
#' function \code{\link{sharing_heatmap}} or via the function |
1271 | 1423 |
#' \code{\link{sharing_venn}} |
1272 | 1424 |
#' |
1425 |
+#' @section Required tags: |
|
1426 |
+#' The function will explicitly check for the presence of these tags: |
|
1427 |
+#' |
|
1428 |
+#' * All columns declared in `mandatory_IS_vars()` |
|
1429 |
+#' |
|
1273 | 1430 |
#' @param ... One or more integration matrices |
1274 | 1431 |
#' @param group_key Character vector of column names which identify a |
1275 | 1432 |
#' single group. An associated group id will be derived by concatenating |
... | ... |
@@ -1522,7 +1679,8 @@ is_sharing <- function(..., |
1522 | 1679 |
|
1523 | 1680 |
#' Find the source of IS by evaluating sharing. |
1524 | 1681 |
#' |
1525 |
-#' @description \lifecycle{experimental} |
|
1682 |
+#' @description |
|
1683 |
+#' `r lifecycle::badge("stable")` |
|
1526 | 1684 |
#' The function computes the sharing between a reference group of interest |
1527 | 1685 |
#' for each time point and a selection of groups of interest. In this way |
1528 | 1686 |
#' it is possible to observe the percentage of shared integration sites between |
... | ... |
@@ -4,46 +4,51 @@ |
4 | 4 |
|
5 | 5 |
#' Identifies and removes collisions. |
6 | 6 |
#' |
7 |
-#' \lifecycle{stable} |
|
8 |
-#' A collision is an integration (aka a unique combination of |
|
9 |
-#' `chr`, `integration_locus` and `strand`) which is observed in more than one |
|
10 |
-#' independent sample (a unique pair of `ProjectID` and `SubjectID`). |
|
11 |
-#' The function tries to decide to which subject an integration |
|
12 |
-#' should be assigned to and, if no |
|
7 |
+#' @description |
|
8 |
+#' `r lifecycle::badge("stable")` |
|
9 |
+#' A collision is an integration (aka a unique combination of the provided |
|
10 |
+#' `mandatory_IS_vars()`) which is observed in more than one |
|
11 |
+#' independent sample. |
|
12 |
+#' The function tries to decide to which independent sample should |
|
13 |
+#' an integration event be assigned to, and if no |
|
13 | 14 |
#' decision can be taken, the integration is completely removed from the data |
14 | 15 |
#' frame. |
15 | 16 |
#' For more details refer to the vignette "Collision removal functionality": |
16 | 17 |
#' \code{vignette("collision_removal", package = "ISAnalytics")} |
17 | 18 |
#' |
18 |
-#' @param x Either a multi-quantification matrix or a |
|
19 |
+#' @param x Either a multi-quantification matrix (recommended) or a |
|
19 | 20 |
#' named list of matrices (names must be quantification types) |
20 | 21 |
#' @param association_file The association file imported via |
21 | 22 |
#' `import_association_file()` |
23 |
+#' @param independent_sample_id A character vector of column names that |
|
24 |
+#' identify independent samples |
|
22 | 25 |
#' @param date_col The date column that should be considered. |
23 |
-#' Must be one value in `date_columns_coll()` |
|
24 | 26 |
#' @param reads_ratio A single numeric value that represents the ratio that has |
25 | 27 |
#' to be considered when deciding between `seqCount` value. |
26 | 28 |
#' @param quant_cols A named character vector where names are |
27 | 29 |
#' quantification types and |
28 | 30 |
#' values are the names of the corresponding columns. The quantification |
29 | 31 |
#' `seqCount` MUST be included in the vector. |
30 |
-#' @param report_path The path where the report file should be saved. |
|
31 |
-#' Can be a folder, a file or NULL if no report should be produced. |
|
32 |
-#' Defaults to `{user_home}/ISAnalytics_reports`. |
|
33 | 32 |
#' @param max_workers Maximum number of parallel workers to distribute the |
34 | 33 |
#' workload. If `NULL` (default) produces the maximum amount of workers allowed, |
35 | 34 |
#' a numeric value is requested otherwise. WARNING: a higher number of workers |
36 | 35 |
#' speeds up computation at the cost of memory consumption! Tune this parameter |
37 | 36 |
#' accordingly. |
38 | 37 |
#' |
39 |
-#' @family Collision removal |
|
40 |
-#' @importFrom magrittr `%>%` |
|
41 |
-#' @importFrom rlang inform abort exec .data |
|
42 |
-#' @importFrom purrr map2 |
|
43 |
-#' @importFrom dplyr bind_rows select all_of group_by summarise across n |
|
44 |
-#' @importFrom dplyr distinct |
|
38 |
+#' @section Required tags: |
|
39 |
+#' The function will explicitly check for the presence of these tags: |
|
45 | 40 |
#' |
46 |
-#' @seealso \code{\link{date_columns_coll}} |
|
41 |
+#' ```{r echo=FALSE, results="asis"} |
|
42 |
+#' all_tags <- available_tags() |
|
43 |
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")), |
|
44 |
+#' ~ "remove_collisions" %in% .x)][["tag"]]) |
|
45 |
+#' cat(paste0("* ", needed, collapse="\n")) |
|
46 |
+#' ``` |
|
47 |
+#' |
|
48 |
+#' @template report_path_param |
|
49 |
+#' |
|
50 |
+#' @family Data cleaning and pre-processing |
|
51 |
+#' @importFrom rlang .data sym |
|
47 | 52 |
#' |
48 | 53 |
#' @return Either a multi-quantification matrix or a list of data frames |
49 | 54 |
#' @export |
... | ... |
@@ -191,32 +196,32 @@ remove_collisions <- function(x, |
191 | 196 |
if (getOption("ISAnalytics.reports") == TRUE & !is.null(report_path)) { |
192 | 197 |
post_joined <- association_file[final_matr, on = pcr_col] |
193 | 198 |
post_joined <- post_joined[, mget(c( |
194 |
- colnames(final_matr), date_col, |
|
195 |
- replicate_n_col, independent_sample_id, pool_col |
|
199 |
+ colnames(final_matr), date_col, |
|
200 |
+ replicate_n_col, independent_sample_id, pool_col |
|
196 | 201 |
))] |
197 | 202 |
summaries <- .collisions_obtain_report_summaries( |
198 |
- x = x, association_file = association_file, |
|
199 |
- quant_cols = quant_cols, missing_ind = missing_ind, |
|
200 |
- pcr_col = pcr_col, pre_process = pre_process, |
|
201 |
- collisions = split_df$collisions, removed = removed, |
|
202 |
- reassigned = reassigned, |
|
203 |
- joined = joined, |
|
204 |
- final_matr = final_matr, |
|
205 |
- post_joined = post_joined, |
|
206 |
- pool_col = pool_col, replicate_n_col = replicate_n_col, |
|
207 |
- independent_sample_id = independent_sample_id, |
|
208 |
- seq_count_col = seq_count_col |
|
203 |
+ x = x, association_file = association_file, |
|
204 |
+ quant_cols = quant_cols, missing_ind = missing_ind, |
|
205 |
+ pcr_col = pcr_col, pre_process = pre_process, |
|
206 |
+ collisions = split_df$collisions, removed = removed, |
|
207 |
+ reassigned = reassigned, |
|
208 |
+ joined = joined, |
|
209 |
+ final_matr = final_matr, |
|
210 |
+ post_joined = post_joined, |
|
211 |
+ pool_col = pool_col, replicate_n_col = replicate_n_col, |
|
212 |
+ independent_sample_id = independent_sample_id, |
|
213 |
+ seq_count_col = seq_count_col |
|
209 | 214 |
) |
210 | 215 |
sharing_heatmaps <- .collisions_obtain_sharing_heatmaps( |
211 |
- joined = joined, independent_sample_id = independent_sample_id, |
|
212 |
- post_joined = post_joined, report_path = report_path |
|
216 |
+ joined = joined, independent_sample_id = independent_sample_id, |
|
217 |
+ post_joined = post_joined, report_path = report_path |
|
213 | 218 |
) |
214 | 219 |
report_params <- append(summaries, sharing_heatmaps) |
215 | 220 |
report_params[["additional_info"]] <- add_samples |
216 | 221 |
report_params[["sample_key"]] <- independent_sample_id |
217 | 222 |
report_params[["dynamic_cols"]] <- list( |
218 |
- pcr_id = pcr_col, |
|
219 |
- pool = pool_col |
|
223 |
+ pcr_id = pcr_col, |
|
224 |
+ pool = pool_col |
|
220 | 225 |
) |
221 | 226 |
withCallingHandlers( |
222 | 227 |
{ |
... | ... |
@@ -248,7 +253,8 @@ remove_collisions <- function(x, |
248 | 253 |
#' Re-aligns matrices of other quantification types based on the processed |
249 | 254 |
#' sequence count matrix. |
250 | 255 |
#' |
251 |
-#' \lifecycle{stable} |
|
256 |
+#' @description |
|
257 |
+#' `r lifecycle::badge("stable")` |
|
252 | 258 |
#' This function should be used to keep data consistent among the same analysis: |
253 | 259 |
#' if for some reason you removed the collisions by passing only the sequence |
254 | 260 |
#' count matrix to `remove_collisions()`, you should call this |
... | ... |
@@ -264,10 +270,9 @@ remove_collisions <- function(x, |
264 | 270 |
#' @param other_matrices A named list of matrices to re-align. Names in the list |
265 | 271 |
#' must be quantification types (\code{quantification_types()}) except |
266 | 272 |
#' "seqCount". |
267 |
-#' @importFrom dplyr semi_join |
|
268 |
-#' @importFrom purrr map_lgl |
|
269 |
-#' @importFrom magrittr `%>%` |
|
270 |
-#' @family Collision removal |
|
273 |
+#' @param sample_column The name of the column containing the sample identifier |
|
274 |
+#' |
|
275 |
+#' @family Data cleaning and pre-processing |
|
271 | 276 |
#' @seealso \code{\link{remove_collisions}} |
272 | 277 |
#' |
273 | 278 |
#' @return A named list with re-aligned matrices |
... | ... |
@@ -291,8 +296,8 @@ remove_collisions <- function(x, |
291 | 296 |
#' ) |
292 | 297 |
#' realigned |
293 | 298 |
realign_after_collisions <- function(sc_matrix, |
294 |
- other_matrices, |
|
295 |
- sample_column = pcr_id_column()) { |
|
299 |
+ other_matrices, |
|
300 |
+ sample_column = pcr_id_column()) { |
|
296 | 301 |
stopifnot(is.list(other_matrices) & !is.null(names(other_matrices))) |
297 | 302 |
stopifnot(all(names(other_matrices) %in% quantification_types())) |
298 | 303 |
stopifnot(is.character(sample_column)) |
... | ... |
@@ -301,8 +306,10 @@ realign_after_collisions <- function(sc_matrix, |
301 | 306 |
if (!all(all_ISm)) { |
302 | 307 |
rlang::abort(.non_ISM_error()) |
303 | 308 |
} |
304 |
- all_campid <- purrr::map_lgl(other_matrices, |
|
305 |
- ~ sample_column %in% colnames(.x)) |
|
309 |
+ all_campid <- purrr::map_lgl( |
|
310 |
+ other_matrices, |
|
311 |
+ ~ sample_column %in% colnames(.x) |
|