Browse code

[UPDATE] Final partial update for 1.5.4 - finished documentation, ready to merge

Giulia Pais authored on 20/04/2022 15:58:57
Showing 113 changed files

... ...
@@ -11,3 +11,4 @@
11 11
 ^doc$
12 12
 ^Design$
13 13
 ^sample_reports$
14
+^man-roxygen$
... ...
@@ -44,8 +44,9 @@ Imports:
44 44
     readxl,
45 45
     tools,
46 46
     Rcapture,
47
-    grDevices, 
48
-    zip
47
+    grDevices,
48
+    forcats,
49
+    glue
49 50
 Encoding: UTF-8
50 51
 LazyData: false
51 52
 Roxygen: list(markdown = TRUE)
... ...
@@ -71,7 +72,8 @@ Suggests:
71 72
     circlize,
72 73
     plotly,
73 74
     gtools,
74
-    eulerr
75
+    eulerr,
76
+    openxlsx
75 77
 VignetteBuilder: knitr
76 78
 RdMacros: 
77 79
     lifecycle
... ...
@@ -20,12 +20,15 @@ export(compute_abundance)
20 20
 export(compute_near_integrations)
21 21
 export(cumulative_count_union)
22 22
 export(cumulative_is)
23
-export(date_columns_coll)
24 23
 export(date_formats)
24
+export(default_af_transform)
25 25
 export(default_iss_file_prefixes)
26 26
 export(default_meta_agg)
27
+export(default_rec_agg_lambdas)
27 28
 export(default_report_path)
28 29
 export(default_stats)
30
+export(fisher_scatterplot)
31
+export(gene_frequency_fisher)
29 32
 export(generate_Vispa2_launch_AF)
30 33
 export(generate_blank_association_file)
31 34
 export(generate_default_folder_structure)
... ...
@@ -55,6 +58,7 @@ export(refGene_table_cols)
55 58
 export(remove_collisions)
56 59
 export(reset_af_columns_def)
57 60
 export(reset_annotation_IS_vars)
61
+export(reset_dyn_vars_config)
58 62
 export(reset_iss_stats_specs)
59 63
 export(reset_mandatory_IS_vars)
60 64
 export(reset_matrix_file_suffixes)
... ...
@@ -70,6 +74,7 @@ export(sharing_venn)
70 74
 export(threshold_filter)
71 75
 export(top_abund_tableGrob)
72 76
 export(top_integrations)
77
+export(top_targeted_genes)
73 78
 export(transform_columns)
74 79
 export(unzip_file_system)
75 80
 import(ggplot2)
... ...
@@ -87,39 +92,28 @@ importFrom(data.table,.N)
87 92
 importFrom(data.table,.SD)
88 93
 importFrom(data.table,`%chin%`)
89 94
 importFrom(data.table,fread)
90
-importFrom(data.table,melt.data.table)
91
-importFrom(data.table,rbindlist)
92 95
 importFrom(data.table,setDT)
93 96
 importFrom(data.table,setnames)
94 97
 importFrom(dplyr,across)
95 98
 importFrom(dplyr,all_of)
96
-importFrom(dplyr,anti_join)
97 99
 importFrom(dplyr,arrange)
98 100
 importFrom(dplyr,bind_cols)
99 101
 importFrom(dplyr,bind_rows)
100
-importFrom(dplyr,contains)
101
-importFrom(dplyr,cur_column)
102 102
 importFrom(dplyr,desc)
103 103
 importFrom(dplyr,distinct)
104 104
 importFrom(dplyr,everything)
105 105
 importFrom(dplyr,filter)
106
-importFrom(dplyr,first)
107
-importFrom(dplyr,full_join)
108 106
 importFrom(dplyr,group_by)
109 107
 importFrom(dplyr,group_keys)
110 108
 importFrom(dplyr,group_modify)
111 109
 importFrom(dplyr,group_split)
112
-importFrom(dplyr,if_else)
113
-importFrom(dplyr,intersect)
114 110
 importFrom(dplyr,left_join)
115 111
 importFrom(dplyr,mutate)
116 112
 importFrom(dplyr,n)
117
-importFrom(dplyr,n_distinct)
118 113
 importFrom(dplyr,pull)
119 114
 importFrom(dplyr,rename)
120 115
 importFrom(dplyr,rename_with)
121 116
 importFrom(dplyr,select)
122
-importFrom(dplyr,semi_join)
123 117
 importFrom(dplyr,slice)
124 118
 importFrom(dplyr,slice_head)
125 119
 importFrom(dplyr,starts_with)
... ...
@@ -129,10 +123,8 @@ importFrom(fs,as_fs_path)
129 123
 importFrom(fs,dir_create)
130 124
 importFrom(fs,dir_exists)
131 125
 importFrom(fs,dir_ls)
132
-importFrom(fs,file_exists)
133 126
 importFrom(fs,is_dir)
134 127
 importFrom(fs,path)
135
-importFrom(fs,path_dir)
136 128
 importFrom(fs,path_ext)
137 129
 importFrom(fs,path_ext_set)
138 130
 importFrom(fs,path_home)
... ...
@@ -141,8 +133,6 @@ importFrom(ggplot2,aes)
141 133
 importFrom(ggplot2,aes_)
142 134
 importFrom(ggplot2,element_blank)
143 135
 importFrom(ggplot2,element_text)
144
-importFrom(ggplot2,geom_hline)
145
-importFrom(ggplot2,geom_point)
146 136
 importFrom(ggplot2,geom_raster)
147 137
 importFrom(ggplot2,geom_text)
148 138
 importFrom(ggplot2,ggplot)
... ...
@@ -152,12 +142,8 @@ importFrom(ggplot2,rel)
152 142
 importFrom(ggplot2,scale_alpha_continuous)
153 143
 importFrom(ggplot2,scale_fill_gradientn)
154 144
 importFrom(ggplot2,scale_fill_viridis_d)
155
-importFrom(ggplot2,scale_x_continuous)
156
-importFrom(ggplot2,scale_y_continuous)
157 145
 importFrom(ggplot2,sym)
158 146
 importFrom(ggplot2,theme)
159
-importFrom(ggplot2,unit)
160
-importFrom(ggrepel,geom_label_repel)
161 147
 importFrom(lifecycle,deprecate_warn)
162 148
 importFrom(lubridate,today)
163 149
 importFrom(magrittr,`%>%`)
... ...
@@ -166,8 +152,6 @@ importFrom(purrr,detect_index)
166 152
 importFrom(purrr,flatten)
167 153
 importFrom(purrr,flatten_chr)
168 154
 importFrom(purrr,is_empty)
169
-importFrom(purrr,is_formula)
170
-importFrom(purrr,is_function)
171 155
 importFrom(purrr,map)
172 156
 importFrom(purrr,map2)
173 157
 importFrom(purrr,map2_dfr)
... ...
@@ -206,27 +190,19 @@ importFrom(rlang,list2)
206 190
 importFrom(rlang,parse_expr)
207 191
 importFrom(rlang,sym)
208 192
 importFrom(stats,dt)
209
-importFrom(stats,na.omit)
210 193
 importFrom(stats,setNames)
211 194
 importFrom(stats,shapiro.test)
212 195
 importFrom(stringr,str_detect)
213
-importFrom(stringr,str_pad)
214
-importFrom(stringr,str_replace)
215 196
 importFrom(stringr,str_replace_all)
216 197
 importFrom(stringr,str_split)
217
-importFrom(stringr,str_to_lower)
218 198
 importFrom(stringr,str_to_upper)
219 199
 importFrom(tibble,add_column)
220 200
 importFrom(tibble,as_tibble)
221 201
 importFrom(tibble,tibble)
222 202
 importFrom(tibble,tribble)
223 203
 importFrom(tidyr,nest)
224
-importFrom(tidyr,pivot_longer)
225 204
 importFrom(tidyr,pivot_wider)
226
-importFrom(tidyr,separate)
227 205
 importFrom(tidyr,unite)
228 206
 importFrom(tidyr,unnest)
229 207
 importFrom(tools,file_path_sans_ext)
230
-importFrom(utils,read.csv)
231 208
 importFrom(utils,read.delim)
232
-importFrom(zip,unzip)
... ...
@@ -11,5 +11,9 @@
11 11
 #'  \code{\link{import_parallel_Vispa2Matrices}}
12 12
 #'  * import_parallel_Vispa2Matrices_interactive:
13 13
 #'  \code{\link{import_parallel_Vispa2Matrices}}
14
+#'  * unzip_file_system:
15
+#'  \code{\link{generate_default_folder_structure}}
16
+#'  * cumulative_count_union:
17
+#'  \code{\link{cumulative_is}}
14 18
 #' @keywords internal
15 19
 NULL
... ...
@@ -3,7 +3,8 @@
3 3
 #------------------------------------------------------------------------------#
4 4
 #' Performs aggregation on metadata contained in the association file.
5 5
 #'
6
-#' \lifecycle{stable}
6
+#' @description
7
+#' `r lifecycle::badge("stable")`
7 8
 #' Groups metadata by the specified grouping keys and returns a
8 9
 #' summary of info for each group. For more details on how to use this function:
9 10
 #' \code{vignette("aggregate_function_usage", package = "ISAnalytics")}
... ...
@@ -20,10 +21,8 @@
20 21
 #' @param import_stats `r lifecycle::badge("deprecated")` The import
21 22
 #' of VISPA2 stats has been moved to its dedicated function,
22 23
 #' see \link{import_Vispa2_stats}.
23
-#' @family Aggregate functions
24
-#' @importFrom rlang abort inform
25
-#' @importFrom purrr is_empty
26
-#' @import lifecycle
24
+#'
25
+#' @family Data cleaning and pre-processing
27 26
 #'
28 27
 #' @return An aggregated data frame
29 28
 #' @export
... ...
@@ -101,9 +100,8 @@ aggregate_metadata <- function(association_file,
101 100
 #' * `Output_colname`: a `glue` specification that will be used to determine
102 101
 #' a unique output column name. See \link[glue]{glue} for more details.
103 102
 #'
104
-#' @importFrom tibble tribble
105 103
 #' @return A data frame
106
-#' @family Aggregate functions
104
+#' @family Data cleaning and pre-processing
107 105
 #' @export
108 106
 #'
109 107
 #' @examples
... ...
@@ -146,7 +144,8 @@ default_meta_agg <- function() {
146 144
 
147 145
 #' Aggregates matrices values based on specified key.
148 146
 #'
149
-#' \lifecycle{stable}
147
+#' @description
148
+#' `r lifecycle::badge("stable")`
150 149
 #' Performs aggregation on values contained in the integration matrices based
151 150
 #' on the key and the specified lambda. For more details on how to use this
152 151
 #' function:
... ...
@@ -206,12 +205,13 @@ default_meta_agg <- function() {
206 205
 #' @param join_af_by A character vector representing the joining key
207 206
 #' between the matrix and the metadata. Useful to re-aggregate already
208 207
 #' aggregated matrices.
209
-#' @family Aggregate functions
208
+#'
209
+#' @family Data cleaning and pre-processing
210 210
 #'
211 211
 #' @importFrom purrr walk set_names map_lgl
212 212
 #' @importFrom rlang expr eval_tidy abort
213 213
 #'
214
-#' @return A list of tibbles or a single tibble aggregated according to
214
+#' @return A list of data frames or a single data frame aggregated according to
215 215
 #' the specified arguments
216 216
 #' @export
217 217
 #'
... ...
@@ -248,43 +248,46 @@ aggregate_values_by_key <- function(x,
248 248
     data.table::setDT(association_file)
249 249
     stopifnot(is.list(lambda) && !is.null(names(lambda)))
250 250
     join_key_err <- c("Join key not present in in both data frames",
251
-                      x = paste("Fields specified in the argument",
252
-                                "`join_af_by` must appear in both",
253
-                                "the association file and the matrix(es)"))
251
+        x = paste(
252
+            "Fields specified in the argument",
253
+            "`join_af_by` must appear in both",
254
+            "the association file and the matrix(es)"
255
+        )
256
+    )
254 257
     check_single_matrix <- function(df) {
255
-      stopifnot(is.data.frame(df))
256
-      is_numeric_col <- purrr::map_lgl(
257
-        value_cols,
258
-        ~ is.numeric(df[[.x]]) ||
259
-          is.double(df[[.x]]) ||
260
-          is.integer(df[[.x]])
261
-      ) %>% purrr::set_names(value_cols)
262
-      if (any(!is_numeric_col)) {
263
-        rlang::abort(.non_num_user_cols_error(
264
-          names(is_numeric_col)[!is_numeric_col]
265
-        ))
266
-      }
267
-      if (!all(join_af_by %in% colnames(df))) {
268
-        rlang::abort(join_key_err, class = "join_key_err_agg")
269
-      }
258
+        stopifnot(is.data.frame(df))
259
+        is_numeric_col <- purrr::map_lgl(
260
+            value_cols,
261
+            ~ is.numeric(df[[.x]]) ||
262
+                is.double(df[[.x]]) ||
263
+                is.integer(df[[.x]])
264
+        ) %>% purrr::set_names(value_cols)
265
+        if (any(!is_numeric_col)) {
266
+            rlang::abort(.non_num_user_cols_error(
267
+                names(is_numeric_col)[!is_numeric_col]
268
+            ))
269
+        }
270
+        if (!all(join_af_by %in% colnames(df))) {
271
+            rlang::abort(join_key_err, class = "join_key_err_agg")
272
+        }
270 273
     }
271 274
     if (!is.data.frame(x)) {
272
-      purrr::walk(x, check_single_matrix)
275
+        purrr::walk(x, check_single_matrix)
273 276
     } else {
274
-      check_single_matrix(x)
277
+        check_single_matrix(x)
275 278
     }
276
-    join_key_in_af  <- all(join_af_by %in% colnames(association_file))
279
+    join_key_in_af <- all(join_af_by %in% colnames(association_file))
277 280
     if (!join_key_in_af) {
278
-      rlang::abort(join_key_err, class = "join_key_err_agg")
281
+        rlang::abort(join_key_err, class = "join_key_err_agg")
279 282
     }
280 283
     agg_matrix <- if (is.data.frame(x)) {
281 284
         .aggregate_lambda(
282 285
             x, association_file, key, value_cols, lambda, group, join_af_by
283 286
         )
284 287
     } else {
285
-      agg_matrix <- purrr::map(x, ~ .aggregate_lambda(
286
-        .x, association_file, key, value_cols, lambda, group, join_af_by
287
-      ))
288
+        agg_matrix <- purrr::map(x, ~ .aggregate_lambda(
289
+            .x, association_file, key, value_cols, lambda, group, join_af_by
290
+        ))
288 291
     }
289 292
     return(agg_matrix)
290 293
 }
... ...
@@ -4,7 +4,8 @@
4 4
 
5 5
 #' Computes the abundance for every integration event in the input data frame.
6 6
 #'
7
-#' \lifecycle{stable}
7
+#' @description
8
+#' `r lifecycle::badge("stable")`
8 9
 #' Abundance is obtained for every integration event by calculating the ratio
9 10
 #' between the single value and the total value for the given group.
10 11
 #'
... ...
@@ -13,6 +14,11 @@
13 14
 #' relative abundance column (and optionally a percentage abundance
14 15
 #' column) will be produced.
15 16
 #'
17
+#' @section Required tags:
18
+#' The function will explicitly check for the presence of these tags:
19
+#'
20
+#' * All columns declared in `mandatory_IS_vars()`
21
+#'
16 22
 #' @param x An integration matrix - aka a data frame that includes
17 23
 #' the `mandatory_IS_vars()` as columns. The matrix can either be aggregated
18 24
 #' (via `aggregate_values_by_key()`) or not.
... ...
@@ -29,12 +35,6 @@
29 35
 #'
30 36
 #' @family Analysis functions
31 37
 #'
32
-#' @importFrom magrittr `%>%`
33
-#' @importFrom dplyr group_by across all_of summarise left_join mutate
34
-#' @importFrom dplyr cur_column distinct select contains rename_with
35
-#' @importFrom rlang .data eval_tidy parse_expr abort
36
-#' @importFrom purrr map_lgl
37
-#' @importFrom stringr str_replace
38 38
 #' @return Either a single data frame with computed abundance values or
39 39
 #' a list of 2 data frames (abundance_df, quant_totals)
40 40
 #' @export
... ...
@@ -130,7 +130,7 @@ compute_abundance <- function(x,
130 130
 #' Filter data frames with custom predicates
131 131
 #'
132 132
 #' @description
133
-#' \lifecycle{experimental}
133
+#' `r lifecycle::badge("stable")`
134 134
 #' Filter a single data frame or a list of data frames with custom
135 135
 #' predicates assembled from the function parameters.
136 136
 #'
... ...
@@ -242,7 +242,7 @@ compute_abundance <- function(x,
242 242
 #' character vectors. Must be one of the allowed values between
243 243
 #' `c("<", ">", "==", "!=", ">=", "<=")`
244 244
 #'
245
-#' @family Analysis functions
245
+#' @family Data cleaning and pre-processing
246 246
 #'
247 247
 #' @return A data frame or a list of data frames
248 248
 #' @export
... ...
@@ -291,7 +291,8 @@ threshold_filter <- function(x,
291 291
 #' Sorts and keeps the top n integration sites based on the values
292 292
 #' in a given column.
293 293
 #'
294
-#' \lifecycle{experimental}
294
+#' @description
295
+#' `r lifecycle::badge("stable")`
295 296
 #' The input data frame will be sorted by the highest values in
296 297
 #' the columns specified and the top n rows will be returned as output.
297 298
 #' The user can choose to keep additional columns in the output
... ...
@@ -300,6 +301,11 @@ threshold_filter <- function(x,
300 301
 #' * `keep = "nothing"` only keeps the mandatory columns
301 302
 #' (`mandatory_IS_vars()`) plus the columns in the `columns` parameter.
302 303
 #'
304
+#' @section Required tags:
305
+#' The function will explicitly check for the presence of these tags:
306
+#'
307
+#' * All columns declared in `mandatory_IS_vars()`
308
+#'
303 309
 #' @param x An integration matrix (data frame containing
304 310
 #' `mandatory_IS_vars()`)
305 311
 #' @param n How many integrations should be sliced (in total or
... ...
@@ -316,8 +322,6 @@ threshold_filter <- function(x,
316 322
 #'
317 323
 #' @family Analysis functions
318 324
 #'
319
-#' @importFrom magrittr `%>%`
320
-#' @importFrom rlang abort
321 325
 #'
322 326
 #' @return Either a data frame with at most n rows or
323 327
 #' a data frames with at most n*(number of groups) rows.
... ...
@@ -344,7 +348,7 @@ threshold_filter <- function(x,
344 348
 #'     keep = "Value2",
345 349
 #'     key = "CompleteAmplificationID"
346 350
 #' )
347
-#top_abundant_is
351
+# top_abundant_is
348 352
 top_integrations <- function(x,
349 353
     n = 20,
350 354
     columns = "fragmentEstimate_sum_RelAbundance",
... ...
@@ -408,6 +412,75 @@ top_integrations <- function(x,
408 412
 }
409 413
 
410 414
 
415
+#' Top n targeted genes based on number of IS.
416
+#'
417
+#' @description
418
+#' `r lifecycle::badge("experimental")`
419
+#' Produces a summary of the number of integration events per gene, orders
420
+#' the table in decreasing order and slices the first n rows - either on
421
+#' all the data frame or by group.
422
+#'
423
+#' @details
424
+#' ## Gene grouping
425
+#' When producing a summary of IS by gene, there are different options that
426
+#' can be chosen.
427
+#' The argument `consider_chr` accounts for the fact that some genes (same
428
+#' gene symbol) may span more than one chromosome: if set to `TRUE`
429
+#' counts of IS will be separated for those genes that span 2 or more
430
+#' chromosomes - in other words they will be in 2 different rows of the
431
+#' output table. On the contrary, if the argument is set to `FALSE`,
432
+#' counts will be produced in a single row.
433
+#'
434
+#' NOTE: the function counts **DISTINCT** integration events, which logically
435
+#' corresponds to a union of sets. Be aware of the fact that counts per group
436
+#' and counts with different arguments might be different: if for example
437
+#' counts are performed by considering chromosome and there is one gene symbol
438
+#' with 2 different counts, the sum of those 2 will likely not be equal to
439
+#' the count obtained by performing the calculations without
440
+#' considering the chromosome.
441
+#'
442
+#' The same reasoning can be applied for the argument `consider_gene_strand`,
443
+#' that takes into account the strand of the gene.
444
+#'
445
+#' @section Required tags:
446
+#' The function will explicitly check for the presence of these tags:
447
+#'
448
+#' ```{r echo=FALSE, results="asis"}
449
+#' all_tags <- available_tags()
450
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
451
+#'  ~ "top_targeted_genes" %in% .x)][["tag"]])
452
+#'  cat(paste0("* ", needed, collapse="\n"))
453
+#' ```
454
+#'
455
+#' Note that the tags "gene_strand" and "chromosome" are explicitly required
456
+#' only if `consider_chr = TRUE` and/or `consider_gene_strand = TRUE`.
457
+#'
458
+#' @param x An integration matrix - must be annotated
459
+#' @param n Number of rows to slice
460
+#' @param key If slice has to be performed for each group, the character
461
+#' vector of column names that identify the groups. If `NULL` considers the
462
+#' whole input data frame.
463
+#' @param consider_chr Logical, should the chromosome be taken into account?
464
+#' See details.
465
+#' @param consider_gene_strand Logical, should the gene strand be taken into
466
+#' account? See details.
467
+#' @param as_df If computation is performed by group, `TRUE` returns all
468
+#' groups merged in a single data frame with a column containing the group id.
469
+#' If `FALSE` returns a named list.
470
+#'
471
+#' @importFrom rlang sym
472
+#'
473
+#' @return A data frame or a list of data frames
474
+#' @export
475
+#' @family Analysis functions
476
+#'
477
+#' @examples
478
+#' data("integration_matrices", package = "ISAnalytics")
479
+#' top_targ <- top_targeted_genes(
480
+#'     integration_matrices,
481
+#'     key = NULL
482
+#' )
483
+#' top_targ
411 484
 top_targeted_genes <- function(x,
412 485
     n = 20,
413 486
     key = c(
... ...
@@ -417,70 +490,134 @@ top_targeted_genes <- function(x,
417 490
     consider_chr = TRUE,
418 491
     consider_gene_strand = TRUE,
419 492
     as_df = TRUE) {
420
-  stopifnot(is.data.frame(x))
421
-  data.table::setDT(x)
422
-  stopifnot(is.numeric(n) || is.integer(n))
423
-  stopifnot(is.null(key) || is.character(key))
424
-  stopifnot(is.logical(consider_chr))
425
-  stopifnot(is.logical(consider_gene_strand))
493
+    stopifnot(is.data.frame(x))
494
+    data.table::setDT(x)
495
+    stopifnot(is.numeric(n) || is.integer(n))
496
+    stopifnot(is.null(key) || is.character(key))
497
+    stopifnot(is.logical(consider_chr))
498
+    stopifnot(is.logical(consider_gene_strand))
426 499
 
427
-  required_annot_tags <- c("gene_symbol")
428
-  if (consider_gene_strand) {
429
-    required_annot_tags <- c(required_annot_tags, "gene_strand")
430
-  }
431
-  annot_tag_cols <- .check_required_cols(required_annot_tags,
432
-                                         annotation_IS_vars(TRUE),
433
-                                         "error")
434
-  if (consider_chr) {
435
-    chr_tag_col <- .check_required_cols(c("chromosome", "locus"),
436
-                                        mandatory_IS_vars(TRUE),
437
-                                        "error")
438
-    annot_tag_cols <- annot_tag_cols %>%
439
-      dplyr::bind_rows(chr_tag_col)
440
-  }
441
-  data.table::setDT(annot_tag_cols)
442
-  cols_to_check <- c(annot_tag_cols$names, key)
443
-  if (!all(cols_to_check %in% colnames(x))) {
444
-    rlang::abort(.missing_needed_cols(
445
-      cols_to_check[!cols_to_check %in% colnames(x)]))
446
-  }
500
+    required_annot_tags <- c("gene_symbol")
501
+    if (consider_gene_strand) {
502
+        required_annot_tags <- c(required_annot_tags, "gene_strand")
503
+    }
504
+    annot_tag_cols <- .check_required_cols(
505
+        required_annot_tags,
506
+        annotation_IS_vars(TRUE),
507
+        "error"
508
+    )
509
+    if (consider_chr) {
510
+        chr_tag_col <- .check_required_cols(
511
+            c("chromosome", "locus"),
512
+            mandatory_IS_vars(TRUE),
513
+            "error"
514
+        )
515
+        annot_tag_cols <- annot_tag_cols %>%
516
+            dplyr::bind_rows(chr_tag_col)
517
+    }
518
+    data.table::setDT(annot_tag_cols)
519
+    cols_to_check <- c(annot_tag_cols$names, key)
520
+    if (!all(cols_to_check %in% colnames(x))) {
521
+        rlang::abort(.missing_needed_cols(
522
+            cols_to_check[!cols_to_check %in% colnames(x)]
523
+        ))
524
+    }
447 525
 
448
-  df_with_is_counts <- if (is.null(key)) {
449
-    .count_distinct_is_per_gene(
450
-      x = x, include_chr = consider_chr,
451
-      include_gene_strand = consider_gene_strand,
452
-      gene_sym_col = annot_tag_cols[
453
-        eval(sym("tag")) == "gene_symbol"][["names"]],
454
-      gene_strand_col = annot_tag_cols[
455
-        eval(sym("tag")) == "gene_strand"][["names"]],
456
-      chr_col = annot_tag_cols[eval(sym("tag")) == "chromosome"][["names"]],
457
-      mand_vars_to_check = mandatory_IS_vars(TRUE)
458
-    ) %>%
459
-      dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
460
-      dplyr::slice_head(n = n)
461
-  } else {
462
-    tmp <- x[, .count_distinct_is_per_gene(
463
-      x = .SD, include_chr = consider_chr,
464
-      include_gene_strand = consider_gene_strand,
465
-      gene_sym_col = annot_tag_cols[
466
-        eval(sym("tag")) == "gene_symbol"][["names"]],
467
-      gene_strand_col = annot_tag_cols[
468
-        eval(sym("tag")) == "gene_strand"][["names"]],
469
-      chr_col = annot_tag_cols[eval(sym("tag")) == "chromosome"][["names"]],
470
-      mand_vars_to_check = mandatory_IS_vars(TRUE)
471
-    ), by = eval(key)]
472
-    tmp[,
473
-        .SD %>% dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
474
-          dplyr::slice_head(n = n),
475
-        by = eval(key)]
476
-  }
477
-  if (as_df) {
478
-    return(df_with_is_counts)
479
-  }
480
-  return(split(df_with_is_counts, by = key))
526
+    df_with_is_counts <- if (is.null(key)) {
527
+        .count_distinct_is_per_gene(
528
+            x = x, include_chr = consider_chr,
529
+            include_gene_strand = consider_gene_strand,
530
+            gene_sym_col = annot_tag_cols[
531
+                eval(sym("tag")) == "gene_symbol"
532
+            ][["names"]],
533
+            gene_strand_col = annot_tag_cols[
534
+                eval(sym("tag")) == "gene_strand"
535
+            ][["names"]],
536
+            chr_col = annot_tag_cols[eval(sym("tag")) ==
537
+                "chromosome"][["names"]],
538
+            mand_vars_to_check = mandatory_IS_vars(TRUE)
539
+        ) %>%
540
+            dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
541
+            dplyr::slice_head(n = n)
542
+    } else {
543
+        tmp <- x[, .count_distinct_is_per_gene(
544
+            x = .SD, include_chr = consider_chr,
545
+            include_gene_strand = consider_gene_strand,
546
+            gene_sym_col = annot_tag_cols[
547
+                eval(sym("tag")) == "gene_symbol"
548
+            ][["names"]],
549
+            gene_strand_col = annot_tag_cols[
550
+                eval(sym("tag")) == "gene_strand"
551
+            ][["names"]],
552
+            chr_col = annot_tag_cols[eval(sym("tag")) ==
553
+                "chromosome"][["names"]],
554
+            mand_vars_to_check = mandatory_IS_vars(TRUE)
555
+        ), by = eval(key)]
556
+        tmp[,
557
+            .SD %>% dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
558
+                dplyr::slice_head(n = n),
559
+            by = eval(key)
560
+        ]
561
+    }
562
+    if (as_df) {
563
+        return(df_with_is_counts)
564
+    }
565
+    return(split(df_with_is_counts, by = key))
481 566
 }
482 567
 
483 568
 
569
+#' Compute Fisher's exact test on gene frequencies.
570
+#'
571
+#' @description
572
+#' `r lifecycle::badge("experimental")`
573
+#' Provided 2 data frames with calculations for CIS, via `CIS_grubbs()`,
574
+#' computes Fisher's exact test.
575
+#' Results can be plotted via `fisher_scatterplot()`.
576
+#'
577
+#' @param cis_x A data frame obtained via `CIS_grubbs()`
578
+#' @param cis_y A data frame obtained via `CIS_grubbs()`
579
+#' @param min_is_per_gene Used for pre-filtering purposes. Genes with a
580
+#' number of distinct integration less than this number will be filtered out
581
+#' prior calculations. Single numeric or integer.
582
+#' @param gene_set_method One between "intersection" and "union". When merging
583
+#' the 2 data frames, `intersection` will perform an inner join operation,
584
+#' while `union` will perform a full join operation.
585
+#' @param significance_threshold Significance threshold for the Fisher's
586
+#' test p-value
587
+#' @param remove_unbalanced_0 Remove from the final output those pairs in
588
+#' which there are no IS for one group or the other and the number of
589
+#' IS of the non-missing group are less than the mean number of IS for that
590
+#' group
591
+#'
592
+#' @template genes_db
593
+#'
594
+#' @section Required tags:
595
+#' The function will explicitly check for the presence of these tags:
596
+#'
597
+#' ```{r echo=FALSE, results="asis"}
598
+#' all_tags <- available_tags()
599
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
600
+#'  ~ "gene_frequency_fisher" %in% .x)][["tag"]])
601
+#'  cat(paste0("* ", needed, collapse="\n"))
602
+#' ```
603
+#'
604
+#' @return A data frame
605
+#' @export
606
+#' @family Analysis functions
607
+#'
608
+#' @examples
609
+#' data("integration_matrices", package = "ISAnalytics")
610
+#' data("association_file", package = "ISAnalytics")
611
+#' aggreg <- aggregate_values_by_key(
612
+#'     x = integration_matrices,
613
+#'     association_file = association_file,
614
+#'     value_cols = c("seqCount", "fragmentEstimate")
615
+#' )
616
+#' cis <- CIS_grubbs(aggreg, by = "SubjectID")
617
+#' fisher <- gene_frequency_fisher(cis$cis$PT001, cis$cis$PT002,
618
+#'     min_is_per_gene = 2
619
+#' )
620
+#' fisher
484 621
 gene_frequency_fisher <- function(cis_x,
485 622
     cis_y,
486 623
     min_is_per_gene = 3,
... ...
@@ -506,142 +643,169 @@ gene_frequency_fisher <- function(cis_x,
506 643
     stopifnot(is.logical(remove_unbalanced_0))
507 644
     ## -- Fetch gene symbol column
508 645
     gene_sym_col <- .check_required_cols(
509
-      "gene_symbol", annotation_IS_vars(TRUE), duplicate_politic = "error"
646
+        "gene_symbol", annotation_IS_vars(TRUE),
647
+        duplicate_politic = "error"
510 648
     )[["names"]]
511
-    req_cis_cols <- c(gene_sym_col, "n_IS_perGene", "average_TxLen",
512
-                      "raw_gene_integration_frequency")
649
+    req_cis_cols <- c(
650
+        gene_sym_col, "n_IS_perGene", "average_TxLen",
651
+        "raw_gene_integration_frequency"
652
+    )
513 653
     quiet_expand <- purrr::quietly(.expand_cis_df)
514
-    cols_for_join <- c(gene_sym_col,
515
-                     "Onco1_TS2", "ClinicalRelevance", "DOIReference",
516
-                     "KnownGeneClass", "KnownClonalExpansion",
517
-                     "CriticalForInsMut")
654
+    cols_for_join <- c(
655
+        gene_sym_col,
656
+        "Onco1_TS2", "ClinicalRelevance", "DOIReference",
657
+        "KnownGeneClass", "KnownClonalExpansion",
658
+        "CriticalForInsMut"
659
+    )
518 660
     ## --- Calculations to perform on each df
519 661
     append_calc <- function(df, group_n) {
520
-      if (!all(req_cis_cols %in% colnames(df))) {
521
-        rlang::abort(
522
-          .missing_needed_cols(req_cis_cols[!req_cis_cols %in% colnames(df)]))
523
-      }
524
-      modified <- quiet_expand(df, gene_sym_col,
525
-                                 onco_db_file, tumor_suppressors_db_file,
526
-                                 species, known_onco, suspicious_genes)$result
527
-      modified <- modified %>%
528
-        dplyr::mutate(
529
-          IS_per_kbGeneLen = .data$raw_gene_integration_frequency * 1000,
530
-          Sum_IS_per_kbGeneLen = sum(.data$IS_per_kbGeneLen, na.rm = TRUE),
531
-          IS_per_kbGeneLen_perMDepth_TPM = (.data$IS_per_kbGeneLen /
532
-                                              .data$Sum_IS_per_kbGeneLen) * 1e6
533
-        ) %>%
534
-        dplyr::filter(.data$n_IS_perGene >= min_is_per_gene) %>%
535
-        dplyr::select(dplyr::all_of(c(req_cis_cols, cols_for_join,
536
-                                      "IS_per_kbGeneLen",
537
-                                      "Sum_IS_per_kbGeneLen",
538
-                                      "IS_per_kbGeneLen_perMDepth_TPM")))
539
-      colnames(modified)[!colnames(modified) %in% cols_for_join] <- paste(
540
-        colnames(modified)[!colnames(modified) %in% cols_for_join], group_n,
541
-        sep = "_"
542
-      )
543
-      return(modified)
662
+        if (!all(req_cis_cols %in% colnames(df))) {
663
+            rlang::abort(
664
+                .missing_needed_cols(req_cis_cols[!req_cis_cols %in%
665
+                    colnames(df)])
666
+            )
667
+        }
668
+        modified <- quiet_expand(
669
+            df, gene_sym_col,
670
+            onco_db_file, tumor_suppressors_db_file,
671
+            species, known_onco, suspicious_genes
672
+        )$result
673
+        modified <- modified %>%
674
+            dplyr::mutate(
675
+                IS_per_kbGeneLen = .data$raw_gene_integration_frequency * 1000,
676
+                Sum_IS_per_kbGeneLen = sum(.data$IS_per_kbGeneLen,
677
+                    na.rm = TRUE
678
+                ),
679
+                IS_per_kbGeneLen_perMDepth_TPM = (.data$IS_per_kbGeneLen /
680
+                    .data$Sum_IS_per_kbGeneLen) * 1e6
681
+            ) %>%
682
+            dplyr::filter(.data$n_IS_perGene >= min_is_per_gene) %>%
683
+            dplyr::select(dplyr::all_of(c(
684
+                req_cis_cols, cols_for_join,
685
+                "IS_per_kbGeneLen",
686
+                "Sum_IS_per_kbGeneLen",
687
+                "IS_per_kbGeneLen_perMDepth_TPM"
688
+            )))
689
+        colnames(modified)[!colnames(modified) %in% cols_for_join] <- paste(
690
+            colnames(modified)[!colnames(modified) %in% cols_for_join], group_n,
691
+            sep = "_"
692
+        )
693
+        return(modified)
544 694
     }
545
-    cis_mod <- purrr::map2(list(cis_x, cis_y), c(1,2), append_calc)
695
+    cis_mod <- purrr::map2(list(cis_x, cis_y), c(1, 2), append_calc)
546 696
     ## --- Merge the two in 1 df
547 697
     merged <- if (gene_set_method == "union") {
548
-      purrr::reduce(cis_mod, ~ dplyr::full_join(.x, .y, by = cols_for_join))
698
+        purrr::reduce(cis_mod, ~ dplyr::full_join(.x, .y, by = cols_for_join))
549 699
     } else {
550
-      purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join))
700
+        purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join))
551 701
     }
552 702
     if (nrow(merged) == 0) {
553
-      if (getOption("ISAnalytics.verbose") == TRUE) {
554
-        msg <- c("Data frame empty after filtering",
555
-                 i = paste("Data frame is empty after applying filter on IS,",
556
-                           "is your filter too stringent?"),
557
-                 x = "Nothing to return")
558
-        rlang::inform(msg, class = "empty_df_gene_freq")
559
-      }
560
-      return(NULL)
703
+        if (getOption("ISAnalytics.verbose") == TRUE) {
704
+            msg <- c("Data frame empty after filtering",
705
+                i = paste(
706
+                    "Data frame is empty after applying filter on IS,",
707
+                    "is your filter too stringent?"
708
+                ),
709
+                x = "Nothing to return"
710
+            )
711
+            rlang::inform(msg, class = "empty_df_gene_freq")
712
+        }
713
+        return(NULL)
561 714
     }
562 715
     ## --- Actual computation of fisher test: test is applied on each row
563 716
     ## (each gene)
564 717
     merged <- merged %>%
565
-      dplyr::mutate(
566
-        tot_n_IS_perGene_1 = sum(cis_x$n_IS_perGene, na.rm = TRUE),
567
-        tot_n_IS_perGene_2 = sum(cis_y$n_IS_perGene, na.rm = TRUE)
568
-      )
718
+        dplyr::mutate(
719
+            tot_n_IS_perGene_1 = sum(cis_x$n_IS_perGene, na.rm = TRUE),
720
+            tot_n_IS_perGene_2 = sum(cis_y$n_IS_perGene, na.rm = TRUE)
721
+        )
569 722
     compute_fisher <- function(...) {
570
-      row <- list(...)
571
-      n_IS_perGene_1 <- row$n_IS_perGene_1
572
-      n_IS_perGene_2 <- row$n_IS_perGene_2
573
-      n_IS_perGene_1[which(is.na(n_IS_perGene_1))] <- 0
574
-      n_IS_perGene_2[which(is.na(n_IS_perGene_2))] <- 0
575
-      matrix <- matrix(
576
-        data = c(n_IS_perGene_1,
577
-                 row$tot_n_IS_perGene_1 - n_IS_perGene_1,
578
-                 n_IS_perGene_2,
579
-                 row$tot_n_IS_perGene_2 - n_IS_perGene_2),
580
-        nrow = 2,
581
-        dimnames = list(G1 = c("IS_of_gene", "TotalIS"),
582
-                        G2 = c("IS_of_gene", "TotalIS"))
583
-      )
584
-      ft <- stats::fisher.test(matrix)
585
-      return(ft$p.value)
723
+        row <- list(...)
724
+        n_IS_perGene_1 <- row$n_IS_perGene_1
725
+        n_IS_perGene_2 <- row$n_IS_perGene_2
726
+        n_IS_perGene_1[which(is.na(n_IS_perGene_1))] <- 0
727
+        n_IS_perGene_2[which(is.na(n_IS_perGene_2))] <- 0
728
+        matrix <- matrix(
729
+            data = c(
730
+                n_IS_perGene_1,
731
+                row$tot_n_IS_perGene_1 - n_IS_perGene_1,
732
+                n_IS_perGene_2,
733
+                row$tot_n_IS_perGene_2 - n_IS_perGene_2
734
+            ),
735
+            nrow = 2,
736
+            dimnames = list(
737
+                G1 = c("IS_of_gene", "TotalIS"),
738
+                G2 = c("IS_of_gene", "TotalIS")
739
+            )
740
+        )
741
+        ft <- stats::fisher.test(matrix)
742
+        return(ft$p.value)
586 743
     }
587 744
     merged <- merged %>%
588
-      dplyr::mutate(
589
-        Fisher_p_value = purrr::pmap_dbl(., compute_fisher)
590
-      ) %>%
591
-      dplyr::mutate(
592
-        Fisher_p_value_significant = dplyr::if_else(
593
-          condition = .data$Fisher_p_value < significance_threshold,
594
-          true = TRUE, false = FALSE
745
+        dplyr::mutate(
746
+            Fisher_p_value = purrr::pmap_dbl(., compute_fisher)
747
+        ) %>%
748
+        dplyr::mutate(
749
+            Fisher_p_value_significant = dplyr::if_else(
750
+                condition = .data$Fisher_p_value < significance_threshold,
751
+                true = TRUE, false = FALSE
752
+            )
595 753
         )
596
-      )
597 754
     ## --- Removing unbalanced 0s if requested - this scenario applies
598 755
     ## only if "union" is selected as method for join
599 756
     if (remove_unbalanced_0) {
600
-      mean_is_per_gene_1 <- ceiling(mean(merged$n_IS_perGene_1, na.rm = TRUE))
601
-      mean_is_per_gene_2 <- ceiling(mean(merged$n_IS_perGene_2, na.rm = TRUE))
602
-      test_exclude <- function(...) {
603
-        row <- list(...)
604
-        if (is.na(row$n_IS_perGene_1) || is.na(row$n_IS_perGene_2)) {
605
-          to_ex <- ifelse(
606
-              test = ((row$n_IS_perGene_1 < mean_is_per_gene_1) &
607
-                (is.na(row$n_IS_perGene_2))) |
608
-                ((is.na(row$n_IS_perGene_1)) &
609
-                   (row$n_IS_perGene_2 < mean_is_per_gene_2)),
610
-              yes = TRUE,
611
-              no = FALSE
612
-          )
613
-          return(to_ex)
757
+        mean_is_per_gene_1 <- ceiling(mean(merged$n_IS_perGene_1, na.rm = TRUE))
758
+        mean_is_per_gene_2 <- ceiling(mean(merged$n_IS_perGene_2, na.rm = TRUE))
759
+        test_exclude <- function(...) {
760
+            row <- list(...)
761
+            if (is.na(row$n_IS_perGene_1) || is.na(row$n_IS_perGene_2)) {
762
+                to_ex <- ifelse(
763
+                    test = ((row$n_IS_perGene_1 < mean_is_per_gene_1) &
764
+                        (is.na(row$n_IS_perGene_2))) |
765
+                        ((is.na(row$n_IS_perGene_1)) &
766
+                            (row$n_IS_perGene_2 < mean_is_per_gene_2)),
767
+                    yes = TRUE,
768
+                    no = FALSE
769
+                )
770
+                return(to_ex)
771
+            }
772
+            return(FALSE)
614 773
         }
615
-        return(FALSE)
616
-      }
617
-      merged <- merged %>%
618
-        dplyr::mutate(
619
-          to_exclude_from_test = purrr::pmap(., test_exclude)
620
-        ) %>%
621
-        dplyr::filter(.data$to_exclude_from_test == FALSE) %>%
622
-        dplyr::select(-.data$to_exclude_from_test)
623
-      if (nrow(merged) == 0) {
624
-        if (getOption("ISAnalytics.verbose") == TRUE) {
625
-          msg <- c("Data frame empty after filtering",
626
-                   i = paste("Data frame is after removing unbalanced IS,",
627
-                             "nothing to return"))
628
-          rlang::inform(msg, class = "empty_df_gene_freq_unbal")
774
+        merged <- merged %>%
775
+            dplyr::mutate(
776
+                to_exclude_from_test = purrr::pmap(., test_exclude)
777
+            ) %>%
778
+            dplyr::filter(.data$to_exclude_from_test == FALSE) %>%
779
+            dplyr::select(-.data$to_exclude_from_test)
780
+        if (nrow(merged) == 0) {
781
+            if (getOption("ISAnalytics.verbose") == TRUE) {
782
+                msg <- c("Data frame empty after filtering",
783
+                    i = paste(
784
+                        "Data frame is after removing unbalanced IS,",
785
+                        "nothing to return"
786
+                    )
787
+                )
788
+                rlang::inform(msg, class = "empty_df_gene_freq_unbal")
789
+            }
790
+            return(NULL)
629 791
         }
630
-        return(NULL)
631
-      }
632 792
     }
633 793
     ## --- Apply statistical corrections to p-value
634 794
     merged <- merged %>%
635
-      dplyr::mutate(
636
-        Fisher_p_value_fdr = p.adjust(.data$Fisher_p_value, method = "fdr",
637
-                                      n = length(.data$Fisher_p_value)),
638
-        Fisher_p_value_benjamini = p.adjust(.data$Fisher_p_value, method = "BY",
639
-                                            n = length(.data$Fisher_p_value)),
640
-        minus_log10_pvalue = -log(.data$Fisher_p_value, base = 10)
641
-      ) %>%
642
-      dplyr::mutate(
643
-        minus_log10_pvalue_fdr = -log(.data$Fisher_p_value_fdr, base = 10),
644
-      )
795
+        dplyr::mutate(
796
+            Fisher_p_value_fdr = stats::p.adjust(.data$Fisher_p_value,
797
+                method = "fdr",
798
+                n = length(.data$Fisher_p_value)
799
+            ),
800
+            Fisher_p_value_benjamini = stats::p.adjust(.data$Fisher_p_value,
801
+                method = "BY",
802
+                n = length(.data$Fisher_p_value)
803
+            ),
804
+            minus_log10_pvalue = -log(.data$Fisher_p_value, base = 10)
805
+        ) %>%
806
+        dplyr::mutate(
807
+            minus_log10_pvalue_fdr = -log(.data$Fisher_p_value_fdr, base = 10),
808
+        )
645 809
     return(merged)
646 810
 }
647 811
 
... ...
@@ -650,7 +814,7 @@ gene_frequency_fisher <- function(cis_x,
650 814
 #' the metadata data frame accordingly.
651 815
 #'
652 816
 #' @description
653
-#' \lifecycle{experimental}
817
+#' `r lifecycle::badge("stable")`
654 818
 #' The function operates on a data frame by grouping the content by
655 819
 #' the sample key and computing every function specified on every
656 820
 #' column in the `value_columns` parameter. After that the metadata
... ...
@@ -686,14 +850,17 @@ gene_frequency_fisher <- function(cis_x,
686 850
 #' @param functions A named list of function or purrr-style lambdas
687 851
 #' @param add_integrations_count Add the count of distinct integration sites
688 852
 #' for each group? Can be computed only if `x` contains the mandatory columns
689
-#' `chr`, `integration_locus`, `strand`
853
+#' `mandatory_IS_vars()`
854
+#'
855
+#' @section Required tags:
856
+#' The function will explicitly check for the presence of these tags:
857
+#'
858
+#' * All columns declared in `mandatory_IS_vars()`
859
+#'
860
+#' These are checked only if `add_integrations_count = TRUE`.
690 861
 #'
691 862
 #' @family Analysis functions
692
-#' @importFrom rlang eval_tidy expr abort .data sym inform
693
-#' @importFrom purrr is_function is_formula map_lgl walk map set_names
694
-#' @importFrom dplyr group_by across all_of summarise rename_with bind_cols
695
-#' @importFrom dplyr n_distinct left_join
696
-#' @importFrom magrittr `%>%`
863
+#' @importFrom rlang .data sym
697 864
 #'
698 865
 #' @return A list with modified x and metadata data frames
699 866
 #' @export
... ...
@@ -797,7 +964,8 @@ sample_statistics <- function(x,
797 964
 
798 965
 #' Grubbs test for Common Insertion Sites (CIS).
799 966
 #'
800
-#' \lifecycle{stable}
967
+#' @description
968
+#' `r lifecycle::badge("stable")`
801 969
 #' Statistical approach for the validation of common insertion sites
802 970
 #' significance based on the comparison of the integration frequency
803 971
 #' at the CIS gene with respect to other genes contained in the
... ...
@@ -824,6 +992,16 @@ sample_statistics <- function(x,
824 992
 #'
825 993
 #' `r refGene_table_cols()`
826 994
 #'
995
+#' @section Required tags:
996
+#' The function will explicitly check for the presence of these tags:
997
+#'
998
+#' ```{r echo=FALSE, results="asis"}
999
+#' all_tags <- available_tags()
1000
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
1001
+#'  ~ "CIS_grubbs" %in% .x)][["tag"]])
1002
+#'  cat(paste0("* ", needed, collapse="\n"))
1003
+#' ```
1004
+#'
827 1005
 #' @param x An integration matrix, must include the `mandatory_IS_vars()`
828 1006
 #' columns and the `annotation_IS_vars()` columns
829 1007
 #' @param genomic_annotation_file Database file for gene annotation,
... ...
@@ -834,17 +1012,16 @@ sample_statistics <- function(x,
834 1012
 #' NULL, the function will perform calculations for each group and return
835 1013
 #' a list of data frames with the results. E.g. for `by = "SubjectID"`,
836 1014
 #' CIS will be computed for each distinct SubjectID found in the table
837
-#' (of course, "SubjectID" column must be included in the input data frame).
1015
+#' ("SubjectID" column must be included in the input data frame).
1016
+#' @param return_missing_as_df Returns those genes present in the input df
1017
+#' but not in the refgenes as a data frame?
1018
+#' @param results_as_list Relevant only if `by` is not `NULL` - if `TRUE`
1019
+#' return the group computations as a named list, otherwise return a single
1020
+#' df with an additional column containing the group id
838 1021
 #'
839 1022
 #' @family Analysis functions
840 1023
 #'
841
-#' @importFrom tibble as_tibble
842
-#' @importFrom rlang .data abort current_env eval_tidy sym
843
-#' @importFrom magrittr `%>%`
844
-#' @importFrom utils read.csv
845
-#' @importFrom stringr str_replace_all
846
-#' @importFrom tidyr unite
847
-#' @importFrom purrr set_names map
1024
+#' @importFrom rlang .data sym
848 1025
 #'
849 1026
 #' @return A data frame
850 1027
 #' @export
... ...
@@ -852,7 +1029,7 @@ sample_statistics <- function(x,
852 1029
 #' @examples
853 1030
 #' data("integration_matrices", package = "ISAnalytics")
854 1031
 #' cis <- CIS_grubbs(integration_matrices)
855
-#' head(cis)
1032
+#' cis
856 1033
 CIS_grubbs <- function(x,
857 1034
     genomic_annotation_file = "hg19",
858 1035
     grubbs_flanking_gene_bp = 100000,
... ...
@@ -996,7 +1173,7 @@ CIS_grubbs <- function(x,
996 1173
                 "a mismatch in the annotation phase of",
997 1174
                 "the matrix. Here is a summary: "
998 1175
             ),
999
-            paste0(capture.output({
1176
+            paste0(utils::capture.output({
1000 1177
                 print(missing_df, n = Inf)
1001 1178
             }), collapse = "\n"),
1002 1179
             sep = "\n"
... ...
@@ -1030,36 +1207,11 @@ CIS_grubbs <- function(x,
1030 1207
 
1031 1208
 #' Integrations cumulative count in time by sample
1032 1209
 #'
1033
-#' \lifecycle{experimental}
1034
-#' This function computes the cumulative number of integrations
1035
-#' observed in each sample at different time points by assuming that
1036
-#' if an integration is observed at time point "t" then it is also observed in
1037
-#' time point "t+1".
1210
+#' @description
1211
+#' `r lifecycle::badge("deprecated")`
1212
+#' This function was deprecated in favour of a single function,
1213
+#' please use `cumulative_is` instead.
1038 1214
 #'
1039
-#' @details
1040
-#' ## Input data frame
1041
-#' The user can provide as input for the `x` parameter both a simple
1042
-#' integration matrix AND setting the `aggregate` parameter to TRUE,
1043
-#' or provide an already aggregated matrix via
1044
-#' \link{aggregate_values_by_key}.
1045
-#' If the user supplies a matrix to be aggregated the `association_file`
1046
-#' parameter must not be NULL: aggregation will be done by an internal
1047
-#' call to the aggregation function.
1048
-#' If the user supplies an already aggregated matrix, the `key` parameter
1049
-#' is the key used for aggregation -
1050
-#' **NOTE: for this operation is mandatory
1051
-#' that the time point column is included in the key.**
1052
-#' ## Assumptions on time point format
1053
-#' By using the functions provided by this package, when imported,
1054
-#' an association file will be correctly formatted for future usage.
1055
-#' In the formatting process there is also a padding operation performed on
1056
-#' time points: this means the functions expects the time point column to
1057
-#' be of type character and to be correctly padded with 0s. If the
1058
-#' chosen column for time point is detected as numeric the function will
1059
-#' attempt the conversion to character and automatic padding.
1060
-#' If you choose to import the association file not using the
1061
-#' \link{import_association_file} function, be sure to check the format of
1062
-#' the chosen column to avoid undesired results.
1063 1215
 #'
1064 1216
 #' @param x A simple integration matrix or an aggregated matrix (see details)
1065 1217
 #' @param association_file NULL or the association file for x if `aggregate`
... ...
@@ -1071,20 +1223,9 @@ CIS_grubbs <- function(x,
1071 1223
 #' @param aggregate Should x be aggregated?
1072 1224
 #' @param ... Additional parameters to pass to `aggregate_values_by_key`
1073 1225
 #'
1074
-#' @family Analysis functions
1075
-#'
1076
-#' @importFrom dplyr mutate filter across all_of select summarise group_by
1077
-#' @importFrom dplyr arrange group_split first full_join starts_with distinct
1078
-#' @importFrom dplyr semi_join n rename
1079
-#' @importFrom magrittr `%>%`
1080
-#' @importFrom rlang .data abort inform `:=`
1081
-#' @importFrom stringr str_pad
1082
-#' @importFrom purrr reduce is_empty
1083
-#' @importFrom tidyr pivot_longer
1084
-#' @importFrom stats na.omit
1085
-#'
1086 1226
 #' @return A data frame
1087 1227
 #' @export
1228
+#' @keywords internal
1088 1229
 #'
1089 1230
 #' @examples
1090 1231
 #' data("integration_matrices", package = "ISAnalytics")
... ...
@@ -1114,7 +1255,8 @@ cumulative_count_union <- function(x,
1114 1255
         what = "cumulative_count_union()",
1115 1256
         with = "cumulative_is()",
1116 1257
         details = c(paste(
1117
-            "Use option `counts = TRUE`. Function will be likely dropped in the",
1258
+            "Use option `counts = TRUE`.",
1259
+            "Function will be likely dropped in the",
1118 1260
             "next release cycle"
1119 1261
         ))
1120 1262
     )
... ...
@@ -1124,9 +1266,10 @@ cumulative_count_union <- function(x,
1124 1266
     )
1125 1267
 }
1126 1268
 
1127
-#' Expands integration matrix with the cumulative is union over time.
1269
+#' Expands integration matrix with the cumulative IS union over time.
1128 1270
 #'
1129
-#' @description \lifecycle{experimental}
1271
+#' @description
1272
+#' `r lifecycle::badge("experimental")`
1130 1273
 #' Given an input integration matrix that can be grouped over time,
1131 1274
 #' this function adds integrations in groups assuming that
1132 1275
 #' if an integration is observed at time point "t" then it is also observed in
... ...
@@ -1141,6 +1284,14 @@ cumulative_count_union <- function(x,
1141 1284
 #' @param expand If `FALSE`, for each group, the set of integration sites is
1142 1285
 #' returned in a separate column as a nested table, otherwise the resulting
1143 1286
 #' column is unnested.
1287
+#' @param counts Add cumulative counts? Logical
1288
+#'
1289
+#' @section Required tags:
1290
+#' The function will explicitly check for the presence of these tags:
1291
+#'
1292
+#' * All columns declared in `mandatory_IS_vars()`
1293
+#' * Checks if the matrix is annotated by assessing presence of
1294
+#' `annotation_IS_vars()`
1144 1295
 #'
1145 1296
 #' @family Analysis functions
1146 1297
 #' @return A data frame
... ...
@@ -1210,12 +1361,12 @@ cumulative_is <- function(x,
1210 1361
             .keep_all = TRUE
1211 1362
         )
1212 1363
     data.table::setDT(temp)
1213
-    temp <- temp[, .(is = list(.SD)), by = key]
1364
+    temp <- temp[, list(is = list(.SD)), by = key]
1214 1365
     no_tp_key <- key[key != timepoint_col]
1215 1366
     split <- split(temp, by = no_tp_key)
1216 1367
     cumulate <- purrr::map(split, function(x) {
1217 1368
         x[, cumulative_is := purrr::accumulate(
1218
-            is,
1369
+            get("is"),
1219 1370
             ~ data.table::funion(.x, .y)
1220 1371
         )]
1221 1372
     })
... ...
@@ -1246,13 +1397,14 @@ cumulative_is <- function(x,
1246 1397
 
1247 1398
 #' Sharing of integration sites between given groups.
1248 1399
 #'
1249
-#' \lifecycle{experimental}
1400
+#' @description
1401
+#' `r lifecycle::badge("stable")`
1250 1402
 #' Computes the amount of integration sites shared between the groups identified
1251 1403
 #' in the input data.
1252 1404
 #'
1253 1405
 #' @details
1254
-#' An integration site is always identified by the triple
1255
-#' `(chr, integration_locus, strand)`, thus these columns must be present
1406
+#' An integration site is always identified by the combination of fields in
1407
+#' `mandatory_IS_vars()`, thus these columns must be present
1256 1408
 #' in the input(s).
1257 1409
 #'
1258 1410
 #' The function accepts multiple inputs for different scenarios, please refer
... ...
@@ -1270,6 +1422,11 @@ cumulative_is <- function(x,
1270 1422
 #' function \code{\link{sharing_heatmap}} or via the function
1271 1423
 #' \code{\link{sharing_venn}}
1272 1424
 #'
1425
+#' @section Required tags:
1426
+#' The function will explicitly check for the presence of these tags:
1427
+#'
1428
+#' * All columns declared in `mandatory_IS_vars()`
1429
+#'
1273 1430
 #' @param ... One or more integration matrices
1274 1431
 #' @param group_key Character vector of column names which identify a
1275 1432
 #' single group. An associated group id will be derived by concatenating
... ...
@@ -1522,7 +1679,8 @@ is_sharing <- function(...,
1522 1679
 
1523 1680
 #' Find the source of IS by evaluating sharing.
1524 1681
 #'
1525
-#' @description \lifecycle{experimental}
1682
+#' @description
1683
+#' `r lifecycle::badge("stable")`
1526 1684
 #' The function computes the sharing between a reference group of interest
1527 1685
 #' for each time point and a selection of groups of interest. In this way
1528 1686
 #' it is possible to observe the percentage of shared integration sites between
... ...
@@ -4,46 +4,51 @@
4 4
 
5 5
 #' Identifies and removes collisions.
6 6
 #'
7
-#' \lifecycle{stable}
8
-#' A collision is an integration (aka a unique combination of
9
-#' `chr`, `integration_locus` and `strand`) which is observed in more than one
10
-#' independent sample (a unique pair of `ProjectID` and `SubjectID`).
11
-#' The function tries to decide to which subject an integration
12
-#' should be assigned to and, if no
7
+#' @description
8
+#' `r lifecycle::badge("stable")`
9
+#' A collision is an integration (aka a unique combination of the provided
10
+#' `mandatory_IS_vars()`) which is observed in more than one
11
+#' independent sample.
12
+#' The function tries to decide to which independent sample should
13
+#' an integration event be assigned to, and if no
13 14
 #' decision can be taken, the integration is completely removed from the data
14 15
 #' frame.
15 16
 #' For more details refer to the vignette "Collision removal functionality":
16 17
 #' \code{vignette("collision_removal", package = "ISAnalytics")}
17 18
 #'
18
-#' @param x Either a multi-quantification matrix or a
19
+#' @param x Either a multi-quantification matrix (recommended) or a
19 20
 #' named list of matrices (names must be quantification types)
20 21
 #' @param association_file The association file imported via
21 22
 #' `import_association_file()`
23
+#' @param independent_sample_id A character vector of column names that
24
+#' identify independent samples
22 25
 #' @param date_col The date column that should be considered.
23
-#' Must be one value in `date_columns_coll()`
24 26
 #' @param reads_ratio A single numeric value that represents the ratio that has
25 27
 #' to be considered when deciding between `seqCount` value.
26 28
 #' @param quant_cols A named character vector where names are
27 29
 #' quantification types and
28 30
 #' values are the names of the corresponding columns. The quantification
29 31
 #' `seqCount` MUST be included in the vector.
30
-#' @param report_path The path where the report file should be saved.
31
-#' Can be a folder, a file or NULL if no report should be produced.
32
-#' Defaults to `{user_home}/ISAnalytics_reports`.
33 32
 #' @param max_workers Maximum number of parallel workers to distribute the
34 33
 #' workload. If `NULL` (default) produces the maximum amount of workers allowed,
35 34
 #' a numeric value is requested otherwise. WARNING: a higher number of workers
36 35
 #' speeds up computation at the cost of memory consumption! Tune this parameter
37 36
 #' accordingly.
38 37
 #'
39
-#' @family Collision removal
40
-#' @importFrom magrittr `%>%`
41
-#' @importFrom rlang inform abort exec .data
42
-#' @importFrom purrr map2
43
-#' @importFrom dplyr bind_rows select all_of group_by summarise across n
44
-#' @importFrom dplyr distinct
38
+#' @section Required tags:
39
+#' The function will explicitly check for the presence of these tags:
45 40
 #'
46
-#' @seealso \code{\link{date_columns_coll}}
41
+#' ```{r echo=FALSE, results="asis"}
42
+#' all_tags <- available_tags()
43
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
44
+#'  ~ "remove_collisions" %in% .x)][["tag"]])
45
+#'  cat(paste0("* ", needed, collapse="\n"))
46
+#' ```
47
+#'
48
+#' @template report_path_param
49
+#'
50
+#' @family Data cleaning and pre-processing
51
+#' @importFrom rlang .data sym
47 52
 #'
48 53
 #' @return Either a multi-quantification matrix or a list of data frames
49 54
 #' @export
... ...
@@ -191,32 +196,32 @@ remove_collisions <- function(x,
191 196
     if (getOption("ISAnalytics.reports") == TRUE & !is.null(report_path)) {
192 197
         post_joined <- association_file[final_matr, on = pcr_col]
193 198
         post_joined <- post_joined[, mget(c(
194
-          colnames(final_matr), date_col,
195
-          replicate_n_col, independent_sample_id, pool_col
199
+            colnames(final_matr), date_col,
200
+            replicate_n_col, independent_sample_id, pool_col
196 201
         ))]
197 202
         summaries <- .collisions_obtain_report_summaries(
198
-          x = x, association_file = association_file,
199
-          quant_cols = quant_cols, missing_ind = missing_ind,
200
-          pcr_col = pcr_col, pre_process = pre_process,
201
-          collisions = split_df$collisions, removed = removed,
202
-          reassigned = reassigned,
203
-          joined = joined,
204
-          final_matr = final_matr,
205
-          post_joined = post_joined,
206
-          pool_col = pool_col, replicate_n_col = replicate_n_col,
207
-          independent_sample_id = independent_sample_id,
208
-          seq_count_col = seq_count_col
203
+            x = x, association_file = association_file,
204
+            quant_cols = quant_cols, missing_ind = missing_ind,
205
+            pcr_col = pcr_col, pre_process = pre_process,
206
+            collisions = split_df$collisions, removed = removed,
207
+            reassigned = reassigned,
208
+            joined = joined,
209
+            final_matr = final_matr,
210
+            post_joined = post_joined,
211
+            pool_col = pool_col, replicate_n_col = replicate_n_col,
212
+            independent_sample_id = independent_sample_id,
213
+            seq_count_col = seq_count_col
209 214
         )
210 215
         sharing_heatmaps <- .collisions_obtain_sharing_heatmaps(
211
-          joined = joined, independent_sample_id = independent_sample_id,
212
-          post_joined = post_joined, report_path = report_path
216
+            joined = joined, independent_sample_id = independent_sample_id,
217
+            post_joined = post_joined, report_path = report_path
213 218
         )
214 219
         report_params <- append(summaries, sharing_heatmaps)
215 220
         report_params[["additional_info"]] <- add_samples
216 221
         report_params[["sample_key"]] <- independent_sample_id
217 222
         report_params[["dynamic_cols"]] <- list(
218
-          pcr_id = pcr_col,
219
-          pool = pool_col
223
+            pcr_id = pcr_col,
224
+            pool = pool_col
220 225
         )
221 226
         withCallingHandlers(
222 227
             {
... ...
@@ -248,7 +253,8 @@ remove_collisions <- function(x,
248 253
 #' Re-aligns matrices of other quantification types based on the processed
249 254
 #' sequence count matrix.
250 255
 #'
251
-#' \lifecycle{stable}
256
+#' @description
257
+#' `r lifecycle::badge("stable")`
252 258
 #' This function should be used to keep data consistent among the same analysis:
253 259
 #' if for some reason you removed the collisions by passing only the sequence
254 260
 #' count matrix to `remove_collisions()`, you should call this
... ...
@@ -264,10 +270,9 @@ remove_collisions <- function(x,
264 270
 #' @param other_matrices A named list of matrices to re-align. Names in the list
265 271
 #' must be quantification types (\code{quantification_types()}) except
266 272
 #' "seqCount".
267
-#' @importFrom dplyr semi_join
268
-#' @importFrom purrr map_lgl
269
-#' @importFrom magrittr `%>%`
270
-#' @family Collision removal
273
+#' @param sample_column The name of the column containing the sample identifier
274
+#'
275
+#' @family Data cleaning and pre-processing
271 276
 #' @seealso \code{\link{remove_collisions}}
272 277
 #'
273 278
 #' @return A named list with re-aligned matrices
... ...
@@ -291,8 +296,8 @@ remove_collisions <- function(x,
291 296
 #' )
292 297
 #' realigned
293 298
 realign_after_collisions <- function(sc_matrix,
294
-                                     other_matrices,
295
-                                     sample_column = pcr_id_column()) {
299
+    other_matrices,
300
+    sample_column = pcr_id_column()) {
296 301
     stopifnot(is.list(other_matrices) & !is.null(names(other_matrices)))
297 302
     stopifnot(all(names(other_matrices) %in% quantification_types()))
298 303
     stopifnot(is.character(sample_column))
... ...
@@ -301,8 +306,10 @@ realign_after_collisions <- function(sc_matrix,
301 306
     if (!all(all_ISm)) {
302 307
         rlang::abort(.non_ISM_error())
303 308
     }
304
-    all_campid <- purrr::map_lgl(other_matrices,
305
-                                 ~ sample_column %in% colnames(.x))
309
+    all_campid <- purrr::map_lgl(
310
+        other_matrices,
311
+        ~ sample_column %in% colnames(.x)