Browse code

[UPDATE] Partial update 1.5.4 - fixed all functions, increased package coverage, documentation still to fix

Giulia Pais authored on 15/04/2022 16:47:59
Showing73 changed files

... ...
@@ -11,6 +11,7 @@ export(annotation_issues)
11 11
 export(as_sparse_matrix)
12 12
 export(association_file_columns)
13 13
 export(available_outlier_tests)
14
+export(available_tags)
14 15
 export(blood_lineages_default)
15 16
 export(circos_genomic_density)
16 17
 export(clinical_relevant_suspicious_genes)
... ...
@@ -27,20 +28,25 @@ export(default_report_path)
27 28
 export(default_stats)
28 29
 export(generate_Vispa2_launch_AF)
29 30
 export(generate_blank_association_file)
31
+export(generate_default_folder_structure)
30 32
 export(import_Vispa2_stats)
31 33
 export(import_association_file)
32 34
 export(import_parallel_Vispa2Matrices)
33 35
 export(import_parallel_Vispa2Matrices_auto)
34 36
 export(import_parallel_Vispa2Matrices_interactive)
35 37
 export(import_single_Vispa2Matrix)
38
+export(inspect_tags)
36 39
 export(integration_alluvial_plot)
37 40
 export(is_sharing)
38 41
 export(iss_source)
42
+export(iss_stats_specs)
39 43
 export(known_clinical_oncogenes)
40 44
 export(mandatory_IS_vars)
41 45
 export(matching_options)
46
+export(matrix_file_suffixes)
42 47
 export(outlier_filter)
43 48
 export(outliers_by_pool_fragments)
49
+export(pcr_id_column)
44 50
 export(purity_filter)
45 51
 export(quantification_types)
46 52
 export(realign_after_collisions)
... ...
@@ -49,17 +55,22 @@ export(refGene_table_cols)
49 55
 export(remove_collisions)
50 56
 export(reset_af_columns_def)
51 57
 export(reset_annotation_IS_vars)
58
+export(reset_iss_stats_specs)
52 59
 export(reset_mandatory_IS_vars)
60
+export(reset_matrix_file_suffixes)
53 61
 export(sample_statistics)
54 62
 export(separate_quant_matrices)
55 63
 export(set_af_columns_def)
56 64
 export(set_annotation_IS_vars)
65
+export(set_iss_stats_specs)
57 66
 export(set_mandatory_IS_vars)
67
+export(set_matrix_file_suffixes)
58 68
 export(sharing_heatmap)
59 69
 export(sharing_venn)
60 70
 export(threshold_filter)
61 71
 export(top_abund_tableGrob)
62 72
 export(top_integrations)
73
+export(transform_columns)
63 74
 export(unzip_file_system)
64 75
 import(ggplot2)
65 76
 import(lifecycle)
... ...
@@ -71,6 +82,7 @@ importFrom(BiocParallel,bpstop)
71 82
 importFrom(BiocParallel,bptry)
72 83
 importFrom(Rcapture,closedp.0)
73 84
 importFrom(Rcapture,closedp.bc)
85
+importFrom(data.table,"%chin%")
74 86
 importFrom(data.table,.N)
75 87
 importFrom(data.table,.SD)
76 88
 importFrom(data.table,`%chin%`)
... ...
@@ -240,115 +240,51 @@ aggregate_values_by_key <- function(x,
240 240
     ),
241 241
     join_af_by = "CompleteAmplificationID") {
242 242
     stopifnot(is.data.frame(x) || is.list(x))
243
-    if (!is.data.frame(x)) {
244
-        purrr::walk(x, function(df) {
245
-            stopifnot(is.data.frame(df))
246
-            if (.check_mandatory_vars(df) == FALSE) {
247
-                rlang::abort(.missing_mand_vars())
248
-            }
249
-            if (!all(join_af_by %in% colnames(df))) {
250
-                rlang::abort(c(
251
-                    x = paste(
252
-                        "Missing common columns",
253
-                        "to join metadata"
254
-                    ),
255
-                    i = paste(
256
-                        "Missing: ",
257
-                        paste0(join_af_by[!join_af_by %in%
258
-                            colnames(df)],
259
-                        collapse = ", "
260
-                        )
261
-                    )
262
-                ))
263
-            }
264
-            if (!all(value_cols %in% colnames(df))) {
265
-                rlang::abort(.missing_user_cols_error(
266
-                    value_cols[!value_cols %in% colnames(df)]
267
-                ))
268
-            }
269
-            is_numeric_col <- purrr::map_lgl(value_cols, function(col) {
270
-                if (!is.double(df[[col]]) &&
271
-                    !is.integer(df[[col]])) {
272
-                    FALSE
273
-                } else {
274
-                    TRUE
275
-                }
276
-            }) %>% purrr::set_names(value_cols)
277
-            if (any(!is_numeric_col)) {
278
-                rlang::abort(.non_num_user_cols_error(
279
-                    names(is_numeric_col)[!is_numeric_col]
280
-                ))
281
-            }
282
-        })
283
-    } else {
284
-        if (.check_mandatory_vars(x) == FALSE) {
285
-            rlang::abort(.missing_mand_vars())
286
-        }
287
-        if (!all(join_af_by %in% colnames(x))) {
288
-            rlang::abort(c(
289
-                x = paste(
290
-                    "Missing common columns",
291
-                    "to join metadata"
292
-                ),
293
-                i = paste(
294
-                    "Missing: ",
295
-                    paste0(join_af_by[!join_af_by %in%
296
-                        colnames(x)],
297
-                    collapse = ", "
298
-                    )
299
-                )
300
-            ))
301
-        }
302
-        if (!all(value_cols %in% colnames(x))) {
303
-            rlang::abort(.missing_user_cols_error(
304
-                value_cols[!value_cols %in% colnames(x)]
305
-            ))
306
-        }
307
-        is_numeric_col <- purrr::map_lgl(value_cols, function(col) {
308
-            if (!is.double(x[[col]]) &&
309
-                !is.integer(x[[col]])) {
310
-                FALSE
311
-            } else {
312
-                TRUE
313
-            }
314
-        }) %>% purrr::set_names(value_cols)
315
-        if (any(!is_numeric_col)) {
316
-            rlang::abort(.non_num_user_cols_error(
317
-                names(is_numeric_col)[!is_numeric_col]
318
-            ))
319
-        }
320
-    }
321
-    # Check association file
322
-    stopifnot(is.data.frame(association_file))
323
-    # Check key
243
+    stopifnot(is.character(value_cols))
324 244
     stopifnot(is.character(key))
325
-    if (!all(key %in% colnames(association_file))) {
326
-        rlang::abort(c(x = "Key fields are missing from association file"))
245
+    stopifnot(is.null(group) || is.character(group))
246
+    stopifnot(is.character(join_af_by))
247
+    stopifnot(is.data.frame(association_file))
248
+    data.table::setDT(association_file)
249
+    stopifnot(is.list(lambda) && !is.null(names(lambda)))
250
+    join_key_err <- c("Join key not present in in both data frames",
251
+                      x = paste("Fields specified in the argument",
252
+                                "`join_af_by` must appear in both",
253
+                                "the association file and the matrix(es)"))
254
+    check_single_matrix <- function(df) {
255
+      stopifnot(is.data.frame(df))
256
+      is_numeric_col <- purrr::map_lgl(
257
+        value_cols,
258
+        ~ is.numeric(df[[.x]]) ||
259
+          is.double(df[[.x]]) ||
260
+          is.integer(df[[.x]])
261
+      ) %>% purrr::set_names(value_cols)
262
+      if (any(!is_numeric_col)) {
263
+        rlang::abort(.non_num_user_cols_error(
264
+          names(is_numeric_col)[!is_numeric_col]
265
+        ))
266
+      }
267
+      if (!all(join_af_by %in% colnames(df))) {
268
+        rlang::abort(join_key_err, class = "join_key_err_agg")
269
+      }
327 270
     }
328
-    # Check lambda
329
-    stopifnot(is.list(lambda))
330
-    # Check group
331
-    stopifnot(is.character(group) | is.null(group))
332
-    if (is.data.frame(x)) {
333
-        if (!all(group %in% c(colnames(association_file), colnames(x)))) {
334
-            rlang::abort(paste("Grouping variables not found"))
335
-        }
271
+    if (!is.data.frame(x)) {
272
+      purrr::walk(x, check_single_matrix)
336 273
     } else {
337
-        purrr::walk(x, function(df) {
338
-            if (!all(group %in% c(colnames(association_file), colnames(df)))) {
339
-                rlang::abort(paste("Grouping variables not found"))
340
-            }
341
-        })
274
+      check_single_matrix(x)
275
+    }
276
+    join_key_in_af  <- all(join_af_by %in% colnames(association_file))
277
+    if (!join_key_in_af) {
278
+      rlang::abort(join_key_err, class = "join_key_err_agg")
342 279
     }
343
-    if (is.data.frame(x)) {
344
-        x <- list(x)
345
-        agg_matrix <- .aggregate_lambda(
280
+    agg_matrix <- if (is.data.frame(x)) {
281
+        .aggregate_lambda(
346 282
             x, association_file, key, value_cols, lambda, group, join_af_by
347 283
         )
348
-        return(agg_matrix[[1]])
284
+    } else {
285
+      agg_matrix <- purrr::map(x, ~ .aggregate_lambda(
286
+        .x, association_file, key, value_cols, lambda, group, join_af_by
287
+      ))
349 288
     }
350
-    agg_matrix <- .aggregate_lambda(
351
-        x, association_file, key, value_cols, lambda, group, join_af_by
352
-    )
353
-    agg_matrix
289
+    return(agg_matrix)
354 290
 }
... ...
@@ -59,24 +59,20 @@ compute_abundance <- function(x,
59 59
     if (.check_mandatory_vars(x) == FALSE) {
60 60
         rlang::abort(.missing_mand_vars())
61 61
     }
62
-    stopifnot(is.logical(percentage) & length(percentage) == 1)
63
-    if (!all(columns %in% colnames(x)) | !all(key %in% colnames(x))) {
62
+    stopifnot(is.logical(percentage))
63
+    percentage <- percentage[1]
64
+    if (!all(c(columns, key) %in% colnames(x))) {
64 65
         missing_cols <- c(
65 66
             columns[!columns %in% colnames(x)],
66 67
             key[!key %in% colnames(x)]
67 68
         )
68 69
         rlang::abort(.missing_user_cols_error(missing_cols))
69 70
     }
70
-    non_num_cols <- purrr::map_lgl(columns, function(col) {
71
-        expr <- rlang::expr(`$`(x, !!col))
72
-        if (is.numeric(rlang::eval_tidy(expr)) |
73
-            is.integer(rlang::eval_tidy(expr))) {
74
-            return(FALSE)
75
-        } else {
76
-            return(TRUE)
77
-        }
78
-    })
79
-    if (any(non_num_cols)) {
71
+    non_num_cols <- purrr::map_lgl(
72
+        columns,
73
+        ~ is.numeric(x[[.x]]) || is.integer(x[[.x]])
74
+    )
75
+    if (any(!non_num_cols)) {
80 76
         rlang::abort(.non_num_user_cols_error(columns[non_num_cols]))
81 77
     }
82 78
     stopifnot(is.logical(keep_totals) || keep_totals == "df")
... ...
@@ -131,182 +127,6 @@ compute_abundance <- function(x,
131 127
     }
132 128
 }
133 129
 
134
-
135
-#' obtain a single integration matrix from individual quantification
136
-#' matrices.
137
-#'
138
-#' \lifecycle{stable}
139
-#' Takes a list of integration matrices referring to different quantification
140
-#' types and merges them in a single data frame that has multiple
141
-#' value columns, each renamed according to their quantification type
142
-#' of reference.
143
-#'
144
-#' @param x A named list of integration matrices, ideally obtained via
145
-#' \link{import_parallel_Vispa2Matrices_interactive} or
146
-#' \link{import_parallel_Vispa2Matrices_auto}. Names must be
147
-#' quantification types.
148
-#' @param fragmentEstimate The name of the output column for fragment
149
-#' estimate values
150
-#' @param seqCount The name of the output column for sequence
151
-#' count values
152
-#' @param barcodeCount The name of the output column for barcode count
153
-#' values
154
-#' @param cellCount The name of the output column for cell count values
155
-#' @param ShsCount The name of the output column for Shs count values
156
-#'
157
-#' @importFrom purrr walk map2 reduce
158
-#' @importFrom dplyr rename full_join intersect
159
-#' @importFrom magrittr `%>%`
160
-#' @importFrom rlang .data `:=`
161
-#'
162
-#' @family Analysis functions
163
-#'
164
-#' @seealso \link{quantification_types}
165
-#'
166
-#' @return A tibble
167
-#' @export
168
-#'
169
-#' @examples
170
-#' fs_path <- system.file("extdata", "fs.zip", package = "ISAnalytics")
171
-#' fs <- unzip_file_system(fs_path, "fs")
172
-#' af_path <- system.file("extdata", "asso.file.tsv.gz",
173
-#'     package = "ISAnalytics"
174
-#' )
175
-#' af <- import_association_file(af_path,
176
-#'     root = fs,
177
-#'     import_iss = FALSE,
178
-#'     report_path = NULL
179
-#' )
180
-#' matrices <- import_parallel_Vispa2Matrices(af,
181
-#'     c("seqCount", "fragmentEstimate"),
182
-#'     mode = "AUTO", report_path = NULL, multi_quant_matrix = FALSE
183
-#' )
184
-#' multi_quant <- comparison_matrix(matrices)
185
-#' head(multi_quant)
186
-comparison_matrix <- function(x,
187
-    fragmentEstimate = "fragmentEstimate",
188
-    seqCount = "seqCount",
189
-    barcodeCount = "barcodeCount",
190
-    cellCount = "cellCount",
191
-    ShsCount = "ShsCount") {
192
-    stopifnot(is.list(x) & !is.data.frame(x))
193
-    stopifnot(all(names(x) %in% quantification_types()))
194
-    stopifnot(is.character(fragmentEstimate) & length(fragmentEstimate) == 1)
195
-    stopifnot(is.character(seqCount) & length(seqCount) == 1)
196
-    stopifnot(is.character(barcodeCount) & length(barcodeCount) == 1)
197
-    stopifnot(is.character(cellCount) & length(cellCount) == 1)
198
-    stopifnot(is.character(ShsCount) & length(ShsCount) == 1)
199
-    param_names <- c(
200
-        fragmentEstimate = fragmentEstimate,
201
-        seqCount = seqCount, barcodeCount = barcodeCount,
202
-        cellCount = cellCount, ShsCount = ShsCount
203
-    )
204
-    x <- purrr::map2(x, names(x), function(matrix, quant_type) {
205
-        quant_name <- param_names[names(param_names) %in% quant_type]
206
-        matrix %>% dplyr::rename(!!quant_name := .data$Value)
207
-    })
208
-    result <- purrr::reduce(x, function(matrix1, matrix2) {
209
-        commoncols <- dplyr::intersect(colnames(matrix1), colnames(matrix2))
210
-        matrix1 %>%
211
-            dplyr::full_join(matrix2, by = commoncols)
212
-    })
213
-    na_introduced <- purrr::map_lgl(param_names, function(p) {
214
-        any(is.na(result[[p]]))
215
-    })
216
-    if (any(na_introduced) & getOption("ISAnalytics.verbose") == TRUE) {
217
-        rlang::inform(.nas_introduced_msg())
218
-    }
219
-    result
220
-}
221
-
222
-
223
-#' Separate a multiple-quantification matrix into single quantification
224
-#' matrices.
225
-#'
226
-#' \lifecycle{stable}
227
-#' The function separates a single multi-quantification integration
228
-#' matrix, obtained via \link{comparison_matrix}, into single
229
-#' quantification matrices as a named list of tibbles.
230
-#'
231
-#' @param x Single integration matrix with multiple quantification
232
-#' value columns, likely obtained via \link{comparison_matrix}.
233
-#' @param fragmentEstimate Name of the fragment estimate values column
234
-#' in input
235
-#' @param seqCount Name of the sequence count values column
236
-#' in input
237
-#' @param barcodeCount Name of the barcode count values column
238
-#' in input
239
-#' @param cellCount Name of the cell count values column
240
-#' in input
241
-#' @param ShsCount Name of the shs count values column
242
-#' in input
243
-#' @param key Key columns to perform the joining operation
244
-#'
245
-#' @importFrom purrr is_empty map set_names
246
-#' @importFrom dplyr rename
247
-#' @importFrom magrittr `%>%`
248
-#'
249
-#' @family Analysis functions
250
-#'
251
-#' @return A named list of tibbles, where names are quantification types
252
-#' @seealso \link{quantification_types}
253
-#' @export
254
-#'
255
-#' @examples
256
-#' data("integration_matrices", package = "ISAnalytics")
257
-#' separated <- separate_quant_matrices(
258
-#'     integration_matrices
259
-#' )
260
-#' separated
261
-separate_quant_matrices <- function(x,
262
-    fragmentEstimate = "fragmentEstimate",
263
-    seqCount = "seqCount",
264
-    barcodeCount = "barcodeCount",
265
-    cellCount = "cellCount",
266
-    ShsCount = "ShsCount",
267
-    key = c(
268
-        mandatory_IS_vars(),
269
-        annotation_IS_vars(),
270
-        "CompleteAmplificationID"
271
-    )) {
272
-    stopifnot(is.data.frame(x))
273
-    if (!all(key %in% colnames(x))) {
274
-        rlang::abort(.missing_user_cols_error(key[!key %in% colnames(x)]))
275
-    }
276
-    num_cols <- .find_exp_cols(x, key)
277
-    if (purrr::is_empty(num_cols)) {
278
-        rlang::abort(.missing_num_cols_error())
279
-    }
280
-    stopifnot(is.character(fragmentEstimate) & length(fragmentEstimate) == 1)
281
-    stopifnot(is.character(seqCount) & length(seqCount) == 1)
282
-    stopifnot(is.character(barcodeCount) & length(barcodeCount) == 1)
283
-    stopifnot(is.character(cellCount) & length(cellCount) == 1)
284
-    stopifnot(is.character(ShsCount) & length(ShsCount) == 1)
285
-    param_col <- c(
286
-        fragmentEstimate = fragmentEstimate,
287
-        seqCount = seqCount, barcodeCount = barcodeCount,
288
-        cellCount = cellCount,
289
-        ShsCount = ShsCount
290
-    )
291
-    to_copy <- if (any(!num_cols %in% param_col)) {
292
-        if (all(!num_cols %in% param_col)) {
293
-            rlang::abort(.non_quant_cols_error())
294
-        }
295
-        num_cols[!num_cols %in% param_col]
296
-    }
297
-    num_cols <- param_col[param_col %in% num_cols]
298
-    if (!purrr::is_empty(to_copy) & getOption("ISAnalytics.verbose") == TRUE) {
299
-        rlang::inform(.non_quant_cols_msg(to_copy))
300
-    }
301
-    separated <- purrr::map(num_cols, function(quant) {
302
-        x %>%
303
-            dplyr::select(dplyr::all_of(c(key, to_copy, quant))) %>%
304
-            dplyr::rename(Value = quant)
305
-    }) %>% purrr::set_names(names(num_cols))
306
-    separated
307
-}
308
-
309
-
310 130
 #' Filter data frames with custom predicates
311 131
 #'
312 132
 #' @description
... ...
@@ -468,7 +288,6 @@ threshold_filter <- function(x,
468 288
     return(.tf_list(x, threshold, cols_to_compare, comparators))
469 289
 }
470 290
 
471
-
472 291
 #' Sorts and keeps the top n integration sites based on the values
473 292
 #' in a given column.
474 293
 #'
... ...
@@ -525,6 +344,7 @@ threshold_filter <- function(x,
525 344
 #'     keep = "Value2",
526 345
 #'     key = "CompleteAmplificationID"
527 346
 #' )
347
+#top_abundant_is
528 348
 top_integrations <- function(x,
529 349
     n = 20,
530 350
     columns = "fragmentEstimate_sum_RelAbundance",
... ...
@@ -588,6 +408,244 @@ top_integrations <- function(x,
588 408
 }
589 409
 
590 410
 
411
+top_targeted_genes <- function(x,
412
+    n = 20,
413
+    key = c(
414
+        "SubjectID", "CellMarker",
415
+        "Tissue", "TimePoint"
416
+    ),
417
+    consider_chr = TRUE,
418
+    consider_gene_strand = TRUE,
419
+    as_df = TRUE) {
420
+  stopifnot(is.data.frame(x))
421
+  data.table::setDT(x)
422
+  stopifnot(is.numeric(n) || is.integer(n))
423
+  stopifnot(is.null(key) || is.character(key))
424
+  stopifnot(is.logical(consider_chr))
425
+  stopifnot(is.logical(consider_gene_strand))
426
+
427
+  required_annot_tags <- c("gene_symbol")
428
+  if (consider_gene_strand) {
429
+    required_annot_tags <- c(required_annot_tags, "gene_strand")
430
+  }
431
+  annot_tag_cols <- .check_required_cols(required_annot_tags,
432
+                                         annotation_IS_vars(TRUE),
433
+                                         "error")
434
+  if (consider_chr) {
435
+    chr_tag_col <- .check_required_cols(c("chromosome", "locus"),
436
+                                        mandatory_IS_vars(TRUE),
437
+                                        "error")
438
+    annot_tag_cols <- annot_tag_cols %>%
439
+      dplyr::bind_rows(chr_tag_col)
440
+  }
441
+  data.table::setDT(annot_tag_cols)
442
+  cols_to_check <- c(annot_tag_cols$names, key)
443
+  if (!all(cols_to_check %in% colnames(x))) {
444
+    rlang::abort(.missing_needed_cols(
445
+      cols_to_check[!cols_to_check %in% colnames(x)]))
446
+  }
447
+
448
+  df_with_is_counts <- if (is.null(key)) {
449
+    .count_distinct_is_per_gene(
450
+      x = x, include_chr = consider_chr,
451
+      include_gene_strand = consider_gene_strand,
452
+      gene_sym_col = annot_tag_cols[
453
+        eval(sym("tag")) == "gene_symbol"][["names"]],
454
+      gene_strand_col = annot_tag_cols[
455
+        eval(sym("tag")) == "gene_strand"][["names"]],
456
+      chr_col = annot_tag_cols[eval(sym("tag")) == "chromosome"][["names"]],
457
+      mand_vars_to_check = mandatory_IS_vars(TRUE)
458
+    ) %>%
459
+      dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
460
+      dplyr::slice_head(n = n)
461
+  } else {
462
+    tmp <- x[, .count_distinct_is_per_gene(
463
+      x = .SD, include_chr = consider_chr,
464
+      include_gene_strand = consider_gene_strand,
465
+      gene_sym_col = annot_tag_cols[
466
+        eval(sym("tag")) == "gene_symbol"][["names"]],
467
+      gene_strand_col = annot_tag_cols[
468
+        eval(sym("tag")) == "gene_strand"][["names"]],
469
+      chr_col = annot_tag_cols[eval(sym("tag")) == "chromosome"][["names"]],
470
+      mand_vars_to_check = mandatory_IS_vars(TRUE)
471
+    ), by = eval(key)]
472
+    tmp[,
473
+        .SD %>% dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
474
+          dplyr::slice_head(n = n),
475
+        by = eval(key)]
476
+  }
477
+  if (as_df) {
478
+    return(df_with_is_counts)
479
+  }
480
+  return(split(df_with_is_counts, by = key))
481
+}
482
+
483
+
484
+gene_frequency_fisher <- function(cis_x,
485
+    cis_y,
486
+    min_is_per_gene = 3,
487
+    gene_set_method = c("intersection", "union"),
488
+    onco_db_file = "proto_oncogenes",
489
+    tumor_suppressors_db_file = "tumor_suppressors",
490
+    species = "human",
491
+    known_onco = known_clinical_oncogenes(),
492
+    suspicious_genes =
493
+        clinical_relevant_suspicious_genes(),
494
+    significance_threshold = 0.05,
495
+    remove_unbalanced_0 = TRUE) {
496
+    ## --- Input checks
497
+    stopifnot(is.data.frame(cis_x) && is.data.frame(cis_y))
498
+    stopifnot(is.integer(min_is_per_gene) || is.numeric(min_is_per_gene))
499
+    gene_set_method <- rlang::arg_match(gene_set_method)
500
+    stopifnot(is.character(onco_db_file))
501
+    stopifnot(is.character(tumor_suppressors_db_file))
502
+    stopifnot(is.character(species))
503
+    stopifnot(is.data.frame(known_onco))
504
+    stopifnot(is.data.frame(suspicious_genes))
505
+    stopifnot(is.numeric(significance_threshold))
506
+    stopifnot(is.logical(remove_unbalanced_0))
507
+    ## -- Fetch gene symbol column
508
+    gene_sym_col <- .check_required_cols(
509
+      "gene_symbol", annotation_IS_vars(TRUE), duplicate_politic = "error"
510
+    )[["names"]]
511
+    req_cis_cols <- c(gene_sym_col, "n_IS_perGene", "average_TxLen",
512
+                      "raw_gene_integration_frequency")
513
+    quiet_expand <- purrr::quietly(.expand_cis_df)
514
+    cols_for_join <- c(gene_sym_col,
515
+                     "Onco1_TS2", "ClinicalRelevance", "DOIReference",
516
+                     "KnownGeneClass", "KnownClonalExpansion",
517
+                     "CriticalForInsMut")
518
+    ## --- Calculations to perform on each df
519
+    append_calc <- function(df, group_n) {
520
+      if (!all(req_cis_cols %in% colnames(df))) {
521
+        rlang::abort(
522
+          .missing_needed_cols(req_cis_cols[!req_cis_cols %in% colnames(df)]))
523
+      }
524
+      modified <- quiet_expand(df, gene_sym_col,
525
+                                 onco_db_file, tumor_suppressors_db_file,
526
+                                 species, known_onco, suspicious_genes)$result
527
+      modified <- modified %>%
528
+        dplyr::mutate(
529
+          IS_per_kbGeneLen = .data$raw_gene_integration_frequency * 1000,
530
+          Sum_IS_per_kbGeneLen = sum(.data$IS_per_kbGeneLen, na.rm = TRUE),
531
+          IS_per_kbGeneLen_perMDepth_TPM = (.data$IS_per_kbGeneLen /
532
+                                              .data$Sum_IS_per_kbGeneLen) * 1e6
533
+        ) %>%
534
+        dplyr::filter(.data$n_IS_perGene >= min_is_per_gene) %>%
535
+        dplyr::select(dplyr::all_of(c(req_cis_cols, cols_for_join,
536
+                                      "IS_per_kbGeneLen",
537
+                                      "Sum_IS_per_kbGeneLen",
538
+                                      "IS_per_kbGeneLen_perMDepth_TPM")))
539
+      colnames(modified)[!colnames(modified) %in% cols_for_join] <- paste(
540
+        colnames(modified)[!colnames(modified) %in% cols_for_join], group_n,
541
+        sep = "_"
542
+      )
543
+      return(modified)
544
+    }
545
+    cis_mod <- purrr::map2(list(cis_x, cis_y), c(1,2), append_calc)
546
+    ## --- Merge the two in 1 df
547
+    merged <- if (gene_set_method == "union") {
548
+      purrr::reduce(cis_mod, ~ dplyr::full_join(.x, .y, by = cols_for_join))
549
+    } else {
550
+      purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join))
551
+    }
552
+    if (nrow(merged) == 0) {
553
+      if (getOption("ISAnalytics.verbose") == TRUE) {
554
+        msg <- c("Data frame empty after filtering",
555
+                 i = paste("Data frame is empty after applying filter on IS,",
556
+                           "is your filter too stringent?"),
557
+                 x = "Nothing to return")
558
+        rlang::inform(msg, class = "empty_df_gene_freq")
559
+      }
560
+      return(NULL)
561
+    }
562
+    ## --- Actual computation of fisher test: test is applied on each row
563
+    ## (each gene)
564
+    merged <- merged %>%
565
+      dplyr::mutate(
566
+        tot_n_IS_perGene_1 = sum(cis_x$n_IS_perGene, na.rm = TRUE),
567
+        tot_n_IS_perGene_2 = sum(cis_y$n_IS_perGene, na.rm = TRUE)
568
+      )
569
+    compute_fisher <- function(...) {
570
+      row <- list(...)
571
+      n_IS_perGene_1 <- row$n_IS_perGene_1
572
+      n_IS_perGene_2 <- row$n_IS_perGene_2
573
+      n_IS_perGene_1[which(is.na(n_IS_perGene_1))] <- 0
574
+      n_IS_perGene_2[which(is.na(n_IS_perGene_2))] <- 0
575
+      matrix <- matrix(
576
+        data = c(n_IS_perGene_1,
577
+                 row$tot_n_IS_perGene_1 - n_IS_perGene_1,
578
+                 n_IS_perGene_2,
579
+                 row$tot_n_IS_perGene_2 - n_IS_perGene_2),
580
+        nrow = 2,
581
+        dimnames = list(G1 = c("IS_of_gene", "TotalIS"),
582
+                        G2 = c("IS_of_gene", "TotalIS"))
583
+      )
584
+      ft <- stats::fisher.test(matrix)
585
+      return(ft$p.value)
586
+    }
587
+    merged <- merged %>%
588
+      dplyr::mutate(
589
+        Fisher_p_value = purrr::pmap_dbl(., compute_fisher)
590
+      ) %>%
591
+      dplyr::mutate(
592
+        Fisher_p_value_significant = dplyr::if_else(
593
+          condition = .data$Fisher_p_value < significance_threshold,
594
+          true = TRUE, false = FALSE
595
+        )
596
+      )
597
+    ## --- Removing unbalanced 0s if requested - this scenario applies
598
+    ## only if "union" is selected as method for join
599
+    if (remove_unbalanced_0) {
600
+      mean_is_per_gene_1 <- ceiling(mean(merged$n_IS_perGene_1, na.rm = TRUE))
601
+      mean_is_per_gene_2 <- ceiling(mean(merged$n_IS_perGene_2, na.rm = TRUE))
602
+      test_exclude <- function(...) {
603
+        row <- list(...)
604
+        if (is.na(row$n_IS_perGene_1) || is.na(row$n_IS_perGene_2)) {
605
+          to_ex <- ifelse(
606
+              test = ((row$n_IS_perGene_1 < mean_is_per_gene_1) &
607
+                (is.na(row$n_IS_perGene_2))) |
608
+                ((is.na(row$n_IS_perGene_1)) &
609
+                   (row$n_IS_perGene_2 < mean_is_per_gene_2)),
610
+              yes = TRUE,
611
+              no = FALSE
612
+          )
613
+          return(to_ex)
614
+        }
615
+        return(FALSE)
616
+      }
617
+      merged <- merged %>%
618
+        dplyr::mutate(
619
+          to_exclude_from_test = purrr::pmap(., test_exclude)
620
+        ) %>%
621
+        dplyr::filter(.data$to_exclude_from_test == FALSE) %>%
622
+        dplyr::select(-.data$to_exclude_from_test)
623
+      if (nrow(merged) == 0) {
624
+        if (getOption("ISAnalytics.verbose") == TRUE) {
625
+          msg <- c("Data frame empty after filtering",
626
+                   i = paste("Data frame is after removing unbalanced IS,",
627
+                             "nothing to return"))
628
+          rlang::inform(msg, class = "empty_df_gene_freq_unbal")
629
+        }
630
+        return(NULL)
631
+      }
632
+    }
633
+    ## --- Apply statistical corrections to p-value
634
+    merged <- merged %>%
635
+      dplyr::mutate(
636
+        Fisher_p_value_fdr = p.adjust(.data$Fisher_p_value, method = "fdr",
637
+                                      n = length(.data$Fisher_p_value)),
638
+        Fisher_p_value_benjamini = p.adjust(.data$Fisher_p_value, method = "BY",
639
+                                            n = length(.data$Fisher_p_value)),
640
+        minus_log10_pvalue = -log(.data$Fisher_p_value, base = 10)
641
+      ) %>%
642
+      dplyr::mutate(
643
+        minus_log10_pvalue_fdr = -log(.data$Fisher_p_value_fdr, base = 10),
644
+      )
645
+    return(merged)
646
+}
647
+
648
+
591 649
 #' Computes user specified functions on numerical columns and updates
592 650
 #' the metadata data frame accordingly.
593 651
 #'
... ...
@@ -649,7 +707,8 @@ top_integrations <- function(x,
649 707
 #'     value_columns = c("seqCount", "fragmentEstimate")
650 708
 #' )
651 709
 #' stats
652
-sample_statistics <- function(x, metadata,
710
+sample_statistics <- function(x,
711
+    metadata,
653 712
     sample_key = "CompleteAmplificationID",
654 713
     value_columns = "Value",
655 714
     functions = default_stats(),
... ...
@@ -676,7 +735,9 @@ sample_statistics <- function(x, metadata,
676 735
             is.integer(rlang::eval_tidy(expr))
677 736
     })
678 737
     if (any(vcols_are_numeric == FALSE)) {
679
-        rlang::abort(.non_num_user_cols_error(value_columns[!vcols_are_numeric]))
738
+        rlang::abort(.non_num_user_cols_error(
739
+            value_columns[!vcols_are_numeric]
740
+        ))
680 741
     }
681 742
     purrr::walk(functions, function(f) {
682 743
         if (!(purrr::is_function(f) | purrr::is_formula(f))) {
... ...
@@ -746,9 +807,14 @@ sample_statistics <- function(x, metadata,
746 807
 #'
747 808
 #' @details
748 809
 #' ## Genomic annotation file
749
-#' This file is a data base, or more simply a .tsv file to import, with
750
-#' genes annotation for the specific genome. The annotations for the
751
-#' human genome (hg19) and murine genome (mm9) are already
810
+#' A data frame containing
811
+#' genes annotation for the specific genome.
812
+#' From version `1.5.4` the argument `genomic_annotation_file` accepts only
813
+#' data frames or package provided defaults.
814
+#' The user is responsible for importing the appropriate tabular files if
815
+#' customization is needed.
816
+#' The annotations for the human genome (hg19) and
817
+#' murine genome (mm9) are already
752 818
 #' included in this package: to use one of them just
753 819
 #' set the argument `genomic_annotation_file` to either `"hg19"` or
754 820
 #' `"mm9"`.
... ...
@@ -791,68 +857,88 @@ CIS_grubbs <- function(x,
791 857
     genomic_annotation_file = "hg19",
792 858
     grubbs_flanking_gene_bp = 100000,
793 859
     threshold_alpha = 0.05,
794
-    by = NULL) {
795
-    # Check x has the correct structure
860
+    by = NULL,
861
+    return_missing_as_df = TRUE,
862
+    results_as_list = TRUE) {
863
+    ## Check x has the correct structure
796 864
     stopifnot(is.data.frame(x))
797
-    if (!all(mandatory_IS_vars() %in% colnames(x))) {
798
-        rlang::abort(.missing_mand_vars())
865
+    ## Check dyn vars for required tags
866
+    req_mand_vars <- .check_required_cols(c("chromosome", "locus"),
867
+        vars_df = mandatory_IS_vars(TRUE),
868
+        duplicate_politic = "error"
869
+    )
870
+    chrom_col <- req_mand_vars %>%
871
+        dplyr::filter(.data$tag == "chromosome") %>%
872
+        dplyr::pull(.data$names)
873
+    locus_col <- req_mand_vars %>%
874
+        dplyr::filter(.data$tag == "locus") %>%
875
+        dplyr::pull(.data$names)
876
+    strand_col <- if ("is_strand" %in% mandatory_IS_vars(TRUE)$tag) {
877
+        col <- mandatory_IS_vars(TRUE) %>%
878
+            dplyr::filter(.data$tag == "is_strand") %>%
879
+            dplyr::pull(.data$names)
880
+        if (col %in% colnames(x)) {
881
+            col
882
+        } else {
883
+            NULL
884
+        }
885
+    } else {
886
+        NULL
799 887
     }
800
-    if (!.is_annotated(x)) {
801
-        rlang::abort(.missing_annot())
888
+    req_annot_col <- .check_required_cols(
889
+        list(gene_symbol = "char", gene_strand = "char"),
890
+        vars_df = annotation_IS_vars(TRUE),
891
+        "error"
892
+    )
893
+    gene_symbol_col <- req_annot_col %>%
894
+        dplyr::filter(.data$tag == "gene_symbol") %>%
895
+        dplyr::pull(.data$names)
896
+    gene_strand_col <- req_annot_col %>%
897
+        dplyr::filter(.data$tag == "gene_strand") %>%
898
+        dplyr::pull(.data$names)
899
+    cols_required <- c(chrom_col, locus_col, gene_symbol_col, gene_strand_col)
900
+    if (!all(cols_required %in% colnames(x))) {
901
+        rlang::abort(.missing_user_cols_error(
902
+            cols_required[!cols_required %in% colnames(x)]
903
+        ))
802 904
     }
803 905
     # Check other parameters
804
-    stopifnot(is.character(genomic_annotation_file))
906
+    stopifnot(is.data.frame(genomic_annotation_file) ||
907
+        is.character(genomic_annotation_file))
805 908
     genomic_annotation_file <- genomic_annotation_file[1]
806
-    if (genomic_annotation_file %in% c("hg19", "mm9")) {
909
+    if (is.character(genomic_annotation_file) &&
910
+        !genomic_annotation_file %in% c("hg19", "mm9")) {
911
+        err_msg <- c("Genomic annotation file unknown",
912
+            x = paste(
913
+                "Since ISAnalytics 1.5.4, if provided as",
914
+                "character vector, `genomic_annotation_file`",
915
+                "parameter must be one of 'hg19' or 'mm9'"
916
+            ),
917
+            i = paste(
918
+                "For using other genome reference files",
919
+                "import them in the R environment and pass",
920
+                "them to the function"
921
+            )
922
+        )
923
+        rlang::abort(err_msg, class = "genomic_file_char")
924
+    }
925
+    if (is.character(genomic_annotation_file)) {
807 926
         gen_file <- paste0("refGenes_", genomic_annotation_file)
808 927
         utils::data(list = gen_file, envir = rlang::current_env())
809 928
         refgenes <- rlang::eval_tidy(rlang::sym(gen_file))
810 929
     } else {
811
-        stopifnot(file.exists(genomic_annotation_file))
812
-        # Determine file extension
813
-        ext <- .check_file_extension(genomic_annotation_file)
814
-        # Try to import annotation file
815
-        if (ext == "tsv") {
816
-            refgenes <- utils::read.csv(
817
-                file = genomic_annotation_file,
818
-                header = TRUE, fill = TRUE, sep = "\t",
819
-                check.names = FALSE,
820
-                na.strings = c("NONE", "NA", "NULL", "NaN", "")
821
-            )
822
-            # Check annotation file format
823
-            if (!all(refGene_table_cols() %in% colnames(refgenes))) {
824
-                rlang::abort(.non_standard_annotation_structure())
825
-            }
826
-            refgenes <- tibble::as_tibble(refgenes) %>%
827
-                dplyr::mutate(chrom = stringr::str_replace_all(
828
-                    .data$chrom,
829
-                    "chr", ""
830
-                ))
831
-        } else if (ext == "csv") {
832
-            refgenes <- utils::read.csv(
833
-                file = genomic_annotation_file,
834
-                header = TRUE, fill = TRUE,
835
-                check.names = FALSE,
836
-                na.strings = c("NONE", "NA", "NULL", "NaN", "")
837
-            )
838
-            # Check annotation file format
839
-            if (!all(refGene_table_cols() %in% colnames(refgenes))) {
840
-                rlang::abort(.non_standard_annotation_structure())
841
-            }
842
-            refgenes <- tibble::as_tibble(refgenes) %>%
843
-                dplyr::mutate(chrom = stringr::str_replace_all(
844
-                    .data$chrom,
845
-                    "chr", ""
846
-                ))
847
-        } else {
848
-            gen_file_err <- paste(
849
-                "The genomic annotation file must be either in",
850
-                ".tsv or .csv format (compressed or not)"
851
-            )
852
-            rlang::abort(gen_file_err)
930
+        # Check annotation file format
931
+        refgenes <- genomic_annotation_file
932
+        if (!all(refGene_table_cols() %in% colnames(refgenes))) {
933
+            rlang::abort(.non_standard_annotation_structure())
853 934
         }
935
+        refgenes <- tibble::as_tibble(refgenes) %>%
936
+            dplyr::mutate(chrom = stringr::str_replace_all(
937
+                .data$chrom,
938
+                "chr", ""
939
+            ))
854 940
     }
855
-    stopifnot(is.numeric(grubbs_flanking_gene_bp) |
941
+    stopifnot(is.numeric(grubbs_flanking_gene_bp) ||
856 942
         is.integer(grubbs_flanking_gene_bp))
857 943
     grubbs_flanking_gene_bp <- grubbs_flanking_gene_bp[1]
858 944
     stopifnot(is.numeric(threshold_alpha))
... ...
@@ -861,13 +947,20 @@ CIS_grubbs <- function(x,
861 947
     if (!all(by %in% colnames(x))) {
862 948
         rlang::abort(.missing_user_cols_error(by[!by %in% colnames(x)]))
863 949
     }
864
-    result <- if (is.null(by)) {
865
-        .cis_grubb_calc(
950
+    stopifnot(is.logical(return_missing_as_df))
951
+    return_missing_as_df <- return_missing_as_df[1]
952
+    if (is.null(by)) {
953
+        result <- .cis_grubb_calc(
866 954
             x = x,
867 955
             refgenes = refgenes,
868 956
             grubbs_flanking_gene_bp = grubbs_flanking_gene_bp,
869
-            threshold_alpha = threshold_alpha
957
+            threshold_alpha = threshold_alpha,
958
+            gene_symbol_col = gene_symbol_col,
959
+            gene_strand_col = gene_strand_col,
960
+            chr_col = chrom_col, locus_col = locus_col,
961
+            strand_col = strand_col
870 962
         )
963
+        missing_df <- result$missing
871 964
     } else {
872 965
         grouped <- x %>%
873 966
             dplyr::group_by(dplyr::across(dplyr::all_of(by)))
... ...
@@ -879,14 +972,60 @@ CIS_grubbs <- function(x,
879 972
             dplyr::group_by(dplyr::across(dplyr::all_of(by))) %>%
880 973
             dplyr::group_split() %>%
881 974
             purrr::set_names(group_keys)
882
-        purrr::map(split, ~ .cis_grubb_calc(
975
+        result <- purrr::map(split, ~ .cis_grubb_calc(
883 976
             x = .x,
884 977
             refgenes = refgenes,
885 978
             grubbs_flanking_gene_bp = grubbs_flanking_gene_bp,
886
-            threshold_alpha = threshold_alpha
979
+            threshold_alpha = threshold_alpha,
980
+            gene_symbol_col = gene_symbol_col,
981
+            gene_strand_col = gene_strand_col,
982
+            chr_col = chrom_col, locus_col = locus_col,
983
+            strand_col = strand_col
887 984
         ))
985
+        missing_df <- purrr::map(result, ~ .x$missing) %>%
986
+            purrr::reduce(dplyr::bind_rows) %>%
987
+            dplyr::distinct()
888 988
     }
889
-    return(result)
989
+    if (nrow(missing_df) > 0 & getOption("ISAnalytics.verbose") == TRUE) {
990
+        warn_miss <- c("Warning: missing genes in refgenes table",
991
+            i = paste(paste(
992
+                "A total of", nrow(missing_df),
993
+                "genes",
994
+                "were found in the input data but not",
995
+                "in the refgene table. This may be caused by",
996
+                "a mismatch in the annotation phase of",
997
+                "the matrix. Here is a summary: "
998
+            ),
999
+            paste0(capture.output({
1000
+                print(missing_df, n = Inf)
1001
+            }), collapse = "\n"),
1002
+            sep = "\n"
1003
+            ),
1004
+            i = paste(
1005
+                "NOTE: missing genes will be removed from",
1006
+                "the final output! Review results carefully"
1007
+            )
1008
+        )
1009
+        rlang::inform(warn_miss, class = "warn_miss_genes")
1010
+    }
1011
+    if (!is.null(by)) {
1012
+        result <- if (results_as_list) {
1013
+            purrr::map(result, ~ .x$df)
1014
+        } else {
1015
+            purrr::map2(result, names(result), ~ {
1016
+                .x$df %>%
1017
+                    dplyr::mutate(group = .y)
1018
+            }) %>% purrr::reduce(dplyr::bind_rows)
1019
+        }
1020
+        if (return_missing_as_df) {
1021
+            return(list(cis = result, missing_genes = missing_df))
1022
+        }
1023
+        return(result)
1024
+    }
1025
+    if (return_missing_as_df) {
1026
+        return(list(cis = result$df, missing_genes = missing_df))
1027
+    }
1028
+    return(result$df)
890 1029
 }
891 1030
 
892 1031
 #' Integrations cumulative count in time by sample
... ...
@@ -970,115 +1109,19 @@ cumulative_count_union <- function(x,
970 1109
     zero = "0000",
971 1110
     aggregate = FALSE,
972 1111
     ...) {
973
-    stopifnot(is.data.frame(x))
974
-    stopifnot(is.data.frame(association_file) | is.null(association_file))
975
-    stopifnot(is.character(timepoint_column) & length(timepoint_column) == 1)
976
-    stopifnot(is.character(key))
977
-    stopifnot(is.logical(include_tp_zero))
978
-    stopifnot(is.character(zero) & length(zero) == 1)
979
-    stopifnot(is.logical(aggregate))
980
-    if (aggregate == TRUE & is.null(association_file)) {
981
-        rlang::abort(.agg_with_null_meta_err())
982
-    }
983
-    if (!all(timepoint_column %in% key)) {
984
-        rlang::abort(.key_without_tp_err())
985
-    }
986
-    if (aggregate == FALSE) {
987
-        if (!all(key %in% colnames(x))) {
988
-            rlang::abort(.key_not_found())
989
-        }
990
-    } else {
991
-        x <- aggregate_values_by_key(
992
-            x = x,
993
-            association_file = association_file,
994
-            key = key, ...
995
-        )
996
-        if (is.numeric(association_file[[timepoint_column]]) |
997
-            is.integer(association_file[[timepoint_column]])) {
998
-            max <- max(association_file[[timepoint_column]])
999
-            digits <- floor(log10(x)) + 1
1000
-            association_file <- association_file %>%
1001
-                dplyr::mutate(
1002
-                    {{ timepoint_column }} := stringr::str_pad(
1003
-                        as.character(.data$TimePoint),
1004
-                        digits,
1005
-                        side = "left",
1006
-                        pad = "0"
1007
-                    )
1008
-                )
1009
-            zero <- paste0(rep_len("0", digits), collapse = "")
1010
-        }
1011
-    }
1012
-    if (include_tp_zero == FALSE) {
1013
-        x <- x %>%
1014
-            dplyr::filter(dplyr::across(
1015
-                dplyr::all_of(timepoint_column),
1016
-                ~ .x != zero
1017
-            ))
1018
-        if (nrow(x) == 0) {
1019
-            all_tp0_msg <- paste(
1020
-                "All time points zeros were excluded, the data",
1021
-                "frame is empty."
1022
-            )
1023
-            rlang::inform(all_tp0_msg)
1024
-            return(x)
1025
-        }
1026
-    }
1027
-    annot <- if (.is_annotated(x)) {
1028
-        annotation_IS_vars()
1029
-    } else {
1030
-        character(0)
1031
-    }
1032
-    x <- x %>% dplyr::select(dplyr::all_of(c(
1033
-        mandatory_IS_vars(),
1034
-        annot,
1035
-        key
1036
-    )))
1037
-    cols_for_join <- colnames(x)[!colnames(x) %in% timepoint_column]
1038
-    key_minus_tp <- key[!key %in% timepoint_column]
1039
-    distinct_tp_for_each <- x %>%
1040
-        dplyr::arrange(dplyr::across(dplyr::all_of(timepoint_column))) %>%
1041
-        dplyr::group_by(dplyr::across(dplyr::all_of(key_minus_tp))) %>%
1042
-        dplyr::summarise(
1043
-            Distinct_tp = unique(
1044
-                `$`(.data, !!timepoint_column)
1045
-            ),
1046
-            .groups = "drop"
1047
-        ) %>%
1048
-        dplyr::rename({{ timepoint_column }} := "Distinct_tp")
1049
-
1050
-    splitted <- x %>%
1051
-        dplyr::arrange(dplyr::across(dplyr::all_of(timepoint_column))) %>%
1052
-        dplyr::group_by(dplyr::across(dplyr::all_of(timepoint_column))) %>%
1053
-        dplyr::group_split()
1054
-
1055
-    mult_tp <- purrr::reduce(splitted, dplyr::full_join, by = cols_for_join)
1056
-    tp_indexes <- grep(timepoint_column, colnames(mult_tp))
1057
-    tp_indexes <- tp_indexes[-1]
1058
-    res <- if (!purrr::is_empty(tp_indexes)) {
1059
-        for (i in tp_indexes) {
1060
-            mod_col <- mult_tp[[i]]
1061
-            mod_col_prec <- mult_tp[[i - 1]]
1062
-            val <- dplyr::first(stats::na.omit(mod_col))
1063
-            mod_col[!is.na(mod_col_prec)] <- val
1064
-            mult_tp[i] <- mod_col
1065
-        }
1066
-        mult_tp %>%
1067
-            tidyr::pivot_longer(
1068
-                cols = dplyr::starts_with(timepoint_column),
1069
-                values_to = timepoint_column,
1070
-                values_drop_na = TRUE
1071
-            ) %>%
1072
-            dplyr::select(-c("name")) %>%
1073
-            dplyr::distinct()
1074
-    } else {
1075
-        mult_tp
1076
-    }
1077
-    res <- res %>% dplyr::semi_join(distinct_tp_for_each, by = key)
1078
-    res <- res %>%
1079
-        dplyr::group_by(dplyr::across(dplyr::all_of(key))) %>%
1080
-        dplyr::summarise(count = dplyr::n(), .groups = "drop")
1081
-    return(res)
1112
+    lifecycle::deprecate_warn(
1113
+        when = "1.5.4",
1114
+        what = "cumulative_count_union()",
1115
+        with = "cumulative_is()",
1116
+        details = c(paste(
1117
+            "Use option `counts = TRUE`. Function will be likely dropped in the",
1118
+            "next release cycle"
1119
+        ))
1120
+    )
1121
+    cumulative_is(
1122
+        x = x, timepoint_col = timepoint_column, key = key,
1123
+        include_tp_zero = include_tp_zero, counts = TRUE
1124
+    )
1082 1125
 }
1083 1126
 
1084 1127
 #' Expands integration matrix with the cumulative is union over time.
... ...
@@ -1125,14 +1168,17 @@ cumulative_is <- function(x,
1125 1168
     ),
1126 1169
     timepoint_col = "TimePoint",
1127 1170
     include_tp_zero = FALSE,
1128
-    keep_og_is = TRUE,
1129
-    expand = FALSE) {
1171
+    counts = TRUE,
1172
+    keep_og_is = FALSE,
1173
+    expand = TRUE) {
1130 1174
     stopifnot(is.data.frame(x))
1131 1175
     stopifnot(is.character(key))
1132 1176
     stopifnot(is.character(timepoint_col))
1133 1177
     timepoint_col <- timepoint_col[1]
1134 1178
     stopifnot(is.logical(include_tp_zero))
1135 1179
     include_tp_zero <- include_tp_zero[1]
1180
+    stopifnot(is.logical(counts))
1181
+    counts <- counts[1]
1136 1182
     stopifnot(is.logical(keep_og_is))
1137 1183
     stopifnot(is.logical(expand))
1138 1184
     if (!timepoint_col %in% key) {
... ...
@@ -1153,7 +1199,7 @@ cumulative_is <- function(x,
1153 1199
         temp <- temp %>%
1154 1200
             dplyr::filter(.data[[timepoint_col]] != 0)
1155 1201
         if (nrow(temp) == 0) {
1156
-            rlang::inform(.only_zero_tp())
1202
+            rlang::inform(.only_zero_tp(), class = "only_zero_tps")
1157 1203
             return(NULL)
1158 1204
         }
1159 1205
     }
... ...
@@ -1163,11 +1209,11 @@ cumulative_is <- function(x,
1163 1209
         dplyr::distinct(dplyr::across(dplyr::all_of(is_vars)),
1164 1210
             .keep_all = TRUE
1165 1211
         )
1166
-    temp <- data.table::setDT(temp)
1212
+    data.table::setDT(temp)
1167 1213
     temp <- temp[, .(is = list(.SD)), by = key]
1168 1214
     no_tp_key <- key[key != timepoint_col]
1169
-    splitted <- split(temp, by = no_tp_key)
1170
-    cumulate <- purrr::map(splitted, function(x) {
1215
+    split <- split(temp, by = no_tp_key)
1216
+    cumulate <- purrr::map(split, function(x) {
1171 1217
         x[, cumulative_is := purrr::accumulate(
1172 1218
             is,
1173 1219
             ~ data.table::funion(.x, .y)
... ...
@@ -1177,13 +1223,25 @@ cumulative_is <- function(x,
1177 1223
     if (!keep_og_is) {
1178 1224
         cumulate[, is := NULL]
1179 1225
     }
1226
+    counts_df <- if (counts) {
1227
+        cumulate[, list(is_n_cumulative = unlist(purrr::map(
1228
+            get("cumulative_is"), nrow
1229
+        ))), by = key]
1230
+    } else {
1231
+        NULL
1232
+    }
1180 1233
     if (expand) {
1181 1234
         cumulate <- tidyr::unnest(cumulate,
1182 1235
             cols = "cumulative_is"
1183 1236
         )
1184
-        cumulate <- data.table::setDT(cumulate)
1237
+        data.table::setDT(cumulate)
1238
+    }
1239
+    to_return <- if (counts) {
1240
+        list(coordinates = cumulate, counts = counts_df)
1241
+    } else {
1242
+        cumulate
1185 1243
     }
1186
-    cumulate
1244
+    return(to_return)
1187 1245
 }
1188 1246
 
1189 1247
 #' Sharing of integration sites between given groups.
... ...
@@ -1462,270 +1520,6 @@ is_sharing <- function(...,
1462 1520
 }
1463 1521
 
1464 1522
 
1465
-#' Filter integration sites based on purity.
1466
-#'
1467
-#' @description
1468
-#' \lifecycle{experimental}
1469
-#' Filter that targets possible contamination between cell lines based on
1470
-#' a numeric quantification (likely abundance or sequence count).
1471
-#'
1472
-#' @details
1473
-#' ## Setting input arguments
1474
-#'
1475
-#' The input matrix can be re-aggregated with the provided `group_key`
1476
-#' argument. This key contains the names of the columns to group on
1477
-#' (besides the columns holding genomic coordinates of the integration
1478
-#' sites) and must be contained in at least one of `x` or `lineages`
1479
-#' data frames. If the key is not found only in `x`, then a join operation
1480
-#' with the `lineages` data frame is performed on the common column(s)
1481
-#' `join_on`.
1482
-#'
1483
-#' ## Group selection
1484
-#' It is possible for the user to specify on which groups the logic of the
1485
-#' filter should be applied to. For example: if we have
1486
-#' `group_key = c("HematoLineage")` and we set
1487
-#' `selected_groups = c("CD34", "Myeloid","Lymphoid")`
1488
-#' it means that a single integration will be evaluated for the filter only
1489
-#' for groups that have the values of "CD34", "Myeloid" and "Lymphoid" in
1490
-#' the "HematoLineage" column.
1491
-#' If the same integration is present in other groups it is
1492
-#' kept as it is. `selected_groups` can be set to `NULL` if we want
1493
-#' the logic to apply to every group present in the data frame,
1494
-#' it can be set as a simple character vector as the example above if
1495
-#' the group key has length 1 (and there is no need to filter on time point).
1496
-#' If the group key is longer than 1 then the filter is applied only on the
1497
-#' first element of the key.
1498
-#'
1499
-#' If a more refined selection on groups is needed, a data frame can
1500
-#' be provided instead:
1501
-#'
1502
-#' ```
1503
-#' group_key = c("CellMarker", "Tissue")
1504
-#' selected_groups = tibble::tribble(
1505
-#' ~ CellMarker, ~ Tissue,
1506
-#' "CD34", "BM",
1507
-#' "CD14", "BM",
1508
-#' "CD14", "PB"
1509
-#' )
1510
-#' ```
1511
-#'
1512
-#' Columns in the data frame should be the same as group key (plus,
1513
-#' eventually, the time point column). In this example only those groups
1514
-#' identified by the rows in the provided data frame are processed.
1515
-#'
1516
-#' @family Analysis functions
1517
-#'
1518
-#' @param x An aggregated integration matrix, obtained via
1519
-#' `aggregate_values_by_key()`
1520
-#' @param lineages A data frame containing cell lineages information
1521
-#' @param aggregation_key The key used for aggregating `x`
1522
-#' @param group_key A character vector of column names for re-aggregation.
1523
-#' Column names must be either in `x` or in `lineages`. See details.
1524
-#' @param selected_groups Either NULL, a character vector or a
1525
-#' data frame for group selection. See details.
1526
-#' @param join_on Common columns to perform a join operation on
1527
-#' @param min_value A minimum value to filter the input matrix. Integrations
1528
-#' with a value strictly lower than `min_value` are excluded (dropped) from
1529
-#' the output.
1530
-#' @param impurity_threshold The ratio threshold for impurity in groups
1531
-#' @param by_timepoint Should filtering be applied on each time point? If
1532
-#' `FALSE`, all time points are merged together
1533
-#' @param timepoint_column Column in `x` containing the time point
1534
-#' @param value_column Column in `x` containing the numeric
1535
-#' quantification of interest
1536
-#'
1537
-#' @return A data frame
1538
-#' @export
1539
-#'
1540
-#' @examples
1541
-#' data("integration_matrices", package = "ISAnalytics")
1542
-#' data("association_file", package = "ISAnalytics")
1543
-#' aggreg <- aggregate_values_by_key(
1544
-#'     x = integration_matrices,
1545
-#'     association_file = association_file,
1546
-#'     value_cols = c("seqCount", "fragmentEstimate")
1547
-#' )
1548
-#' filtered_by_purity <- purity_filter(
1549
-#'     x = aggreg,
1550
-#'     value_column = "seqCount_sum"
1551
-#' )
1552
-#' head(filtered_by_purity)
1553
-purity_filter <- function(x,
1554
-    lineages = blood_lineages_default(),
1555
-    aggregation_key = c(
1556
-        "SubjectID", "CellMarker",
1557
-        "Tissue", "TimePoint"
1558
-    ),
1559
-    group_key = c("CellMarker", "Tissue"),
1560
-    selected_groups = NULL,
1561
-    join_on = "CellMarker",
1562
-    min_value = 3,
1563
-    impurity_threshold = 10,
1564
-    by_timepoint = TRUE,
1565
-    timepoint_column = "TimePoint",
1566
-    value_column = "seqCount_sum") {
1567
-    ## Checks
1568
-    #### - Base
1569
-    stopifnot(is.data.frame(x))
1570
-    stopifnot(is.character(aggregation_key))
1571
-    stopifnot(is.character(group_key))
1572
-    stopifnot(is.logical(by_timepoint))
1573
-    stopifnot(is.numeric(min_value) || is.integer(min_value))
1574
-    stopifnot(is.numeric(impurity_threshold) || is.integer(impurity_threshold))
1575
-    stopifnot(is.character(value_column))
1576
-    stopifnot(is.null(selected_groups) || is.character(selected_groups) ||
1577
-        is.data.frame(selected_groups))
1578
-    #### - Keys
1579
-    if (!all(aggregation_key %in% colnames(x))) {
1580
-        rlang::abort(.missing_user_cols_error(
1581
-            aggregation_key[!aggregation_key %in% colnames(x)]
1582
-        ))
1583
-    }
1584
-    if (!value_column[1] %in% colnames(x)) {
1585
-        rlang::abort(.missing_user_cols_error(value_column))
1586
-    }
1587
-    to_join <- if (all(group_key %in% colnames(x))) {
1588
-        ### If the groups rely only on attributes not related to lineages
1589
-        ### and are contained in the input matrix
1590
-        FALSE
1591
-    } else {
1592
-        ### If lineages info is needed
1593
-        stopifnot(is.data.frame(lineages))
1594
-        if (!all(group_key %in% unique(c(colnames(x), colnames(lineages))))) {
1595
-            missing_cols <- group_key[!group_key %in% unique(c(
1596
-                colnames(x),
1597
-                colnames(lineages)
1598
-            ))]
1599
-            rlang::abort(.missing_user_cols_error(missing_cols))
1600
-        }
1601
-        if (!(all(join_on %in% colnames(x)) &
1602
-            all(join_on %in% colnames(lineages)))
1603
-        ) {
1604
-            missing_common <- c("Missing common column(s) to join on",
1605
-                i = paste(
1606
-                    "The column(s) provided in argument",
1607
-                    "`join_on` is missing from one or",
1608
-                    "both data frames, aborting"
1609
-                )
1610
-            )
1611
-            rlang::abort(missing_common)
1612
-        }
1613
-        TRUE
1614
-    }
1615
-    if (by_timepoint) {
1616
-        stopifnot(is.character(timepoint_column))
1617
-        timepoint_column <- timepoint_column[1]
1618
-        if (!timepoint_column %in% colnames(x)) {
1619
-            rlang::abort(.missing_user_cols_error(timepoint_column))
1620
-        }
1621
-        group_key <- union(group_key, timepoint_column)
1622
-    }
1623
-    ## Pre-processing
1624
-    #### - Join if needed
1625
-    if (to_join) {
1626
-        x <- x %>%
1627
-            dplyr::left_join(lineages, by = join_on)
1628
-    }
1629
-    #### - Group and sum
1630
-    is_vars <- if (.is_annotated(x)) {
1631
-        c(mandatory_IS_vars(), annotation_IS_vars())
1632
-    } else {
1633
-        mandatory_IS_vars()
1634
-    }
1635
-    grouped <- x %>%
1636
-        dplyr::group_by(dplyr::across(dplyr::all_of(c(is_vars, group_key)))) %>%
1637
-        dplyr::summarise(
1638
-            Value = sum(.data[[value_column]]),
1639
-            .groups = "drop"
1640
-        )
1641
-    #### - value filter
1642
-    filtered_value <- threshold_filter(
1643
-        x = grouped,
1644
-        threshold = min_value,
1645
-        cols_to_compare = "Value",
1646
-        comparators = ">="
1647
-    )
1648
-    #### - Separating IS 1: group filtering
1649
-    pre_filt <- list()
1650
-    if (is.null(selected_groups) || purrr::is_empty(selected_groups)) {
1651
-        pre_filt[["process"]] <- filtered_value
1652
-        pre_filt[["keep"]] <- filtered_value[0, ]
1653
-    } else if (is.character(selected_groups)) {
1654
-        pre_filt[["process"]] <- filtered_value %>%
1655
-            dplyr::filter(.data[[group_key[1]]] %in% selected_groups)
1656
-        pre_filt[["keep"]] <- filtered_value %>%
1657
-            dplyr::filter(!.data[[group_key[1]]] %in% selected_groups)
1658
-    } else {
1659
-        ok_cols <- colnames(selected_groups)[colnames(selected_groups) %in%
1660
-            group_key]
1661
-        selected_groups <- selected_groups %>%
1662
-            dplyr::select(dplyr::all_of(ok_cols)) %>%
1663
-            dplyr::distinct()
1664
-        if (ncol(selected_groups) == 0 ||
1665
-            nrow(selected_groups) == 0) {
1666
-            pre_filt[["process"]] <- filtered_value
1667
-            pre_filt[["keep"]] <- filtered_value[0, ]
1668
-        } else {
1669
-            pre_filt[["process"]] <- dplyr::inner_join(filtered_value,
1670
-                selected_groups,
1671
-                by = ok_cols
1672
-            )
1673
-            pre_filt[["keep"]] <- dplyr::anti_join(filtered_value,
1674
-                selected_groups,
1675
-                by = ok_cols
1676
-            )
1677
-        }
1678
-    }
1679
-    if (nrow(pre_filt$process) == 0) {
1680
-        if (getOption("ISAnalytics.verbose")) {
1681
-            rlang::inform("No iss to process, done")
1682
-        }
1683
-        return(filtered_value)
1684
-    }
1685
-    #### - Separating IS 2: iss that are shared between groups are going to be
1686
-    #### processed, unique iss are kept as they are
1687
-    vars_to_group <- if (by_timepoint) {
1688
-        c(is_vars, timepoint_column)
1689
-    } else {
1690
-        is_vars
1691
-    }
1692
-    by_is <- pre_filt$process %>%
1693
-        dplyr::group_by(dplyr::across(dplyr::all_of(vars_to_group))) %>%
1694
-        dplyr::summarise(n = n(), .groups = "drop")
1695
-    to_process <- by_is %>%
1696
-        dplyr::filter(.data$n > 1) %>%
1697
-        dplyr::select(-.data$n) %>%
1698
-        dplyr::inner_join(pre_filt$process, by = vars_to_group)
1699
-    if (nrow(to_process) == 0) {
1700
-        ## If there are no shared iss there is nothing to process,
1701
-        ## return just the filtered matrix
1702
-        return(filtered_value)
1703
-    }
1704
-    to_keep <- by_is %>%
1705
-        dplyr::filter(.data$n == 1) %>%
1706
-        dplyr::select(-.data$n) %>%
1707
-        dplyr::inner_join(pre_filt$process, by = vars_to_group)
1708
-    #### - Process groups
1709
-    .filter_by_purity <- function(group) {
1710
-        max_val <- max(group$Value)
1711
-        processed <- group %>%
1712
-            dplyr::mutate(remove = (max_val / .data$Value) >
1713
-                impurity_threshold) %>%
1714
-            dplyr::filter(remove == FALSE) %>%
1715
-            dplyr::select(-.data$remove)
1716
-        processed
1717
-    }
1718
-    processed_iss <- to_process %>%
1719
-        dplyr::group_by(dplyr::across(vars_to_group)) %>%
1720
-        dplyr::group_modify(~ .filter_by_purity(.x)) %>%
1721
-        dplyr::ungroup()
1722
-    #### - Re-compose matrix
1723
-    final <- processed_iss %>%
1724
-        dplyr::bind_rows(to_keep) %>%
1725
-        dplyr::bind_rows(pre_filt$keep)
1726
-    final
1727
-}
1728
-
1729 1523
 #' Find the source of IS by evaluating sharing.
1730 1524
 #'
1731 1525
 #' @description \lifecycle{experimental}
... ...
@@ -290,22 +290,27 @@ remove_collisions <- function(x,
290 290
 #'     other_matrices = list(fragmentEstimate = separated$fragmentEstimate)
291 291
 #' )
292 292
 #' realigned
293
-realign_after_collisions <- function(sc_matrix, other_matrices) {
293
+realign_after_collisions <- function(sc_matrix,
294
+                                     other_matrices,
295
+                                     sample_column = pcr_id_column()) {
294 296
     stopifnot(is.list(other_matrices) & !is.null(names(other_matrices)))
295 297
     stopifnot(all(names(other_matrices) %in% quantification_types()))
298
+    stopifnot(is.character(sample_column))
299
+    sample_column <- sample_column[1]
296 300
     all_ISm <- purrr::map_lgl(other_matrices, .check_mandatory_vars)
297 301
     if (!all(all_ISm)) {
298 302
         rlang::abort(.non_ISM_error())
299 303
     }
300
-    all_campid <- purrr::map_lgl(other_matrices, .check_sample_col)
304
+    all_campid <- purrr::map_lgl(other_matrices,
305
+                                 ~ sample_column %in% colnames(.x))
301 306
     if (!all(all_campid)) {
302
-        rlang::abort(.missing_complAmpID_error())
307
+        rlang::abort(.missing_needed_cols(sample_column))
303 308
     }
304 309
     realigned <- purrr::map(other_matrices, function(x) {
305 310
         x %>% dplyr::semi_join(sc_matrix,
306 311
             by = c(
307 312
                 mandatory_IS_vars(),
308
-                "CompleteAmplificationID"
313
+                sample_column
309 314
             )
310 315
         )
311 316
     })
... ...
@@ -27,12 +27,12 @@
27 27
   tibble::tribble(
28 28
     ~ names, ~ types, ~ transform, ~ flag, ~ tag,
29 29
     "ProjectID", "char", NULL, "required", "project_id",
30
-    "FUSIONID", "char", NULL, "optional", NA_character_,
30
+    "FUSIONID", "char", NULL, "optional", "fusion_id",
31 31
     "PoolID", "char", NULL, "required", "pool_id",
32 32
     "TagSequence", "char", NULL, "required", "tag_seq",
33 33
     "SubjectID", "char", NULL, "required", "subject",
34 34
     "VectorType", "char", NULL, "optional", NA_character_,
35
-    "VectorID", "char", NULL, "required", NA_character_,
35
+    "VectorID", "char", NULL, "required", "vector_id",
36 36
     "ExperimentID", "char", NULL, "optional", NA_character_,
37 37
     "Tissue", "char", NULL, "required", "tissue",
38 38
     "TimePoint", "char", ~ stringr::str_pad(.x, 4, side = "left", pad = "0"),
... ...
@@ -42,7 +42,7 @@
42 42
     "TagIDextended", "char", NULL, "optional", NA_character_,
43 43
     "Keywords","char", NULL, "optional", NA_character_,
44 44
     "CellMarker", "char", NULL, "required", "cell_marker",
45
-    "TagID", "char", NULL, "required", NA_character_,
45
+    "TagID", "char", NULL, "required", "tag_id",
46 46
     "NGSProvider", "char", NULL, "optional", NA_character_,
47 47
     "NGSTechnology", "char", NULL, "required", "ngs_tech",
48 48
     "ConverrtedFilesDir", "char", NULL, "optional", NA_character_,
... ...
@@ -266,11 +266,36 @@
266 266
 #' @examples
267 267
 #' reduced_AF_columns()
268 268
 reduced_AF_columns <- function() {
269
-    c(
270
-        "TagID", "Tissue", "SubjectID", "TimePoint", "FUSIONID",
271
-        "CompleteAmplificationID", "CellMarker", "ProjectID", "VectorID",
272
-        "PoolID"
273
-    )
269
+  required <- list(
270
+    tag_id = "char",
271
+    tissue = "char",
272
+    subject = "char",
273
+    tp_days = c("char", "numeric", "integer"),
274
+    fusion_id = "char",
275
+    pcr_repl_id = "char",
276
+    cell_marker = "char",
277
+    project_id = "char",
278
+    vector_id = "char",
279
+    pool_id = "char"
280
+  )
281
+  politics <- list(
282
+    tag_id = "error",
283
+    tissue = "error",
284
+    subject = "error",
285
+    tp_days = "first",
286
+    fusion_id = "error",
287
+    pcr_repl_id = "error",
288
+    cell_marker = "error",
289
+    project_id = "error",
290
+    vector_id = "error",
291
+    pool_id = "error"
292
+  )
293
+  tag_cols <- .check_required_cols(required_tags = required,
294
+                                   vars_df = association_file_columns(TRUE),
295
+                                   duplicate_politic = politics) %>%
296
+    dplyr::select(.data$names, .data$tag)
297
+  data.table::setDT(tag_cols)
298
+  return(tag_cols)
274 299
 }
275 300
 
276 301
 # Names of the columns of iss stats considered for aggregation
... ...
@@ -334,20 +359,107 @@ refGene_table_cols <- function() {
334 359
 }
335 360
 
336 361
 
337
-available_column_tags <- function() {
338
-    list(
339
-        critical = list(af = c(
340
-            "project_id", "pool_id", "tag_seq", "subject", "tissue",
341
-            "cell_marker", "pcr_replicate", "vispa_concatenate",
342
-            "pcr_repl_id", "proj_folder"
343
-        ),
344
-        matrix = c("chromosome", "locus", "is_strand", "gene_symbol"),
345
-        stats = c("vispa_concatenate", "tag_seq")),
346
-        optional = list(af = c(
347
-            "tp_days", "pcr_method", "ngs_tech", "dna_num",
348
-            "vcn", "genome"
349
-        ),
350
-        matrix = c("gene_strand"),
351
-        stats = c())
352
-    )
362
+#' Title
363
+#'
364
+#' @return
365
+#' @export
366
+#'
367
+#' @examples
368
+available_tags <- function() {
369
+  data.table::data.table(
370
+    tag = c("chromosome", "locus", "is_strand", "gene_symbol", "gene_strand",
371
+            "project_id", "fusion_id", "tag_seq", "subject", "vector_id",
372
+            "tissue", "tp_days", "pcr_method", "cell_marker", "tag_id",
373
+            "ngs_tech", "dna_num", "pcr_replicate", "vcn", "vispa_concatenate",
374
+            "pcr_repl_id", "proj_folder", "genome",
375
+            "vispa_concatenate", "tag_seq"),
376
+    needed_in = list(c("top_targeted_genes",
377
+                       "CIS_grubbs",
378
+                       "compute_near_integrations"),
379
+                     c("top_targeted_genes",
380
+                       "CIS_grubbs",
381
+                       "compute_near_integrations"),
382
+                     c("CIS_grubbs",
383
+                       "compute_near_integrations"),
384
+                     c("top_targeted_genes",
385
+                       "CIS_grubbs",
386
+                       "compute_near_integrations",
387
+                       "CIS_volcano_plot"),
388
+                     c("top_targeted_genes",
389
+                       "CIS_grubbs"),
390
+                     c("generate_default_folder_structure",
391
+                       "import_Vispa2_stats", "remove_collisions",
392
+                       "generate_Vispa2_launch_AF", "import_association_file",
393
+                       "import_parallel_Vispa2Matrices"),
394
+                     c("generate_Vispa2_launch_AF"),
395
+                     c("generate_default_folder_structure",
396
+                       "import_association_file", "import_Vispa2_stats"),
397
+                     c("import_association_file",
398
+                       "HSC_population_size_estimate"),
399
+                     c("generate_Vispa2_launch_AF"),
400
+                     c("generate_Vispa2_launch_AF", "import_association_file",
401
+                       "HSC_population_size_estimate"),
402
+                     c("generate_Vispa2_launch_AF", "import_association_file"),
403
+                     c(),
404
+                     c("generate_Vispa2_launch_AF", "import_association_file",
405
+                       "HSC_population_size_estimate"),
406
+                     c("generate_Vispa2_launch_AF"),
407
+                     c(),
408
+                     c(),
409
+                     c("import_association_file", "remove_collisions"),
410
+                     c(),
411
+                     c("import_association_file", "generate_Vispa2_launch_AF",
412
+                       "generate_default_folder_structure",
413
+                       "import_Vispa2_stats", "import_parallel_Vispa2Matrices"),
414
+                     c("pcr_id_column", "generate_Vispa2_launch_AF",
415
+                       "import_association_file", "import_Vispa2_stats"),
416
+                     c("import_association_file"),
417
+                     c(),
418
+                     c("import_association_file", "generate_Vispa2_launch_AF",
419
+                       "generate_default_folder_structure",
420
+                       "import_Vispa2_stats", "import_parallel_Vispa2Matrices"),
421
+                     c("generate_default_folder_structure",
422
+                       "import_association_file", "import_Vispa2_stats")
423
+                     ),
424
+    description = c(paste("Number of the chromosome"),
425
+                    paste("The locus at which the integration occurs"),
426
+                    paste("The DNA strand in which the integration occurs"),
427
+                    paste("The symbol of the gene"),
428
+                    paste("The strand of the gene"),
429
+                    paste("Unique identifier of a project"),
430
+                    paste("Identification code/number of the",
431
+                          "barcoded (SLiM-)PCR product included in the",
432
+                          "sequencing library"),
433
+                    paste("The barcode tag sequence"),
434
+                    paste("Unique identifier of a study subject",
435
+                          "(usually a patient)"),
436
+                    paste("Unique identifier of the vector used"),
437
+                    paste("The biological tissue the sample belongs to"),
438
+                    paste("The time point expressed in days"),
439
+                    paste("The PCR method used"),
440
+                    paste("Cell marker associated with isolated",
441
+                          "cells carrying the IS"),
442
+                    paste("Unique identifier of the barcode tag, as specified",
443
+                          "in VISPA2 requirements"),
444
+                    paste("Technology used for next generation sequencing"),
445
+                    paste("Identification code/number of the DNA extraction",
446
+                          "from a specific biological sample"),
447
+                    paste("Number of the PCR replicate"),
448
+                    paste("Vector copy number"),
449
+                    paste("Unique identifier of a pool as specified in VISPA2"),
450
+                    paste("Unique identifier of the pcr replicate, used as",
451
+                          "key to join data and metadata"),
452
+                    paste("Path on disk containing the standard VISPA2 folder",
453
+                          "structure of the project"),
454
+                    paste("The reference genome (e.g. “hg19”)"),
455
+                    paste("Unique identifier of a pool as specified in VISPA2"),
456
+                    paste("The barcode tag sequence")
457
+                    ),
458
+    dyn_vars_tbl = c("mand_vars", "mand_vars", "mand_vars",
459
+                     "annot_vars", "annot_vars",
460
+                     "af_vars", "af_vars", "af_vars", "af_vars", "af_vars",
461
+                     "af_vars", "af_vars", "af_vars", "af_vars", "af_vars",
462
+                     "af_vars", "af_vars", "af_vars", "af_vars", "af_vars",
463
+                     "af_vars", "af_vars", "af_vars", "iss_vars", "iss_vars")
464
+  )
353 465
 }
... ...
@@ -79,7 +79,7 @@ import_single_Vispa2Matrix <- function(path,
79 79
                                  "which allows a more refined tuning. See",
80 80
                                  "`?import_single_Vispa2Matrix` for details")
81 81
     if (lifecycle::is_present(to_exclude)) {
82
-      lifecycle::deprecate_warn(
82
+      lifecycle::deprecate_stop(
83 83
         when = "1.5.4",
84 84
         what = "import_single_Vispa2Matrix(to_exclude)",
85 85
         with = "import_single_Vispa2Matrix(additional_cols)",
... ...
@@ -88,7 +88,7 @@ import_single_Vispa2Matrix <- function(path,
88 88
       return(NULL)
89 89
     }
90 90
     if (lifecycle::is_present(keep_excluded)) {
91
-      lifecycle::deprecate_warn(
91
+      lifecycle::deprecate_stop(
92 92
         when = "1.5.4",
93 93
         what = "import_single_Vispa2Matrix(keep_excluded)",
94 94
         with = "import_single_Vispa2Matrix(additional_cols)",
... ...
@@ -359,9 +359,13 @@ import_association_file <- function(path,
359 359
     if (!is.null(transformations)) {
360 360
         as_file <- transform_columns(as_file, transformations)
361 361
     }
362
-
362
+    crit_tags <- c(
363
+      "project_id", "pool_id", "tag_seq", "subject", "tissue",
364
+      "cell_marker", "pcr_replicate", "vispa_concatenate",
365
+      "pcr_repl_id", "proj_folder"
366
+    )
363 367
     crit_colnames <- association_file_columns(TRUE) %>%
364
-        dplyr::filter(.data$tag %in% available_column_tags()$critical$af) %>%
368
+        dplyr::filter(.data$tag %in% crit_tags) %>%
365 369
         dplyr::pull(.data$names)
366 370
     crit_colnames <- colnames(as_file)[colnames(as_file) %in% crit_colnames]
367 371
     crit_nas <- if (length(crit_colnames) > 0) {
... ...
@@ -778,6 +782,7 @@ import_parallel_Vispa2Matrices <- function(association_file,
778 782
         dplyr::pull(.data$names)
779 783
     ### --- Interactive
780 784
     if (mode == "INTERACTIVE") {
785
+      matching_option <- NULL
781 786
         ## User selects projects to keep
782 787
         association_file <- .interactive_select_projects_import(
783 788
             association_file,
... ...
@@ -996,7 +1001,8 @@ annotation_issues <- function(matrix) {
996 1001
     find_probs <- function(m) {
997 1002
         needed <- c(mandatory_IS_vars(), annotation_IS_vars())
998 1003
         if (!all(needed %in% colnames(m))) {
999
-            rlang::inform(.missing_needed_cols(needed[!needed %in% colnames(m)]))
1004
+            rlang::inform(
1005
+              .missing_needed_cols(needed[!needed %in% colnames(m)]))
1000 1006
             return(NULL)
1001 1007
         }
1002 1008
         tmp <- m %>%
... ...
@@ -1005,7 +1011,8 @@ annotation_issues <- function(matrix) {
1005 1011
                 annotation_IS_vars()
1006 1012
             ))) %>%
1007 1013
             dplyr::distinct() %>%
1008
-            dplyr::group_by(dplyr::across(dplyr::all_of(mandatory_IS_vars()))) %>%
1014
+            dplyr::group_by(dplyr::across(
1015
+              dplyr::all_of(mandatory_IS_vars()))) %>%
1009 1016
             dplyr::summarise(distinct_genes = dplyr::n())
1010 1017
         if (any(tmp$distinct_genes > 1)) {
1011 1018
             tmp %>%
... ...
@@ -5,7 +5,8 @@
5 5
 
6 6
 #### ---- Internals for dynamic variables  ----####
7 7
 # Internal for setting mandatory or annotation is vars
8
-.new_IS_vars_checks <- function(specs, err) {
8
+# tbl_type must be in {mand_vars, annot_vars, af_vars, iss_vars}
9
+.new_IS_vars_checks <- function(specs, err, tbl_type) {
9 10
     new_vars <- if (is.data.frame(specs)) {
10 11
         # if data frame supplied
11 12
         colnames_ok <- all(c("names", "types", "transform", "flag", "tag")
... ...
@@ -46,7 +47,8 @@
46 47
         specs
47 48
     } else {
48 49
         # if vector supplied
49
-        if (is.null(names(specs)) || !all(specs %in% .types_mapping()[["types"]])) {
50
+        if (is.null(names(specs)) || !all(specs %in%
51
+                                          .types_mapping()[["types"]])) {
50 52
             rlang::abort(err)
51 53
         }
52 54
         purrr::map2_dfr(
... ...
@@ -60,6 +62,31 @@
60 62
             )
61 63
         )
62 64
     }
65
+    # -- check critical tags
66
+    if (!is.null(tbl_type)) {
67
+      available_tags <- available_tags()
68
+      to_check <- available_tags[eval(sym("dyn_vars_tbl")) == tbl_type &
69
+                                 !purrr::map_lgl(
70
+                                   eval(sym("needed_in")), ~ purrr::is_empty(.x)
71
+                                 ), ]
72
+      if (!all(to_check$tag %in% new_vars$tag)) {
73
+        missing_tags <- to_check$tag[!to_check$tag %in% new_vars$tag]
74
+        inspect_cmd <- paste0("inspect_tags(c(", paste0("'", missing_tags,
75
+                                                        "'", collapse = ","),
76
+                              "))")
77
+        warn <- c("Warning: important tags missing",
78
+                  i = paste("Some tags are required for proper execution",
79
+                            "of some functions. If these tags are not",
80
+                            "provided, execution of dependent functions",
81
+                            "might fail.",
82
+                            "Review your inputs carefully."),
83
+                  i = paste("Missing tags:", paste0(missing_tags,
84
+                                                    collapse = ", ")),
85
+                  i = paste0("To see where these are involved type `",
86
+                             inspect_cmd, "`"))
87
+        rlang::inform(warn, class = "missing_crit_tags")
88
+      }
89
+    }
63 90
     new_vars
64 91
 }
65 92
 
... ...
@@ -284,7 +311,7 @@
284 311
                 tag_list$vispa_concatenate
285 312
         ) %>%
286 313
         dplyr::distinct() %>%
287
-      dplyr::filter(!dplyr::if_all(iss_cols, is.na))
314
+      dplyr::filter(!dplyr::if_all(dplyr::all_of(iss_cols), is.na))
288 315
     # Separate by project
289 316
     stats_split <- iss_data %>%
290 317
         dplyr::group_by(dplyr::across(dplyr::all_of(tag_list$project_id)))
... ...
@@ -753,27 +780,6 @@
753 780
     return(res)
754 781
 }
755 782
 
756
-# Internal helper function for checking `Value` column presence in x.
757
-#
758
-# Checks if the column `Value` is present in the data frame and also
759
-# checks if the column is numeric or integer.
760
-# @param x A data.frame object (or any extending class)
761
-# @keywords internal
762
-#
763
-# @return FALSE if not found or contains non-numeric data, TRUE otherwise
764
-.check_value_col <- function(x) {
765
-    stopifnot(is.data.frame(x))
766
-    present <- if ("Value" %in% colnames(x)) {
767
-        TRUE
768
-    } else {
769
-        FALSE
770
-    }
771
-    if (present == TRUE) {
772
-        return(is.numeric(x$Value) | is.integer(x$Value))
773
-    } else {
774
-        return(FALSE)
775
-    }
776
-}
777 783
 
778 784
 # Internal helper function for checking that the pcr replicate
779 785
 # column presence in x.
... ...
@@ -1691,15 +1697,13 @@
1691 1697
         p <- BiocParallel::SnowParam(
1692 1698
             stop.on.error = FALSE,
1693 1699
             tasks = length(stats_paths$stats_files),
1694
-            progressbar = getOption("ISAnalytics.verbose"),
1695
-            exportglobals = TRUE
1700
+            progressbar = getOption("ISAnalytics.verbose")
1696 1701
         )
1697 1702
     } else {
1698 1703
         p <- BiocParallel::MulticoreParam(
1699 1704
             stop.on.error = FALSE,
1700 1705
             tasks = length(stats_paths$stats_files),
1701
-            progressbar = getOption("ISAnalytics.verbose"),
1702
-            exportglobals = FALSE
1706
+            progressbar = getOption("ISAnalytics.verbose")
1703 1707
         )
1704 1708
     }
1705 1709
     FUN <- function(x, req_cols) {
... ...
@@ -1715,6 +1719,7 @@
1715 1719
         if (ok == TRUE) {
1716 1720
           # Apply column transformations if present
1717 1721
           stats <- .apply_col_transform(stats, iss_stats_specs(TRUE))
1722
+          return(stats)
1718 1723
         } else {
1719 1724
             return("MALFORMED")
1720 1725
         }
... ...
@@ -3053,10 +3058,10 @@
3053 3058
   wanted_tags <- c(
3054 3059
     "subject", "tissue", "cell_marker", "tp_days"
3055 3060
   )
3056
-  wanted <- data.table::setDT(.check_required_cols(required_tags = wanted_tags,
3057
-                       vars_df = association_file_columns(TRUE),
3058
-                       duplicate_politic = "keep") %>%
3059
-    dplyr::filter(.data$names %in% colnames(association_file)))
3061
+  wanted <- association_file_columns(TRUE)
3062
+  data.table::setDT(wanted)
3063
+  wanted <- wanted[eval(sym("tag")) %in% wanted_tags &
3064
+                     eval(sym("names")) %in% colnames(association_file), ]
3060 3065
   wanted <- data.table::rbindlist(list(req_tag_cols, wanted))
3061 3066
   predicate <- purrr::map(mandatory_IS_vars(), ~ {
3062 3067
     rlang::expr(is.na(!!sym(.x)))
... ...
@@ -3659,36 +3664,24 @@
3659 3664
 
3660 3665
 # Internal function that performs aggregation on values with a lambda
3661 3666
 # operation.
3662
-#
3663
-# @param x The list of matrices to aggregate. If a single matrix has to be
3664
-# supplied it must be enclosed in a list. For example `x = list(matrix)`.
3665
-# @param af The association file
3666
-# @param key A string or character vector to use as key
3667
-# @param lambda The aggregating operation to apply to values. Must take as
3668
-# input a numeric/integer vector and return a single value
3669
-# @param group The additional variables to add to grouping
3670
-# @param args Additional arguments passed on to lambda (named list)
3671
-# @param namespace The namespace from which the lambda is exported
3672
-# @param envir The environment in which symbols must be evaluated
3673
-#' @importFrom rlang .data
3674
-# @keywords internal
3675
-#
3676
-# @return A list of tibbles with aggregated values
3667
+# - x is a single matrix
3677 3668
 .aggregate_lambda <- function(x, af, key, value_cols, lambda, group,
3678 3669
     join_af_by) {
3679
-    # Vectorize
3680
-    aggregated_matrices <- purrr::map(x, function(y) {
3681
-        cols <- c(colnames(y), key)
3682
-        agg <- y %>%
3683
-            dplyr::left_join(af, by = dplyr::all_of(join_af_by)) %>%
3684
-            dplyr::select(dplyr::all_of(cols)) %>%
3685
-            dplyr::group_by(dplyr::across(dplyr::all_of(c(group, key)))) %>%
3686
-            dplyr::summarise(dplyr::across(
3687
-                .cols = dplyr::all_of(value_cols),
3688
-                .fns = lambda
3689
-            ), .groups = "drop")
3690
-        agg
3691
-    })
3670
+    cols_to_check <- c(group, key, value_cols, join_af_by)
3671
+    joint <- x %>%
3672
+      dplyr::left_join(af, by = dplyr::all_of(join_af_by))
3673
+    if (any(!cols_to_check %in% colnames(joint))) {
3674
+      missing <- cols_to_check[!cols_to_check %in% colnames(joint)]
3675
+      rlang::abort(.missing_user_cols_error(missing))
3676
+    }
3677
+    cols <- c(colnames(x), key)
3678
+    aggregated_matrices <- joint %>%
3679
+      dplyr::select(dplyr::all_of(cols)) %>%
3680
+      dplyr::group_by(dplyr::across(dplyr::all_of(c(group, key)))) %>%
3681
+      dplyr::summarise(dplyr::across(
3682
+        .cols = dplyr::all_of(value_cols),
3683
+        .fns = lambda
3684
+      ), .groups = "drop")
3692 3685
     aggregated_matrices
3693 3686
 }
3694 3687
 
... ...
@@ -4383,90 +4376,122 @@
4383 4376
     return(filtered)
4384 4377
 }
4385 4378
 
4379
+#---- USED IN : top_targeted_genes ----
4380
+# Internal to count the number of distinct integration sites for each gene
4381
+# - x: input df, must include mand vars and annotation vars
4382
+# - include_chr: should the distinction be made by taking into account that
4383
+#   some genes span over different chromosomes? If TRUE keeps same
4384
+#   (gene_sym, gene_strand) but different chr separated (different rows)
4385
+# - include_gene_strand: same but for gene strand
4386
+#' @importFrom data.table %chin%
4387
+.count_distinct_is_per_gene <- function(x, include_chr,
4388
+                                        include_gene_strand,
4389
+                                        gene_sym_col,
4390
+                                        gene_strand_col,
4391
+                                        chr_col,
4392
+                                        mand_vars_to_check
4393
+                                        ) {
4394
+  data.table::setDT(mand_vars_to_check)
4395
+  data.table::setDT(x)
4396
+  present_mand_vars <- mand_vars_to_check[eval(sym("names")) %chin% colnames(x)]
4397
+  group_key <- c(gene_sym_col)
4398
+  if (include_gene_strand) {
4399
+    group_key <- c(group_key, gene_strand_col)
4400
+  }
4401
+  if (include_chr) {
4402
+    group_key <- c(group_key, chr_col)
4403
+    present_mand_vars <- present_mand_vars[!eval(sym("names")) %chin% chr_col]
4404
+  }
4405
+  is_vars <- present_mand_vars$names