Browse code

[UPDATE] Update to 1.5.4

Merge branch 'dynamicVarsFeature'

# Conflicts:
# DESCRIPTION

Giulia Pais authored on 20/04/2022 19:09:10
Showing129 changed files

... ...
@@ -11,3 +11,4 @@
11 11
 ^doc$
12 12
 ^Design$
13 13
 ^sample_reports$
14
+^man-roxygen$
... ...
@@ -22,8 +22,6 @@
22 22
 
23 23
 on:
24 24
   push:
25
-    branches:
26
-      - master
27 25
   pull_request:
28 26
 
29 27
 name: R-CMD-check-bioc
... ...
@@ -1,6 +1,6 @@
1 1
 Package: ISAnalytics
2 2
 Title: Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies
3
-Version: 1.5.3
3
+Version: 1.5.4
4 4
 Date: 2020-07-03
5 5
 Authors@R: c(
6 6
   person(given = "Andrea",
... ...
@@ -21,7 +21,7 @@ URL: https://calabrialab.github.io/ISAnalytics, https://github.com//calabrialab/
21 21
 BugReports: https://github.com/calabrialab/ISAnalytics/issues
22 22
 biocViews: BiomedicalInformatics, Sequencing, SingleCell
23 23
 Depends: 
24
-    R (>= 4.1),
24
+    R (>= 4.2),
25 25
     magrittr
26 26
 Imports: 
27 27
     utils,
... ...
@@ -44,9 +44,13 @@ Imports:
44 44
     readxl,
45 45
     tools,
46 46
     Rcapture,
47
-    grDevices, 
47
+    grDevices,
48
+    forcats,
49
+    glue,
48 50
     shiny,
49
-    zip
51
+    shinyWidgets,
52
+    datamods,
53
+    bslib
50 54
 Encoding: UTF-8
51 55
 LazyData: false
52 56
 Roxygen: list(markdown = TRUE)
... ...
@@ -73,9 +77,7 @@ Suggests:
73 77
     plotly,
74 78
     gtools,
75 79
     eulerr,
76
-    shinyWidgets,
77
-    datamods,
78
-    bslib
80
+    openxlsx
79 81
 VignetteBuilder: knitr
80 82
 RdMacros: 
81 83
     lifecycle
... ...
@@ -12,6 +12,7 @@ export(annotation_issues)
12 12
 export(as_sparse_matrix)
13 13
 export(association_file_columns)
14 14
 export(available_outlier_tests)
15
+export(available_tags)
15 16
 export(blood_lineages_default)
16 17
 export(circos_genomic_density)
17 18
 export(clinical_relevant_suspicious_genes)
... ...
@@ -20,41 +21,62 @@ export(compute_abundance)
20 21
 export(compute_near_integrations)
21 22
 export(cumulative_count_union)
22 23
 export(cumulative_is)
23
-export(date_columns_coll)
24 24
 export(date_formats)
25
+export(default_af_transform)
25 26
 export(default_iss_file_prefixes)
26 27
 export(default_meta_agg)
28
+export(default_rec_agg_lambdas)
27 29
 export(default_report_path)
28 30
 export(default_stats)
31
+export(fisher_scatterplot)
32
+export(gene_frequency_fisher)
29 33
 export(generate_Vispa2_launch_AF)
30 34
 export(generate_blank_association_file)
35
+export(generate_default_folder_structure)
31 36
 export(import_Vispa2_stats)
32 37
 export(import_association_file)
33 38
 export(import_parallel_Vispa2Matrices)
34 39
 export(import_parallel_Vispa2Matrices_auto)
35 40
 export(import_parallel_Vispa2Matrices_interactive)
36 41
 export(import_single_Vispa2Matrix)
42
+export(inspect_tags)
37 43
 export(integration_alluvial_plot)
38 44
 export(is_sharing)
39 45
 export(iss_source)
46
+export(iss_stats_specs)
40 47
 export(known_clinical_oncogenes)
41 48
 export(mandatory_IS_vars)
42 49
 export(matching_options)
50
+export(matrix_file_suffixes)
43 51
 export(outlier_filter)
44 52
 export(outliers_by_pool_fragments)
53
+export(pcr_id_column)
45 54
 export(purity_filter)
46 55
 export(quantification_types)
47 56
 export(realign_after_collisions)
48 57
 export(reduced_AF_columns)
49 58
 export(refGene_table_cols)
50 59
 export(remove_collisions)
60
+export(reset_af_columns_def)
61
+export(reset_annotation_IS_vars)
62
+export(reset_dyn_vars_config)
63
+export(reset_iss_stats_specs)
64
+export(reset_mandatory_IS_vars)
65
+export(reset_matrix_file_suffixes)
51 66
 export(sample_statistics)
52 67
 export(separate_quant_matrices)
68
+export(set_af_columns_def)
69
+export(set_annotation_IS_vars)
70
+export(set_iss_stats_specs)
71
+export(set_mandatory_IS_vars)
72
+export(set_matrix_file_suffixes)
53 73
 export(sharing_heatmap)
54 74
 export(sharing_venn)
55 75
 export(threshold_filter)
56 76
 export(top_abund_tableGrob)
57 77
 export(top_integrations)
78
+export(top_targeted_genes)
79
+export(transform_columns)
58 80
 export(unzip_file_system)
59 81
 import(ggplot2)
60 82
 import(lifecycle)
... ...
@@ -66,43 +88,33 @@ importFrom(BiocParallel,bpstop)
66 88
 importFrom(BiocParallel,bptry)
67 89
 importFrom(Rcapture,closedp.0)
68 90
 importFrom(Rcapture,closedp.bc)
91
+importFrom(data.table,"%chin%")
92
+importFrom(data.table,.N)
69 93
 importFrom(data.table,.SD)
70 94
 importFrom(data.table,`%chin%`)
71 95
 importFrom(data.table,fread)
72
-importFrom(data.table,melt.data.table)
73
-importFrom(data.table,rbindlist)
74 96
 importFrom(data.table,setDT)
75 97
 importFrom(data.table,setnames)
76 98
 importFrom(dplyr,across)
77 99
 importFrom(dplyr,all_of)
78
-importFrom(dplyr,anti_join)
79 100
 importFrom(dplyr,arrange)
80 101
 importFrom(dplyr,bind_cols)
81 102
 importFrom(dplyr,bind_rows)
82
-importFrom(dplyr,contains)
83
-importFrom(dplyr,cur_column)
84 103
 importFrom(dplyr,desc)
85 104
 importFrom(dplyr,distinct)
86 105
 importFrom(dplyr,everything)
87 106
 importFrom(dplyr,filter)
88
-importFrom(dplyr,first)
89
-importFrom(dplyr,full_join)
90 107
 importFrom(dplyr,group_by)
91 108
 importFrom(dplyr,group_keys)
92 109
 importFrom(dplyr,group_modify)
93 110
 importFrom(dplyr,group_split)
94
-importFrom(dplyr,if_else)
95
-importFrom(dplyr,inner_join)
96
-importFrom(dplyr,intersect)
97 111
 importFrom(dplyr,left_join)
98 112
 importFrom(dplyr,mutate)
99 113
 importFrom(dplyr,n)
100
-importFrom(dplyr,n_distinct)
101 114
 importFrom(dplyr,pull)
102 115
 importFrom(dplyr,rename)
103 116
 importFrom(dplyr,rename_with)
104 117
 importFrom(dplyr,select)
105
-importFrom(dplyr,semi_join)
106 118
 importFrom(dplyr,slice)
107 119
 importFrom(dplyr,slice_head)
108 120
 importFrom(dplyr,starts_with)
... ...
@@ -112,10 +124,8 @@ importFrom(fs,as_fs_path)
112 124
 importFrom(fs,dir_create)
113 125
 importFrom(fs,dir_exists)
114 126
 importFrom(fs,dir_ls)
115
-importFrom(fs,file_exists)
116 127
 importFrom(fs,is_dir)
117 128
 importFrom(fs,path)
118
-importFrom(fs,path_dir)
119 129
 importFrom(fs,path_ext)
120 130
 importFrom(fs,path_ext_set)
121 131
 importFrom(fs,path_home)
... ...
@@ -124,8 +134,6 @@ importFrom(ggplot2,aes)
124 134
 importFrom(ggplot2,aes_)
125 135
 importFrom(ggplot2,element_blank)
126 136
 importFrom(ggplot2,element_text)
127
-importFrom(ggplot2,geom_hline)
128
-importFrom(ggplot2,geom_point)
129 137
 importFrom(ggplot2,geom_raster)
130 138
 importFrom(ggplot2,geom_text)
131 139
 importFrom(ggplot2,ggplot)
... ...
@@ -135,14 +143,9 @@ importFrom(ggplot2,rel)
135 143
 importFrom(ggplot2,scale_alpha_continuous)
136 144
 importFrom(ggplot2,scale_fill_gradientn)
137 145
 importFrom(ggplot2,scale_fill_viridis_d)
138
-importFrom(ggplot2,scale_x_continuous)
139
-importFrom(ggplot2,scale_y_continuous)
140 146
 importFrom(ggplot2,sym)
141 147
 importFrom(ggplot2,theme)
142
-importFrom(ggplot2,unit)
143
-importFrom(ggrepel,geom_label_repel)
144 148
 importFrom(lifecycle,deprecate_warn)
145
-importFrom(lubridate,parse_date_time)
146 149
 importFrom(lubridate,today)
147 150
 importFrom(magrittr,`%>%`)
148 151
 importFrom(purrr,cross_df)
... ...
@@ -150,14 +153,9 @@ importFrom(purrr,detect_index)
150 153
 importFrom(purrr,flatten)
151 154
 importFrom(purrr,flatten_chr)
152 155
 importFrom(purrr,is_empty)
153
-importFrom(purrr,is_formula)
154
-importFrom(purrr,is_function)
155 156
 importFrom(purrr,map)
156 157
 importFrom(purrr,map2)
157
-importFrom(purrr,map2_chr)
158 158
 importFrom(purrr,map2_dfr)
159
-importFrom(purrr,map2_lgl)
160
-importFrom(purrr,map_chr)
161 159
 importFrom(purrr,map_dbl)
162 160
 importFrom(purrr,map_dfr)
163 161
 importFrom(purrr,map_lgl)
... ...
@@ -171,7 +169,6 @@ importFrom(purrr,set_names)
171 169
 importFrom(purrr,walk)
172 170
 importFrom(purrr,walk2)
173 171
 importFrom(readr,cols)
174
-importFrom(readr,problems)
175 172
 importFrom(readr,read_delim)
176 173
 importFrom(readr,write_tsv)
177 174
 importFrom(rlang,.data)
... ...
@@ -183,8 +180,6 @@ importFrom(rlang,as_function)
183 180
 importFrom(rlang,call2)
184 181
 importFrom(rlang,current_env)
185 182
 importFrom(rlang,dots_list)
186
-importFrom(rlang,enexpr)
187
-importFrom(rlang,env_bind)
188 183
 importFrom(rlang,eval_tidy)
189 184
 importFrom(rlang,exec)
190 185
 importFrom(rlang,expr)
... ...
@@ -196,29 +191,19 @@ importFrom(rlang,list2)
196 191
 importFrom(rlang,parse_expr)
197 192
 importFrom(rlang,sym)
198 193
 importFrom(stats,dt)
199
-importFrom(stats,na.omit)
200 194
 importFrom(stats,setNames)
201 195
 importFrom(stats,shapiro.test)
202 196
 importFrom(stringr,str_detect)
203
-importFrom(stringr,str_pad)
204
-importFrom(stringr,str_replace)
205 197
 importFrom(stringr,str_replace_all)
206 198
 importFrom(stringr,str_split)
207
-importFrom(stringr,str_to_lower)
208 199
 importFrom(stringr,str_to_upper)
209 200
 importFrom(tibble,add_column)
210 201
 importFrom(tibble,as_tibble)
211
-importFrom(tibble,as_tibble_col)
212 202
 importFrom(tibble,tibble)
213 203
 importFrom(tibble,tribble)
214 204
 importFrom(tidyr,nest)
215
-importFrom(tidyr,pivot_longer)
216 205
 importFrom(tidyr,pivot_wider)
217
-importFrom(tidyr,separate)
218 206
 importFrom(tidyr,unite)
219 207
 importFrom(tidyr,unnest)
220 208
 importFrom(tools,file_path_sans_ext)
221
-importFrom(utils,read.csv)
222 209
 importFrom(utils,read.delim)
223
-importFrom(utils,tail)
224
-importFrom(zip,unzip)
... ...
@@ -2,6 +2,59 @@
2 2
 title: "NEWS"
3 3
 output: github_document
4 4
 ---
5
+# ISAnalytics 1.5.4 (2022-04-20)
6
+
7
+## MAJOR CHANGES
8
+
9
+* ISAnalytics has now a new "dynamic vars system" to allow more flexibility
10
+on user inputs, view the dedicated vignette with 
11
+`vignette("setup_workflow", package="ISAnalytics")`
12
+* All package functions were reviewed to work properly with this system
13
+
14
+## NEW FEATURES
15
+
16
+* `gene_frequency_fisher()` is a new function of the analysis family that 
17
+allows the computation of Fisher's exact test p-values on gene frequency - 
18
+`fisher_scatterplot()` is the associated plotting function
19
+* `top_targeted_genes()` is a new function of the analysis family that 
20
+produces the top n targeted genes based on the number of IS
21
+* `NGSdataExplorer()` is a newly implemented Shiny interface that allows the
22
+exploration and plotting of data
23
+* zipped examples were removed from the package to contain size. To compensate,
24
+the new function `generate_default_folder_structure()` generates the 
25
+standard folder structure with package-included data on-demand
26
+* `transform_columns()` is a new utility function, also used internally by
27
+other exported functions, that allows arbitrary transformations on 
28
+data frame columns
29
+
30
+## MINOR CHANGES
31
+
32
+* `remove_collisions()` now has a dedicated parameter to specify how 
33
+independent samples are identified
34
+* `compute_near_integration_sites()` now has a parameter called
35
+`additional_agg_lambda()` to allow aggregation of additional columns
36
+* `CIS_grubbs()` now signals if there are missing genes in the refgenes table
37
+and eventually returns them as a df
38
+* `outlier_filter()` is now able to take multiple tests in input and combine
39
+them with a given logic. It now also produces an HTML report.
40
+* Several functions now use data.table under the hood
41
+* Color of the strata containing IS below threshold can now be set in
42
+`integration_alluvial_plot()` 
43
+
44
+## BUG FIXES
45
+
46
+* Fixed a minor bug in `import_Vispa2_stats()` - function failed when passing
47
+`report_path = NULL`
48
+* Fixed minor issue in `circos_genomic_density()` when trying to use a pdf 
49
+device
50
+
51
+## DEPRECATED FUNCTIONS
52
+
53
+* `unzip_file_system()` was made defunct in favor of
54
+`generate_default_folder_structure()`
55
+* `cumulative_count_union()` was deprecated and its functionality was moved
56
+to `cumulative_is()`
57
+
5 58
 # ISAnalytics 1.5.3 (2022-01-13)
6 59
 
7 60
 ## MINOR CHANGES
... ...
@@ -1,6 +1,63 @@
1 1
 NEWS
2 2
 ================
3 3
 
4
+# ISAnalytics 1.5.4 (2022-04-20)
5
+
6
+## MAJOR CHANGES
7
+
8
+-   ISAnalytics has now a new “dynamic vars system” to allow more
9
+    flexibility on user inputs, view the dedicated vignette with
10
+    `vignette("setup_workflow", package="ISAnalytics")`
11
+-   All package functions were reviewed to work properly with this
12
+    system
13
+
14
+## NEW FEATURES
15
+
16
+-   `gene_frequency_fisher()` is a new function of the analysis family
17
+    that allows the computation of Fisher’s exact test p-values on gene
18
+    frequency - `fisher_scatterplot()` is the associated plotting
19
+    function
20
+-   `top_targeted_genes()` is a new function of the analysis family that
21
+    produces the top n targeted genes based on the number of IS
22
+-   `NGSdataExplorer()` is a newly implemented Shiny interface that
23
+    allows the exploration and plotting of data
24
+-   zipped examples were removed from the package to contain size. To
25
+    compensate, the new function `generate_default_folder_structure()`
26
+    generates the standard folder structure with package-included data
27
+    on-demand
28
+-   `transform_columns()` is a new utility function, also used
29
+    internally by other exported functions, that allows arbitrary
30
+    transformations on data frame columns
31
+
32
+## MINOR CHANGES
33
+
34
+-   `remove_collisions()` now has a dedicated parameter to specify how
35
+    independent samples are identified
36
+-   `compute_near_integration_sites()` now has a parameter called
37
+    `additional_agg_lambda()` to allow aggregation of additional columns
38
+-   `CIS_grubbs()` now signals if there are missing genes in the
39
+    refgenes table and eventually returns them as a df
40
+-   `outlier_filter()` is now able to take multiple tests in input and
41
+    combine them with a given logic. It now also produces an HTML
42
+    report.
43
+-   Several functions now use data.table under the hood
44
+-   Color of the strata containing IS below threshold can now be set in
45
+    `integration_alluvial_plot()`
46
+
47
+## BUG FIXES
48
+
49
+-   Fixed a minor bug in `import_Vispa2_stats()` - function failed when
50
+    passing `report_path = NULL`
51
+-   Fixed minor issue in `circos_genomic_density()` when trying to use a
52
+    pdf device
53
+
54
+## DEPRECATED FUNCTIONS
55
+
56
+-   `unzip_file_system()` was made defunct in favor of
57
+    `generate_default_folder_structure()`
58
+-   `cumulative_count_union()` was deprecated and its functionality was
59
+    moved to `cumulative_is()`
60
+
4 61
 # ISAnalytics 1.5.3 (2022-01-13)
5 62
 
6 63
 ## MINOR CHANGES
... ...
@@ -11,5 +11,9 @@
11 11
 #'  \code{\link{import_parallel_Vispa2Matrices}}
12 12
 #'  * import_parallel_Vispa2Matrices_interactive:
13 13
 #'  \code{\link{import_parallel_Vispa2Matrices}}
14
+#'  * unzip_file_system:
15
+#'  \code{\link{generate_default_folder_structure}}
16
+#'  * cumulative_count_union:
17
+#'  \code{\link{cumulative_is}}
14 18
 #' @keywords internal
15 19
 NULL
... ...
@@ -3,7 +3,8 @@
3 3
 #------------------------------------------------------------------------------#
4 4
 #' Performs aggregation on metadata contained in the association file.
5 5
 #'
6
-#' \lifecycle{stable}
6
+#' @description
7
+#' `r lifecycle::badge("stable")`
7 8
 #' Groups metadata by the specified grouping keys and returns a
8 9
 #' summary of info for each group. For more details on how to use this function:
9 10
 #' \code{vignette("aggregate_function_usage", package = "ISAnalytics")}
... ...
@@ -20,10 +21,8 @@
20 21
 #' @param import_stats `r lifecycle::badge("deprecated")` The import
21 22
 #' of VISPA2 stats has been moved to its dedicated function,
22 23
 #' see \link{import_Vispa2_stats}.
23
-#' @family Aggregate functions
24
-#' @importFrom rlang abort inform
25
-#' @importFrom purrr is_empty
26
-#' @import lifecycle
24
+#'
25
+#' @family Data cleaning and pre-processing
27 26
 #'
28 27
 #' @return An aggregated data frame
29 28
 #' @export
... ...
@@ -101,9 +100,8 @@ aggregate_metadata <- function(association_file,
101 100
 #' * `Output_colname`: a `glue` specification that will be used to determine
102 101
 #' a unique output column name. See \link[glue]{glue} for more details.
103 102
 #'
104
-#' @importFrom tibble tribble
105 103
 #' @return A data frame
106
-#' @family Aggregate functions
104
+#' @family Data cleaning and pre-processing
107 105
 #' @export
108 106
 #'
109 107
 #' @examples
... ...
@@ -146,7 +144,8 @@ default_meta_agg <- function() {
146 144
 
147 145
 #' Aggregates matrices values based on specified key.
148 146
 #'
149
-#' \lifecycle{stable}
147
+#' @description
148
+#' `r lifecycle::badge("stable")`
150 149
 #' Performs aggregation on values contained in the integration matrices based
151 150
 #' on the key and the specified lambda. For more details on how to use this
152 151
 #' function:
... ...
@@ -206,12 +205,13 @@ default_meta_agg <- function() {
206 205
 #' @param join_af_by A character vector representing the joining key
207 206
 #' between the matrix and the metadata. Useful to re-aggregate already
208 207
 #' aggregated matrices.
209
-#' @family Aggregate functions
208
+#'
209
+#' @family Data cleaning and pre-processing
210 210
 #'
211 211
 #' @importFrom purrr walk set_names map_lgl
212 212
 #' @importFrom rlang expr eval_tidy abort
213 213
 #'
214
-#' @return A list of tibbles or a single tibble aggregated according to
214
+#' @return A list of data frames or a single data frame aggregated according to
215 215
 #' the specified arguments
216 216
 #' @export
217 217
 #'
... ...
@@ -240,115 +240,54 @@ aggregate_values_by_key <- function(x,
240 240
     ),
241 241
     join_af_by = "CompleteAmplificationID") {
242 242
     stopifnot(is.data.frame(x) || is.list(x))
243
-    if (!is.data.frame(x)) {
244
-        purrr::walk(x, function(df) {
245
-            stopifnot(is.data.frame(df))
246
-            if (.check_mandatory_vars(df) == FALSE) {
247
-                rlang::abort(.missing_mand_vars())
248
-            }
249
-            if (!all(join_af_by %in% colnames(df))) {
250
-                rlang::abort(c(
251
-                    x = paste(
252
-                        "Missing common columns",
253
-                        "to join metadata"
254
-                    ),
255
-                    i = paste(
256
-                        "Missing: ",
257
-                        paste0(join_af_by[!join_af_by %in%
258
-                            colnames(df)],
259
-                        collapse = ", "
260
-                        )
261
-                    )
262
-                ))
263
-            }
264
-            if (!all(value_cols %in% colnames(df))) {
265
-                rlang::abort(.missing_user_cols_error(
266
-                    value_cols[!value_cols %in% colnames(df)]
267
-                ))
268
-            }
269
-            is_numeric_col <- purrr::map_lgl(value_cols, function(col) {
270
-                if (!is.double(df[[col]]) &&
271
-                    !is.integer(df[[col]])) {
272
-                    FALSE
273
-                } else {
274
-                    TRUE
275
-                }
276
-            }) %>% purrr::set_names(value_cols)
277
-            if (any(!is_numeric_col)) {
278
-                rlang::abort(.non_num_user_cols_error(
279
-                    names(is_numeric_col)[!is_numeric_col]
280
-                ))
281
-            }
282
-        })
283
-    } else {
284
-        if (.check_mandatory_vars(x) == FALSE) {
285
-            rlang::abort(.missing_mand_vars())
286
-        }
287
-        if (!all(join_af_by %in% colnames(x))) {
288
-            rlang::abort(c(
289
-                x = paste(
290
-                    "Missing common columns",
291
-                    "to join metadata"
292
-                ),
293
-                i = paste(
294
-                    "Missing: ",
295
-                    paste0(join_af_by[!join_af_by %in%
296
-                        colnames(x)],
297
-                    collapse = ", "
298
-                    )
299
-                )
300
-            ))
301
-        }
302
-        if (!all(value_cols %in% colnames(x))) {
303
-            rlang::abort(.missing_user_cols_error(
304
-                value_cols[!value_cols %in% colnames(x)]
305
-            ))
306
-        }
307
-        is_numeric_col <- purrr::map_lgl(value_cols, function(col) {
308
-            if (!is.double(x[[col]]) &&
309
-                !is.integer(x[[col]])) {
310
-                FALSE
311
-            } else {
312
-                TRUE
313
-            }
314
-        }) %>% purrr::set_names(value_cols)
243
+    stopifnot(is.character(value_cols))
244
+    stopifnot(is.character(key))
245
+    stopifnot(is.null(group) || is.character(group))
246
+    stopifnot(is.character(join_af_by))
247
+    stopifnot(is.data.frame(association_file))
248
+    data.table::setDT(association_file)
249
+    stopifnot(is.list(lambda) && !is.null(names(lambda)))
250
+    join_key_err <- c("Join key not present in in both data frames",
251
+        x = paste(
252
+            "Fields specified in the argument",
253
+            "`join_af_by` must appear in both",
254
+            "the association file and the matrix(es)"
255
+        )
256
+    )
257
+    check_single_matrix <- function(df) {
258
+        stopifnot(is.data.frame(df))
259
+        is_numeric_col <- purrr::map_lgl(
260
+            value_cols,
261
+            ~ is.numeric(df[[.x]]) ||
262
+                is.double(df[[.x]]) ||
263
+                is.integer(df[[.x]])
264
+        ) %>% purrr::set_names(value_cols)
315 265
         if (any(!is_numeric_col)) {
316 266
             rlang::abort(.non_num_user_cols_error(
317 267
                 names(is_numeric_col)[!is_numeric_col]
318 268
             ))
319 269
         }
320
-    }
321
-    # Check association file
322
-    stopifnot(is.data.frame(association_file))
323
-    # Check key
324
-    stopifnot(is.character(key))
325
-    if (!all(key %in% colnames(association_file))) {
326
-        rlang::abort(c(x = "Key fields are missing from association file"))
327
-    }
328
-    # Check lambda
329
-    stopifnot(is.list(lambda))
330
-    # Check group
331
-    stopifnot(is.character(group) | is.null(group))
332
-    if (is.data.frame(x)) {
333
-        if (!all(group %in% c(colnames(association_file), colnames(x)))) {
334
-            rlang::abort(paste("Grouping variables not found"))
270
+        if (!all(join_af_by %in% colnames(df))) {
271
+            rlang::abort(join_key_err, class = "join_key_err_agg")
335 272
         }
273
+    }
274
+    if (!is.data.frame(x)) {
275
+        purrr::walk(x, check_single_matrix)
336 276
     } else {
337
-        purrr::walk(x, function(df) {
338
-            if (!all(group %in% c(colnames(association_file), colnames(df)))) {
339
-                rlang::abort(paste("Grouping variables not found"))
340
-            }
341
-        })
277
+        check_single_matrix(x)
342 278
     }
343
-    if (is.data.frame(x)) {
344
-        x <- list(x)
345
-        agg_matrix <- .aggregate_lambda(
279
+    join_key_in_af <- all(join_af_by %in% colnames(association_file))
280
+    if (!join_key_in_af) {
281
+        rlang::abort(join_key_err, class = "join_key_err_agg")
282
+    }
283
+    agg_matrix <- if (is.data.frame(x)) {
284
+        .aggregate_lambda(
346 285
             x, association_file, key, value_cols, lambda, group, join_af_by
347 286
         )
348
-        return(agg_matrix[[1]])
287
+    } else {
288
+        agg_matrix <- purrr::map(x, ~ .aggregate_lambda(
289
+            .x, association_file, key, value_cols, lambda, group, join_af_by
290
+        ))
349 291
     }
350
-    agg_matrix <- .aggregate_lambda(
351
-        x, association_file, key, value_cols, lambda, group, join_af_by
352
-    )
353
-    agg_matrix
292
+    return(agg_matrix)
354 293
 }
... ...
@@ -4,7 +4,8 @@
4 4
 
5 5
 #' Computes the abundance for every integration event in the input data frame.
6 6
 #'
7
-#' \lifecycle{stable}
7
+#' @description
8
+#' `r lifecycle::badge("stable")`
8 9
 #' Abundance is obtained for every integration event by calculating the ratio
9 10
 #' between the single value and the total value for the given group.
10 11
 #'
... ...
@@ -13,6 +14,11 @@
13 14
 #' relative abundance column (and optionally a percentage abundance
14 15
 #' column) will be produced.
15 16
 #'
17
+#' @section Required tags:
18
+#' The function will explicitly check for the presence of these tags:
19
+#'
20
+#' * All columns declared in `mandatory_IS_vars()`
21
+#'
16 22
 #' @param x An integration matrix - aka a data frame that includes
17 23
 #' the `mandatory_IS_vars()` as columns. The matrix can either be aggregated
18 24
 #' (via `aggregate_values_by_key()`) or not.
... ...
@@ -29,12 +35,6 @@
29 35
 #'
30 36
 #' @family Analysis functions
31 37
 #'
32
-#' @importFrom magrittr `%>%`
33
-#' @importFrom dplyr group_by across all_of summarise left_join mutate
34
-#' @importFrom dplyr cur_column distinct select contains rename_with
35
-#' @importFrom rlang .data eval_tidy parse_expr abort
36
-#' @importFrom purrr map_lgl
37
-#' @importFrom stringr str_replace
38 38
 #' @return Either a single data frame with computed abundance values or
39 39
 #' a list of 2 data frames (abundance_df, quant_totals)
40 40
 #' @export
... ...
@@ -59,24 +59,20 @@ compute_abundance <- function(x,
59 59
     if (.check_mandatory_vars(x) == FALSE) {
60 60
         rlang::abort(.missing_mand_vars())
61 61
     }
62
-    stopifnot(is.logical(percentage) & length(percentage) == 1)
63
-    if (!all(columns %in% colnames(x)) | !all(key %in% colnames(x))) {
62
+    stopifnot(is.logical(percentage))
63
+    percentage <- percentage[1]
64
+    if (!all(c(columns, key) %in% colnames(x))) {
64 65
         missing_cols <- c(
65 66
             columns[!columns %in% colnames(x)],
66 67
             key[!key %in% colnames(x)]
67 68
         )
68 69
         rlang::abort(.missing_user_cols_error(missing_cols))
69 70
     }
70
-    non_num_cols <- purrr::map_lgl(columns, function(col) {
71
-        expr <- rlang::expr(`$`(x, !!col))
72
-        if (is.numeric(rlang::eval_tidy(expr)) |
73
-            is.integer(rlang::eval_tidy(expr))) {
74
-            return(FALSE)
75
-        } else {
76
-            return(TRUE)
77
-        }
78
-    })
79
-    if (any(non_num_cols)) {
71
+    non_num_cols <- purrr::map_lgl(
72
+        columns,
73
+        ~ is.numeric(x[[.x]]) || is.integer(x[[.x]])
74
+    )
75
+    if (any(!non_num_cols)) {
80 76
         rlang::abort(.non_num_user_cols_error(columns[non_num_cols]))
81 77
     }
82 78
     stopifnot(is.logical(keep_totals) || keep_totals == "df")
... ...
@@ -131,186 +127,10 @@ compute_abundance <- function(x,
131 127
     }
132 128
 }
133 129
 
134
-
135
-#' obtain a single integration matrix from individual quantification
136
-#' matrices.
137
-#'
138
-#' \lifecycle{stable}
139
-#' Takes a list of integration matrices referring to different quantification
140
-#' types and merges them in a single data frame that has multiple
141
-#' value columns, each renamed according to their quantification type
142
-#' of reference.
143
-#'
144
-#' @param x A named list of integration matrices, ideally obtained via
145
-#' \link{import_parallel_Vispa2Matrices_interactive} or
146
-#' \link{import_parallel_Vispa2Matrices_auto}. Names must be
147
-#' quantification types.
148
-#' @param fragmentEstimate The name of the output column for fragment
149
-#' estimate values
150
-#' @param seqCount The name of the output column for sequence
151
-#' count values
152
-#' @param barcodeCount The name of the output column for barcode count
153
-#' values
154
-#' @param cellCount The name of the output column for cell count values
155
-#' @param ShsCount The name of the output column for Shs count values
156
-#'
157
-#' @importFrom purrr walk map2 reduce
158
-#' @importFrom dplyr rename full_join intersect
159
-#' @importFrom magrittr `%>%`
160
-#' @importFrom rlang .data `:=`
161
-#'
162
-#' @family Analysis functions
163
-#'
164
-#' @seealso \link{quantification_types}
165
-#'
166
-#' @return A tibble
167
-#' @export
168
-#'
169
-#' @examples
170
-#' fs_path <- system.file("extdata", "fs.zip", package = "ISAnalytics")
171
-#' fs <- unzip_file_system(fs_path, "fs")
172
-#' af_path <- system.file("extdata", "asso.file.tsv.gz",
173
-#'     package = "ISAnalytics"
174
-#' )
175
-#' af <- import_association_file(af_path,
176
-#'     root = fs,
177
-#'     import_iss = FALSE,
178
-#'     report_path = NULL
179
-#' )
180
-#' matrices <- import_parallel_Vispa2Matrices(af,
181
-#'     c("seqCount", "fragmentEstimate"),
182
-#'     mode = "AUTO", report_path = NULL, multi_quant_matrix = FALSE
183
-#' )
184
-#' multi_quant <- comparison_matrix(matrices)
185
-#' head(multi_quant)
186
-comparison_matrix <- function(x,
187
-    fragmentEstimate = "fragmentEstimate",
188
-    seqCount = "seqCount",
189
-    barcodeCount = "barcodeCount",
190
-    cellCount = "cellCount",
191
-    ShsCount = "ShsCount") {
192
-    stopifnot(is.list(x) & !is.data.frame(x))
193
-    stopifnot(all(names(x) %in% quantification_types()))
194
-    stopifnot(is.character(fragmentEstimate) & length(fragmentEstimate) == 1)
195
-    stopifnot(is.character(seqCount) & length(seqCount) == 1)
196
-    stopifnot(is.character(barcodeCount) & length(barcodeCount) == 1)
197
-    stopifnot(is.character(cellCount) & length(cellCount) == 1)
198
-    stopifnot(is.character(ShsCount) & length(ShsCount) == 1)
199
-    param_names <- c(
200
-        fragmentEstimate = fragmentEstimate,
201
-        seqCount = seqCount, barcodeCount = barcodeCount,
202
-        cellCount = cellCount, ShsCount = ShsCount
203
-    )
204
-    x <- purrr::map2(x, names(x), function(matrix, quant_type) {
205
-        quant_name <- param_names[names(param_names) %in% quant_type]
206
-        matrix %>% dplyr::rename(!!quant_name := .data$Value)
207
-    })
208
-    result <- purrr::reduce(x, function(matrix1, matrix2) {
209
-        commoncols <- dplyr::intersect(colnames(matrix1), colnames(matrix2))
210
-        matrix1 %>%
211
-            dplyr::full_join(matrix2, by = commoncols)
212
-    })
213
-    na_introduced <- purrr::map_lgl(param_names, function(p) {
214
-        any(is.na(result[[p]]))
215
-    })
216
-    if (any(na_introduced) & getOption("ISAnalytics.verbose") == TRUE) {
217
-        rlang::inform(.nas_introduced_msg())
218
-    }
219
-    result
220
-}
221
-
222
-
223
-#' Separate a multiple-quantification matrix into single quantification
224
-#' matrices.
225
-#'
226
-#' \lifecycle{stable}
227
-#' The function separates a single multi-quantification integration
228
-#' matrix, obtained via \link{comparison_matrix}, into single
229
-#' quantification matrices as a named list of tibbles.
230
-#'
231
-#' @param x Single integration matrix with multiple quantification
232
-#' value columns, likely obtained via \link{comparison_matrix}.
233
-#' @param fragmentEstimate Name of the fragment estimate values column
234
-#' in input
235
-#' @param seqCount Name of the sequence count values column
236
-#' in input
237
-#' @param barcodeCount Name of the barcode count values column
238
-#' in input
239
-#' @param cellCount Name of the cell count values column
240
-#' in input
241
-#' @param ShsCount Name of the shs count values column
242
-#' in input
243
-#' @param key Key columns to perform the joining operation
244
-#'
245
-#' @importFrom purrr is_empty map set_names
246
-#' @importFrom dplyr rename
247
-#' @importFrom magrittr `%>%`
248
-#'
249
-#' @family Analysis functions
250
-#'
251
-#' @return A named list of tibbles, where names are quantification types
252
-#' @seealso \link{quantification_types}
253
-#' @export
254
-#'
255
-#' @examples
256
-#' data("integration_matrices", package = "ISAnalytics")
257
-#' separated <- separate_quant_matrices(
258
-#'     integration_matrices
259
-#' )
260
-#' separated
261
-separate_quant_matrices <- function(x,
262
-    fragmentEstimate = "fragmentEstimate",
263
-    seqCount = "seqCount",
264
-    barcodeCount = "barcodeCount",
265
-    cellCount = "cellCount",
266
-    ShsCount = "ShsCount",
267
-    key = c(
268
-        mandatory_IS_vars(),
269
-        annotation_IS_vars(),
270
-        "CompleteAmplificationID"
271
-    )) {
272
-    stopifnot(is.data.frame(x))
273
-    if (!all(key %in% colnames(x))) {
274
-        rlang::abort(.missing_user_cols_error(key[!key %in% colnames(x)]))
275
-    }
276
-    num_cols <- .find_exp_cols(x, key)
277
-    if (purrr::is_empty(num_cols)) {
278
-        rlang::abort(.missing_num_cols_error())
279
-    }
280
-    stopifnot(is.character(fragmentEstimate) & length(fragmentEstimate) == 1)
281
-    stopifnot(is.character(seqCount) & length(seqCount) == 1)
282
-    stopifnot(is.character(barcodeCount) & length(barcodeCount) == 1)
283
-    stopifnot(is.character(cellCount) & length(cellCount) == 1)
284
-    stopifnot(is.character(ShsCount) & length(ShsCount) == 1)
285
-    param_col <- c(
286
-        fragmentEstimate = fragmentEstimate,
287
-        seqCount = seqCount, barcodeCount = barcodeCount,
288
-        cellCount = cellCount,
289
-        ShsCount = ShsCount
290
-    )
291
-    to_copy <- if (any(!num_cols %in% param_col)) {
292
-        if (all(!num_cols %in% param_col)) {
293
-            rlang::abort(.non_quant_cols_error())
294
-        }
295
-        num_cols[!num_cols %in% param_col]
296
-    }
297
-    num_cols <- param_col[param_col %in% num_cols]
298
-    if (!purrr::is_empty(to_copy) & getOption("ISAnalytics.verbose") == TRUE) {
299
-        rlang::inform(.non_quant_cols_msg(to_copy))
300
-    }
301
-    separated <- purrr::map(num_cols, function(quant) {
302
-        x %>%
303
-            dplyr::select(dplyr::all_of(c(key, to_copy, quant))) %>%
304
-            dplyr::rename(Value = quant)
305
-    }) %>% purrr::set_names(names(num_cols))
306
-    separated
307
-}
308
-
309
-
310 130
 #' Filter data frames with custom predicates
311 131
 #'
312 132
 #' @description
313
-#' \lifecycle{experimental}
133
+#' `r lifecycle::badge("stable")`
314 134
 #' Filter a single data frame or a list of data frames with custom
315 135
 #' predicates assembled from the function parameters.
316 136
 #'
... ...
@@ -422,7 +242,7 @@ separate_quant_matrices <- function(x,
422 242
 #' character vectors. Must be one of the allowed values between
423 243
 #' `c("<", ">", "==", "!=", ">=", "<=")`
424 244
 #'
425
-#' @family Analysis functions
245
+#' @family Data cleaning and pre-processing
426 246
 #'
427 247
 #' @return A data frame or a list of data frames
428 248
 #' @export
... ...
@@ -468,11 +288,11 @@ threshold_filter <- function(x,
468 288
     return(.tf_list(x, threshold, cols_to_compare, comparators))
469 289
 }
470 290
 
471
-
472 291
 #' Sorts and keeps the top n integration sites based on the values
473 292
 #' in a given column.
474 293
 #'
475
-#' \lifecycle{experimental}
294
+#' @description
295
+#' `r lifecycle::badge("stable")`
476 296
 #' The input data frame will be sorted by the highest values in
477 297
 #' the columns specified and the top n rows will be returned as output.
478 298
 #' The user can choose to keep additional columns in the output
... ...
@@ -481,6 +301,11 @@ threshold_filter <- function(x,
481 301
 #' * `keep = "nothing"` only keeps the mandatory columns
482 302
 #' (`mandatory_IS_vars()`) plus the columns in the `columns` parameter.
483 303
 #'
304
+#' @section Required tags:
305
+#' The function will explicitly check for the presence of these tags:
306
+#'
307
+#' * All columns declared in `mandatory_IS_vars()`
308
+#'
484 309
 #' @param x An integration matrix (data frame containing
485 310
 #' `mandatory_IS_vars()`)
486 311
 #' @param n How many integrations should be sliced (in total or
... ...
@@ -497,8 +322,6 @@ threshold_filter <- function(x,
497 322
 #'
498 323
 #' @family Analysis functions
499 324
 #'
500
-#' @importFrom magrittr `%>%`
501
-#' @importFrom rlang abort
502 325
 #'
503 326
 #' @return Either a data frame with at most n rows or
504 327
 #' a data frames with at most n*(number of groups) rows.
... ...
@@ -525,6 +348,7 @@ threshold_filter <- function(x,
525 348
 #'     keep = "Value2",
526 349
 #'     key = "CompleteAmplificationID"
527 350
 #' )
351
+# top_abundant_is
528 352
 top_integrations <- function(x,
529 353
     n = 20,
530 354
     columns = "fragmentEstimate_sum_RelAbundance",
... ...
@@ -588,11 +412,409 @@ top_integrations <- function(x,
588 412
 }
589 413
 
590 414
 
415
+#' Top n targeted genes based on number of IS.
416
+#'
417
+#' @description
418
+#' `r lifecycle::badge("experimental")`
419
+#' Produces a summary of the number of integration events per gene, orders
420
+#' the table in decreasing order and slices the first n rows - either on
421
+#' all the data frame or by group.
422
+#'
423
+#' @details
424
+#' ## Gene grouping
425
+#' When producing a summary of IS by gene, there are different options that
426
+#' can be chosen.
427
+#' The argument `consider_chr` accounts for the fact that some genes (same
428
+#' gene symbol) may span more than one chromosome: if set to `TRUE`
429
+#' counts of IS will be separated for those genes that span 2 or more
430
+#' chromosomes - in other words they will be in 2 different rows of the
431
+#' output table. On the contrary, if the argument is set to `FALSE`,
432
+#' counts will be produced in a single row.
433
+#'
434
+#' NOTE: the function counts **DISTINCT** integration events, which logically
435
+#' corresponds to a union of sets. Be aware of the fact that counts per group
436
+#' and counts with different arguments might be different: if for example
437
+#' counts are performed by considering chromosome and there is one gene symbol
438
+#' with 2 different counts, the sum of those 2 will likely not be equal to
439
+#' the count obtained by performing the calculations without
440
+#' considering the chromosome.
441
+#'
442
+#' The same reasoning can be applied for the argument `consider_gene_strand`,
443
+#' that takes into account the strand of the gene.
444
+#'
445
+#' @section Required tags:
446
+#' The function will explicitly check for the presence of these tags:
447
+#'
448
+#' ```{r echo=FALSE, results="asis"}
449
+#' all_tags <- available_tags()
450
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
451
+#'  ~ "top_targeted_genes" %in% .x)][["tag"]])
452
+#'  cat(paste0("* ", needed, collapse="\n"))
453
+#' ```
454
+#'
455
+#' Note that the tags "gene_strand" and "chromosome" are explicitly required
456
+#' only if `consider_chr = TRUE` and/or `consider_gene_strand = TRUE`.
457
+#'
458
+#' @param x An integration matrix - must be annotated
459
+#' @param n Number of rows to slice
460
+#' @param key If slice has to be performed for each group, the character
461
+#' vector of column names that identify the groups. If `NULL` considers the
462
+#' whole input data frame.
463
+#' @param consider_chr Logical, should the chromosome be taken into account?
464
+#' See details.
465
+#' @param consider_gene_strand Logical, should the gene strand be taken into
466
+#' account? See details.
467
+#' @param as_df If computation is performed by group, `TRUE` returns all
468
+#' groups merged in a single data frame with a column containing the group id.
469
+#' If `FALSE` returns a named list.
470
+#'
471
+#' @importFrom rlang sym
472
+#'
473
+#' @return A data frame or a list of data frames
474
+#' @export
475
+#' @family Analysis functions
476
+#'
477
+#' @examples
478
+#' data("integration_matrices", package = "ISAnalytics")
479
+#' top_targ <- top_targeted_genes(
480
+#'     integration_matrices,
481
+#'     key = NULL
482
+#' )
483
+#' top_targ
484
+top_targeted_genes <- function(x,
485
+    n = 20,
486
+    key = c(
487
+        "SubjectID", "CellMarker",
488
+        "Tissue", "TimePoint"
489
+    ),
490
+    consider_chr = TRUE,
491
+    consider_gene_strand = TRUE,
492
+    as_df = TRUE) {
493
+    stopifnot(is.data.frame(x))
494
+    data.table::setDT(x)
495
+    stopifnot(is.numeric(n) || is.integer(n))
496
+    stopifnot(is.null(key) || is.character(key))
497
+    stopifnot(is.logical(consider_chr))
498
+    stopifnot(is.logical(consider_gene_strand))
499
+
500
+    required_annot_tags <- c("gene_symbol")
501
+    if (consider_gene_strand) {
502
+        required_annot_tags <- c(required_annot_tags, "gene_strand")
503
+    }
504
+    annot_tag_cols <- .check_required_cols(
505
+        required_annot_tags,
506
+        annotation_IS_vars(TRUE),
507
+        "error"
508
+    )
509
+    if (consider_chr) {
510
+        chr_tag_col <- .check_required_cols(
511
+            c("chromosome", "locus"),
512
+            mandatory_IS_vars(TRUE),
513
+            "error"
514
+        )
515
+        annot_tag_cols <- annot_tag_cols %>%
516
+            dplyr::bind_rows(chr_tag_col)
517
+    }
518
+    data.table::setDT(annot_tag_cols)
519
+    cols_to_check <- c(annot_tag_cols$names, key)
520
+    if (!all(cols_to_check %in% colnames(x))) {
521
+        rlang::abort(.missing_needed_cols(
522
+            cols_to_check[!cols_to_check %in% colnames(x)]
523
+        ))
524
+    }
525
+
526
+    df_with_is_counts <- if (is.null(key)) {
527
+        .count_distinct_is_per_gene(
528
+            x = x, include_chr = consider_chr,
529
+            include_gene_strand = consider_gene_strand,
530
+            gene_sym_col = annot_tag_cols[
531
+                eval(sym("tag")) == "gene_symbol"
532
+            ][["names"]],
533
+            gene_strand_col = annot_tag_cols[
534
+                eval(sym("tag")) == "gene_strand"
535
+            ][["names"]],
536
+            chr_col = annot_tag_cols[eval(sym("tag")) ==
537
+                "chromosome"][["names"]],
538
+            mand_vars_to_check = mandatory_IS_vars(TRUE)
539
+        ) %>%
540
+            dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
541
+            dplyr::slice_head(n = n)
542
+    } else {
543
+        tmp <- x[, .count_distinct_is_per_gene(
544
+            x = .SD, include_chr = consider_chr,
545
+            include_gene_strand = consider_gene_strand,
546
+            gene_sym_col = annot_tag_cols[
547
+                eval(sym("tag")) == "gene_symbol"
548
+            ][["names"]],
549
+            gene_strand_col = annot_tag_cols[
550
+                eval(sym("tag")) == "gene_strand"
551
+            ][["names"]],
552
+            chr_col = annot_tag_cols[eval(sym("tag")) ==
553
+                "chromosome"][["names"]],
554
+            mand_vars_to_check = mandatory_IS_vars(TRUE)
555
+        ), by = eval(key)]
556
+        tmp[,
557
+            .SD %>% dplyr::arrange(dplyr::desc(.data$n_IS)) %>%
558
+                dplyr::slice_head(n = n),
559
+            by = eval(key)
560
+        ]
561
+    }
562
+    if (as_df) {
563
+        return(df_with_is_counts)
564
+    }
565
+    return(split(df_with_is_counts, by = key))
566
+}
567
+
568
+
569
+#' Compute Fisher's exact test on gene frequencies.
570
+#'
571
+#' @description
572
+#' `r lifecycle::badge("experimental")`
573
+#' Provided 2 data frames with calculations for CIS, via `CIS_grubbs()`,
574
+#' computes Fisher's exact test.
575
+#' Results can be plotted via `fisher_scatterplot()`.
576
+#'
577
+#' @param cis_x A data frame obtained via `CIS_grubbs()`
578
+#' @param cis_y A data frame obtained via `CIS_grubbs()`
579
+#' @param min_is_per_gene Used for pre-filtering purposes. Genes with a
580
+#' number of distinct integration less than this number will be filtered out
581
+#' prior calculations. Single numeric or integer.
582
+#' @param gene_set_method One between "intersection" and "union". When merging
583
+#' the 2 data frames, `intersection` will perform an inner join operation,
584
+#' while `union` will perform a full join operation.
585
+#' @param significance_threshold Significance threshold for the Fisher's
586
+#' test p-value
587
+#' @param remove_unbalanced_0 Remove from the final output those pairs in
588
+#' which there are no IS for one group or the other and the number of
589
+#' IS of the non-missing group are less than the mean number of IS for that
590
+#' group
591
+#'
592
+#' @template genes_db
593
+#'
594
+#' @section Required tags:
595
+#' The function will explicitly check for the presence of these tags:
596
+#'
597
+#' ```{r echo=FALSE, results="asis"}
598
+#' all_tags <- available_tags()
599
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
600
+#'  ~ "gene_frequency_fisher" %in% .x)][["tag"]])
601
+#'  cat(paste0("* ", needed, collapse="\n"))
602
+#' ```
603
+#'
604
+#' @return A data frame
605
+#' @export
606
+#' @family Analysis functions
607
+#'
608
+#' @examples
609
+#' data("integration_matrices", package = "ISAnalytics")
610
+#' data("association_file", package = "ISAnalytics")
611
+#' aggreg <- aggregate_values_by_key(
612
+#'     x = integration_matrices,
613
+#'     association_file = association_file,
614
+#'     value_cols = c("seqCount", "fragmentEstimate")
615
+#' )
616
+#' cis <- CIS_grubbs(aggreg, by = "SubjectID")
617
+#' fisher <- gene_frequency_fisher(cis$cis$PT001, cis$cis$PT002,
618
+#'     min_is_per_gene = 2
619
+#' )
620
+#' fisher
621
+gene_frequency_fisher <- function(cis_x,
622
+    cis_y,
623
+    min_is_per_gene = 3,
624
+    gene_set_method = c("intersection", "union"),
625
+    onco_db_file = "proto_oncogenes",
626
+    tumor_suppressors_db_file = "tumor_suppressors",
627
+    species = "human",
628
+    known_onco = known_clinical_oncogenes(),
629
+    suspicious_genes =
630
+        clinical_relevant_suspicious_genes(),
631
+    significance_threshold = 0.05,
632
+    remove_unbalanced_0 = TRUE) {
633
+    ## --- Input checks
634
+    stopifnot(is.data.frame(cis_x) && is.data.frame(cis_y))
635
+    stopifnot(is.integer(min_is_per_gene) || is.numeric(min_is_per_gene))
636
+    gene_set_method <- rlang::arg_match(gene_set_method)
637
+    stopifnot(is.character(onco_db_file))
638
+    stopifnot(is.character(tumor_suppressors_db_file))
639
+    stopifnot(is.character(species))
640
+    stopifnot(is.data.frame(known_onco))
641
+    stopifnot(is.data.frame(suspicious_genes))
642
+    stopifnot(is.numeric(significance_threshold))
643
+    stopifnot(is.logical(remove_unbalanced_0))
644
+    ## -- Fetch gene symbol column
645
+    gene_sym_col <- .check_required_cols(
646
+        "gene_symbol", annotation_IS_vars(TRUE),
647
+        duplicate_politic = "error"
648
+    )[["names"]]
649
+    req_cis_cols <- c(
650
+        gene_sym_col, "n_IS_perGene", "average_TxLen",
651
+        "raw_gene_integration_frequency"
652
+    )
653
+    quiet_expand <- purrr::quietly(.expand_cis_df)
654
+    cols_for_join <- c(
655
+        gene_sym_col,
656
+        "Onco1_TS2", "ClinicalRelevance", "DOIReference",
657
+        "KnownGeneClass", "KnownClonalExpansion",
658
+        "CriticalForInsMut"
659
+    )
660
+    ## --- Calculations to perform on each df
661
+    append_calc <- function(df, group_n) {
662
+        if (!all(req_cis_cols %in% colnames(df))) {
663
+            rlang::abort(
664
+                .missing_needed_cols(req_cis_cols[!req_cis_cols %in%
665
+                    colnames(df)])
666
+            )
667
+        }
668
+        modified <- quiet_expand(
669
+            df, gene_sym_col,
670
+            onco_db_file, tumor_suppressors_db_file,
671
+            species, known_onco, suspicious_genes
672
+        )$result
673
+        modified <- modified %>%
674
+            dplyr::mutate(
675
+                IS_per_kbGeneLen = .data$raw_gene_integration_frequency * 1000,
676
+                Sum_IS_per_kbGeneLen = sum(.data$IS_per_kbGeneLen,
677
+                    na.rm = TRUE
678
+                ),
679
+                IS_per_kbGeneLen_perMDepth_TPM = (.data$IS_per_kbGeneLen /
680
+                    .data$Sum_IS_per_kbGeneLen) * 1e6
681
+            ) %>%
682
+            dplyr::filter(.data$n_IS_perGene >= min_is_per_gene) %>%
683
+            dplyr::select(dplyr::all_of(c(
684
+                req_cis_cols, cols_for_join,
685
+                "IS_per_kbGeneLen",
686
+                "Sum_IS_per_kbGeneLen",
687
+                "IS_per_kbGeneLen_perMDepth_TPM"
688
+            )))
689
+        colnames(modified)[!colnames(modified) %in% cols_for_join] <- paste(
690
+            colnames(modified)[!colnames(modified) %in% cols_for_join], group_n,
691
+            sep = "_"
692
+        )
693
+        return(modified)
694
+    }
695
+    cis_mod <- purrr::map2(list(cis_x, cis_y), c(1, 2), append_calc)
696
+    ## --- Merge the two in 1 df
697
+    merged <- if (gene_set_method == "union") {
698
+        purrr::reduce(cis_mod, ~ dplyr::full_join(.x, .y, by = cols_for_join))
699
+    } else {
700
+        purrr::reduce(cis_mod, ~ dplyr::inner_join(.x, .y, by = cols_for_join))
701
+    }
702
+    if (nrow(merged) == 0) {
703
+        if (getOption("ISAnalytics.verbose") == TRUE) {
704
+            msg <- c("Data frame empty after filtering",
705
+                i = paste(
706
+                    "Data frame is empty after applying filter on IS,",
707
+                    "is your filter too stringent?"
708
+                ),
709
+                x = "Nothing to return"
710
+            )
711
+            rlang::inform(msg, class = "empty_df_gene_freq")
712
+        }
713
+        return(NULL)
714
+    }
715
+    ## --- Actual computation of fisher test: test is applied on each row
716
+    ## (each gene)
717
+    merged <- merged %>%
718
+        dplyr::mutate(
719
+            tot_n_IS_perGene_1 = sum(cis_x$n_IS_perGene, na.rm = TRUE),
720
+            tot_n_IS_perGene_2 = sum(cis_y$n_IS_perGene, na.rm = TRUE)
721
+        )
722
+    compute_fisher <- function(...) {
723
+        row <- list(...)
724
+        n_IS_perGene_1 <- row$n_IS_perGene_1
725
+        n_IS_perGene_2 <- row$n_IS_perGene_2
726
+        n_IS_perGene_1[which(is.na(n_IS_perGene_1))] <- 0
727
+        n_IS_perGene_2[which(is.na(n_IS_perGene_2))] <- 0
728
+        matrix <- matrix(
729
+            data = c(
730
+                n_IS_perGene_1,
731
+                row$tot_n_IS_perGene_1 - n_IS_perGene_1,
732
+                n_IS_perGene_2,
733
+                row$tot_n_IS_perGene_2 - n_IS_perGene_2
734
+            ),
735
+            nrow = 2,
736
+            dimnames = list(
737
+                G1 = c("IS_of_gene", "TotalIS"),
738
+                G2 = c("IS_of_gene", "TotalIS")
739
+            )
740
+        )
741
+        ft <- stats::fisher.test(matrix)
742
+        return(ft$p.value)
743
+    }
744
+    merged <- merged %>%
745
+        dplyr::mutate(
746
+            Fisher_p_value = purrr::pmap_dbl(., compute_fisher)
747
+        ) %>%
748
+        dplyr::mutate(
749
+            Fisher_p_value_significant = dplyr::if_else(
750
+                condition = .data$Fisher_p_value < significance_threshold,
751
+                true = TRUE, false = FALSE
752
+            )
753
+        )
754
+    ## --- Removing unbalanced 0s if requested - this scenario applies
755
+    ## only if "union" is selected as method for join
756
+    if (remove_unbalanced_0) {
757
+        mean_is_per_gene_1 <- ceiling(mean(merged$n_IS_perGene_1, na.rm = TRUE))
758
+        mean_is_per_gene_2 <- ceiling(mean(merged$n_IS_perGene_2, na.rm = TRUE))
759
+        test_exclude <- function(...) {
760
+            row <- list(...)
761
+            if (is.na(row$n_IS_perGene_1) || is.na(row$n_IS_perGene_2)) {
762
+                to_ex <- ifelse(
763
+                    test = ((row$n_IS_perGene_1 < mean_is_per_gene_1) &
764
+                        (is.na(row$n_IS_perGene_2))) |
765
+                        ((is.na(row$n_IS_perGene_1)) &
766
+                            (row$n_IS_perGene_2 < mean_is_per_gene_2)),
767
+                    yes = TRUE,
768
+                    no = FALSE
769
+                )
770
+                return(to_ex)
771
+            }
772
+            return(FALSE)
773
+        }
774
+        merged <- merged %>%
775
+            dplyr::mutate(
776
+                to_exclude_from_test = purrr::pmap(., test_exclude)
777
+            ) %>%
778
+            dplyr::filter(.data$to_exclude_from_test == FALSE) %>%
779
+            dplyr::select(-.data$to_exclude_from_test)
780
+        if (nrow(merged) == 0) {
781
+            if (getOption("ISAnalytics.verbose") == TRUE) {
782
+                msg <- c("Data frame empty after filtering",
783
+                    i = paste(
784
+                        "Data frame is after removing unbalanced IS,",
785
+                        "nothing to return"
786
+                    )
787
+                )
788
+                rlang::inform(msg, class = "empty_df_gene_freq_unbal")
789
+            }
790
+            return(NULL)
791
+        }
792
+    }
793
+    ## --- Apply statistical corrections to p-value
794
+    merged <- merged %>%
795
+        dplyr::mutate(
796
+            Fisher_p_value_fdr = stats::p.adjust(.data$Fisher_p_value,
797
+                method = "fdr",
798
+                n = length(.data$Fisher_p_value)
799
+            ),
800
+            Fisher_p_value_benjamini = stats::p.adjust(.data$Fisher_p_value,
801
+                method = "BY",
802
+                n = length(.data$Fisher_p_value)
803
+            ),
804
+            minus_log10_pvalue = -log(.data$Fisher_p_value, base = 10)
805
+        ) %>%
806
+        dplyr::mutate(
807
+            minus_log10_pvalue_fdr = -log(.data$Fisher_p_value_fdr, base = 10),
808
+        )
809
+    return(merged)
810
+}
811
+
812
+
591 813
 #' Computes user specified functions on numerical columns and updates
592 814
 #' the metadata data frame accordingly.
593 815
 #'
594 816
 #' @description
595
-#' \lifecycle{experimental}
817
+#' `r lifecycle::badge("stable")`
596 818
 #' The function operates on a data frame by grouping the content by
597 819
 #' the sample key and computing every function specified on every
598 820
 #' column in the `value_columns` parameter. After that the metadata
... ...
@@ -628,14 +850,17 @@ top_integrations <- function(x,
628 850
 #' @param functions A named list of function or purrr-style lambdas
629 851
 #' @param add_integrations_count Add the count of distinct integration sites
630 852
 #' for each group? Can be computed only if `x` contains the mandatory columns
631
-#' `chr`, `integration_locus`, `strand`
853
+#' `mandatory_IS_vars()`
854
+#'
855
+#' @section Required tags:
856
+#' The function will explicitly check for the presence of these tags:
857
+#'
858
+#' * All columns declared in `mandatory_IS_vars()`
859
+#'
860
+#' These are checked only if `add_integrations_count = TRUE`.
632 861
 #'
633 862
 #' @family Analysis functions
634
-#' @importFrom rlang eval_tidy expr abort .data sym inform
635
-#' @importFrom purrr is_function is_formula map_lgl walk map set_names
636
-#' @importFrom dplyr group_by across all_of summarise rename_with bind_cols
637
-#' @importFrom dplyr n_distinct left_join
638
-#' @importFrom magrittr `%>%`
863
+#' @importFrom rlang .data sym
639 864
 #'
640 865
 #' @return A list with modified x and metadata data frames
641 866
 #' @export
... ...
@@ -649,7 +874,8 @@ top_integrations <- function(x,
649 874
 #'     value_columns = c("seqCount", "fragmentEstimate")
650 875
 #' )
651 876
 #' stats
652
-sample_statistics <- function(x, metadata,
877
+sample_statistics <- function(x,
878
+    metadata,
653 879
     sample_key = "CompleteAmplificationID",
654 880
     value_columns = "Value",
655 881
     functions = default_stats(),
... ...
@@ -676,7 +902,9 @@ sample_statistics <- function(x, metadata,
676 902
             is.integer(rlang::eval_tidy(expr))
677 903
     })
678 904
     if (any(vcols_are_numeric == FALSE)) {
679
-        rlang::abort(.non_num_user_cols_error(value_columns[!vcols_are_numeric]))
905
+        rlang::abort(.non_num_user_cols_error(
906
+            value_columns[!vcols_are_numeric]
907
+        ))
680 908
     }
681 909
     purrr::walk(functions, function(f) {
682 910
         if (!(purrr::is_function(f) | purrr::is_formula(f))) {
... ...
@@ -736,7 +964,8 @@ sample_statistics <- function(x, metadata,
736 964
 
737 965
 #' Grubbs test for Common Insertion Sites (CIS).
738 966
 #'
739
-#' \lifecycle{stable}
967
+#' @description
968
+#' `r lifecycle::badge("stable")`
740 969
 #' Statistical approach for the validation of common insertion sites
741 970
 #' significance based on the comparison of the integration frequency
742 971
 #' at the CIS gene with respect to other genes contained in the
... ...
@@ -746,9 +975,14 @@ sample_statistics <- function(x, metadata,
746 975
 #'
747 976
 #' @details
748 977
 #' ## Genomic annotation file
749
-#' This file is a data base, or more simply a .tsv file to import, with
750
-#' genes annotation for the specific genome. The annotations for the
751
-#' human genome (hg19) and murine genome (mm9) are already
978
+#' A data frame containing
979
+#' genes annotation for the specific genome.
980
+#' From version `1.5.4` the argument `genomic_annotation_file` accepts only
981
+#' data frames or package provided defaults.
982
+#' The user is responsible for importing the appropriate tabular files if
983
+#' customization is needed.
984
+#' The annotations for the human genome (hg19) and
985
+#' murine genome (mm9) are already
752 986
 #' included in this package: to use one of them just
753 987
 #' set the argument `genomic_annotation_file` to either `"hg19"` or
754 988
 #' `"mm9"`.
... ...
@@ -758,6 +992,16 @@ sample_statistics <- function(x, metadata,
758 992
 #'
759 993
 #' `r refGene_table_cols()`
760 994
 #'
995
+#' @section Required tags:
996
+#' The function will explicitly check for the presence of these tags:
997
+#'
998
+#' ```{r echo=FALSE, results="asis"}
999
+#' all_tags <- available_tags()
1000
+#' needed <- unique(all_tags[purrr::map_lgl(eval(rlang::sym("needed_in")),
1001
+#'  ~ "CIS_grubbs" %in% .x)][["tag"]])
1002
+#'  cat(paste0("* ", needed, collapse="\n"))
1003
+#' ```
1004
+#'
761 1005
 #' @param x An integration matrix, must include the `mandatory_IS_vars()`
762 1006
 #' columns and the `annotation_IS_vars()` columns
763 1007
 #' @param genomic_annotation_file Database file for gene annotation,
... ...
@@ -768,17 +1012,16 @@ sample_statistics <- function(x, metadata,
768 1012
 #' NULL, the function will perform calculations for each group and return
769 1013
 #' a list of data frames with the results. E.g. for `by = "SubjectID"`,
770 1014
 #' CIS will be computed for each distinct SubjectID found in the table
771
-#' (of course, "SubjectID" column must be included in the input data frame).
1015
+#' ("SubjectID" column must be included in the input data frame).
1016
+#' @param return_missing_as_df Returns those genes present in the input df
1017
+#' but not in the refgenes as a data frame?
1018
+#' @param results_as_list Relevant only if `by` is not `NULL` - if `TRUE`
1019
+#' return the group computations as a named list, otherwise return a single
1020
+#' df with an additional column containing the group id
772 1021
 #'
773 1022
 #' @family Analysis functions
774 1023
 #'
775
-#' @importFrom tibble as_tibble
776
-#' @importFrom rlang .data abort current_env eval_tidy sym
777
-#' @importFrom magrittr `%>%`
778
-#' @importFrom utils read.csv
779
-#' @importFrom stringr str_replace_all
780
-#' @importFrom tidyr unite
781
-#' @importFrom purrr set_names map
1024
+#' @importFrom rlang .data sym
782 1025
 #'
783 1026
 #' @return A data frame
784 1027
 #' @export
... ...
@@ -786,73 +1029,93 @@ sample_statistics <- function(x, metadata,
786 1029
 #' @examples
787 1030
 #' data("integration_matrices", package = "ISAnalytics")
788 1031
 #' cis <- CIS_grubbs(integration_matrices)
789
-#' head(cis)
1032
+#' cis
790 1033
 CIS_grubbs <- function(x,
791 1034
     genomic_annotation_file = "hg19",
792 1035
     grubbs_flanking_gene_bp = 100000,
793 1036
     threshold_alpha = 0.05,
794
-    by = NULL) {
795
-    # Check x has the correct structure
1037
+    by = NULL,
1038
+    return_missing_as_df = TRUE,
1039
+    results_as_list = TRUE) {
1040
+    ## Check x has the correct structure
796 1041
     stopifnot(is.data.frame(x))
797
-    if (!all(mandatory_IS_vars() %in% colnames(x))) {
798
-        rlang::abort(.missing_mand_vars())
1042
+    ## Check dyn vars for required tags
1043
+    req_mand_vars <- .check_required_cols(c("chromosome", "locus"),
1044
+        vars_df = mandatory_IS_vars(TRUE),
1045
+        duplicate_politic = "error"
1046
+    )
1047
+    chrom_col <- req_mand_vars %>%
1048
+        dplyr::filter(.data$tag == "chromosome") %>%
1049
+        dplyr::pull(.data$names)
1050
+    locus_col <- req_mand_vars %>%
1051
+        dplyr::filter(.data$tag == "locus") %>%
1052
+        dplyr::pull(.data$names)
1053
+    strand_col <- if ("is_strand" %in% mandatory_IS_vars(TRUE)$tag) {
1054
+        col <- mandatory_IS_vars(TRUE) %>%
1055
+            dplyr::filter(.data$tag == "is_strand") %>%
1056
+            dplyr::pull(.data$names)
1057
+        if (col %in% colnames(x)) {
1058
+            col
1059
+        } else {
1060
+            NULL
1061
+        }
1062
+    } else {
1063
+        NULL
799 1064
     }
800
-    if (!.is_annotated(x)) {
801
-        rlang::abort(.missing_annot())
1065
+    req_annot_col <- .check_required_cols(
1066
+        list(gene_symbol = "char", gene_strand = "char"),
1067
+        vars_df = annotation_IS_vars(TRUE),
1068
+        "error"
1069
+    )
1070
+    gene_symbol_col <- req_annot_col %>%
1071
+        dplyr::filter(.data$tag == "gene_symbol") %>%
1072
+        dplyr::pull(.data$names)
1073
+    gene_strand_col <- req_annot_col %>%
1074
+        dplyr::filter(.data$tag == "gene_strand") %>%
1075
+        dplyr::pull(.data$names)
1076
+    cols_required <- c(chrom_col, locus_col, gene_symbol_col, gene_strand_col)
1077
+    if (!all(cols_required %in% colnames(x))) {
1078
+        rlang::abort(.missing_user_cols_error(
1079
+            cols_required[!cols_required %in% colnames(x)]
1080
+        ))
802 1081
     }
803 1082
     # Check other parameters
804
-    stopifnot(is.character(genomic_annotation_file))
1083
+    stopifnot(is.data.frame(genomic_annotation_file) ||
1084
+        is.character(genomic_annotation_file))
805 1085
     genomic_annotation_file <- genomic_annotation_file[1]
806
-    if (genomic_annotation_file %in% c("hg19", "mm9")) {
1086
+    if (is.character(genomic_annotation_file) &&
1087
+        !genomic_annotation_file %in% c("hg19", "mm9")) {
1088
+        err_msg <- c("Genomic annotation file unknown",
1089
+            x = paste(
1090
+                "Since ISAnalytics 1.5.4, if provided as",
1091
+                "character vector, `genomic_annotation_file`",
1092
+                "parameter must be one of 'hg19' or 'mm9'"
1093
+            ),
1094
+            i = paste(
1095
+                "For using other genome reference files",
1096
+                "import them in the R environment and pass",
1097
+                "them to the function"
1098
+            )
1099
+        )
1100
+        rlang::abort(err_msg, class = "genomic_file_char")
1101
+    }
1102
+    if (is.character(genomic_annotation_file)) {
807 1103
         gen_file <- paste0("refGenes_", genomic_annotation_file)
808 1104
         utils::data(list = gen_file, envir = rlang::current_env())
809 1105
         refgenes <- rlang::eval_tidy(rlang::sym(gen_file))
810 1106
     } else {
811
-        stopifnot(file.exists(genomic_annotation_file))
812
-        # Determine file extension
813
-        ext <- .check_file_extension(genomic_annotation_file)
814
-        # Try to import annotation file
815
-        if (ext == "tsv") {
816
-            refgenes <- utils::read.csv(
817
-                file = genomic_annotation_file,
818
-                header = TRUE, fill = TRUE, sep = "\t",
819
-                check.names = FALSE,
820
-                na.strings = c("NONE", "NA", "NULL", "NaN", "")
821
-            )
822
-            # Check annotation file format
823
-            if (!all(refGene_table_cols() %in% colnames(refgenes))) {
824
-                rlang::abort(.non_standard_annotation_structure())
825
-            }
826
-            refgenes <- tibble::as_tibble(refgenes) %>%
827
-                dplyr::mutate(chrom = stringr::str_replace_all(
828
-                    .data$chrom,
829
-                    "chr", ""
830
-                ))
831
-        } else if (ext == "csv") {
832
-            refgenes <- utils::read.csv(
833
-                file = genomic_annotation_file,
834
-                header = TRUE, fill = TRUE,
835
-                check.names = FALSE,
836
-                na.strings = c("NONE", "NA", "NULL", "NaN", "")
837
-            )
838
-            # Check annotation file format
839
-            if (!all(refGene_table_cols() %in% colnames(refgenes))) {
840
-                rlang::abort(.non_standard_annotation_structure())
841
-            }
842
-            refgenes <- tibble::as_tibble(refgenes) %>%
843
-                dplyr::mutate(chrom = stringr::str_replace_all(
844
-                    .data$chrom,
845
-                    "chr", ""
846
-                ))
847
-        } else {
848
-            gen_file_err <- paste(
849
-                "The genomic annotation file must be either in",
850
-                ".tsv or .csv format (compressed or not)"
851
-            )
852
-            rlang::abort(gen_file_err)
1107
+        # Check annotation file format
1108
+        refgenes <- genomic_annotation_file
1109
+        if (!all(refGene_table_cols() %in% colnames(refgenes))) {
1110
+            rlang::abort(.non_standard_annotation_structure())
853 1111
         }
1112
+        refgenes <- tibble::as_tibble(refgenes) %>%
1113
+            dplyr::mutate(chrom = stringr::str_replace_all(
1114
+                .data$chrom,
1115
+                "chr", ""
1116
+            ))
854 1117
     }
855
-    stopifnot(is.numeric(grubbs_flanking_gene_bp) |
1118
+    stopifnot(is.numeric(grubbs_flanking_gene_bp) ||
856 1119
         is.integer(grubbs_flanking_gene_bp))
857 1120
     grubbs_flanking_gene_bp <- grubbs_flanking_gene_bp[1]
858 1121
     stopifnot(is.numeric(threshold_alpha))
... ...
@@ -861,13 +1124,20 @@ CIS_grubbs <- function(x,
861 1124
     if (!all(by %in% colnames(x))) {
862 1125
         rlang::abort(.missing_user_cols_error(by[!by %in% colnames(x)]))
863 1126
     }
864
-    result <- if (is.null(by)) {
865
-        .cis_grubb_calc(
1127
+    stopifnot(is.logical(return_missing_as_df))
1128
+    return_missing_as_df <- return_missing_as_df[1]
1129
+    if (is.null(by)) {
1130
+        result <- .cis_grubb_calc(
866 1131
             x = x,
867 1132
             refgenes = refgenes,
868 1133
             grubbs_flanking_gene_bp = grubbs_flanking_gene_bp,
869
-            threshold_alpha = threshold_alpha
1134
+            threshold_alpha = threshold_alpha,
1135
+            gene_symbol_col = gene_symbol_col,
1136
+            gene_strand_col = gene_strand_col,
1137
+            chr_col = chrom_col, locus_col = locus_col,
1138
+            strand_col = strand_col
870 1139
         )
1140
+        missing_df <- result$missing
871 1141
     } else {
872 1142
         grouped <- x %>%
873 1143
             dplyr::group_by(dplyr::across(dplyr::all_of(by)))
... ...
@@ -879,48 +1149,69 @@ CIS_grubbs <- function(x,
879 1149
             dplyr::group_by(dplyr::across(dplyr::all_of(by))) %>%
880 1150
             dplyr::group_split() %>%
881 1151
             purrr::set_names(group_keys)
882
-        purrr::map(split, ~ .cis_grubb_calc(
1152
+        result <- purrr::map(split, ~ .cis_grubb_calc(
883 1153
             x = .x,
884 1154
             refgenes = refgenes,
885 1155
             grubbs_flanking_gene_bp = grubbs_flanking_gene_bp,
886
-            threshold_alpha = threshold_alpha
1156
+            threshold_alpha = threshold_alpha,
1157
+            gene_symbol_col = gene_symbol_col,
1158
+            gene_strand_col = gene_strand_col,
1159
+            chr_col = chrom_col, locus_col = locus_col,
1160
+            strand_col = strand_col
887 1161
         ))
1162
+        missing_df <- purrr::map(result, ~ .x$missing) %>%
1163
+            purrr::reduce(dplyr::bind_rows) %>%
1164
+            dplyr::distinct()
888 1165
     }
889
-    return(result)
1166
+    if (nrow(missing_df) > 0 & getOption("ISAnalytics.verbose") == TRUE) {
1167
+        warn_miss <- c("Warning: missing genes in refgenes table",
1168
+            i = paste(paste(
1169
+                "A total of", nrow(missing_df),
1170
+                "genes",
1171
+                "were found in the input data but not",
1172
+                "in the refgene table. This may be caused by",
1173
+                "a mismatch in the annotation phase of",
1174
+                "the matrix. Here is a summary: "
1175
+            ),
1176
+            paste0(utils::capture.output({
1177
+                print(missing_df, n = Inf)
1178
+            }), collapse = "\n"),
1179
+            sep = "\n"
1180
+            ),
1181
+            i = paste(
1182
+                "NOTE: missing genes will be removed from",
1183
+                "the final output! Review results carefully"
1184
+            )
1185
+        )
1186
+        rlang::inform(warn_miss, class = "warn_miss_genes")
1187
+    }
1188
+    if (!is.null(by)) {
1189
+        result <- if (results_as_list) {
1190
+            purrr::map(result, ~ .x$df)
1191
+        } else {
1192
+            purrr::map2(result, names(result), ~ {
1193
+                .x$df %>%
1194
+                    dplyr::mutate(group = .y)
1195
+            }) %>% purrr::reduce(dplyr::bind_rows)
1196
+        }
1197
+        if (return_missing_as_df) {
1198
+            return(list(cis = result, missing_genes = missing_df))
1199
+        }
1200
+        return(result)
1201
+    }
1202
+    if (return_missing_as_df) {
1203
+        return(list(cis = result$df, missing_genes = missing_df))
1204
+    }
1205
+    return(result$df)
890 1206
 }
891 1207
 
892 1208
 #' Integrations cumulative count in time by sample
893 1209
 #'
894
-#' \lifecycle{experimental}
895
-#' This function computes the cumulative number of integrations
896
-#' observed in each sample at different time points by assuming that
897
-#' if an integration is observed at time point "t" then it is also observed in
898
-#' time point "t+1".
1210
+#' @description
1211
+#' `r lifecycle::badge("deprecated")`
1212
+#' This function was deprecated in favour of a single function,
1213
+#' please use `cumulative_is` instead.
899 1214
 #'
900
-#' @details
901
-#' ## Input data frame
902
-#' The user can provide as input for the `x` parameter both a simple
903
-#' integration matrix AND setting the `aggregate` parameter to TRUE,
904
-#' or provide an already aggregated matrix via
905
-#' \link{aggregate_values_by_key}.
906
-#' If the user supplies a matrix to be aggregated the `association_file`
907
-#' parameter must not be NULL: aggregation will be done by an internal
908
-#' call to the aggregation function.
909
-#' If the user supplies an already aggregated matrix, the `key` parameter
910
-#' is the key used for aggregation -
911
-#' **NOTE: for this operation is mandatory
912
-#' that the time point column is included in the key.**
913
-#' ## Assumptions on time point format
914
-#' By using the functions provided by this package, when imported,
915
-#' an association file will be correctly formatted for future usage.
916
-#' In the formatting process there is also a padding operation performed on
917
-#' time points: this means the functions expects the time point column to
918
-#' be of type character and to be correctly padded with 0s. If the
919
-#' chosen column for time point is detected as numeric the function will
920
-#' attempt the conversion to character and automatic padding.
921
-#' If you choose to import the association file not using the
922
-#' \link{import_association_file} function, be sure to check the format of
923
-#' the chosen column to avoid undesired results.
924 1215
 #'
925 1216
 #' @param x A simple integration matrix or an aggregated matrix (see details)
926 1217
 #' @param association_file NULL or the association file for x if `aggregate`
... ...
@@ -932,20 +1223,9 @@ CIS_grubbs <- function(x,
932 1223
 #' @param aggregate Should x be aggregated?
933 1224
 #' @param ... Additional parameters to pass to `aggregate_values_by_key`
934 1225
 #'
935
-#' @family Analysis functions
936
-#'
937
-#' @importFrom dplyr mutate filter across all_of select summarise group_by
938
-#' @importFrom dplyr arrange group_split first full_join starts_with distinct
939
-#' @importFrom dplyr semi_join n rename
940
-#' @importFrom magrittr `%>%`
941
-#' @importFrom rlang .data abort inform `:=`
942
-#' @importFrom stringr str_pad
943
-#' @importFrom purrr reduce is_empty
944
-#' @importFrom tidyr pivot_longer
945
-#' @importFrom stats na.omit
946
-#'
947 1226
 #' @return A data frame
948 1227
 #' @export
1228
+#' @keywords internal
949 1229
 #'
950 1230
 #' @examples
951 1231
 #' data("integration_matrices", package = "ISAnalytics")
... ...
@@ -970,120 +1250,26 @@ cumulative_count_union <- function(x,
970 1250
     zero = "0000",
971 1251
     aggregate = FALSE,
972 1252
     ...) {
973
-    stopifnot(is.data.frame(x))
974
-    stopifnot(is.data.frame(association_file) | is.null(association_file))
975
-    stopifnot(is.character(timepoint_column) & length(timepoint_column) == 1)
976
-    stopifnot(is.character(key))
977
-    stopifnot(is.logical(include_tp_zero))
978
-    stopifnot(is.character(zero) & length(zero) == 1)
979
-    stopifnot(is.logical(aggregate))
980
-    if (aggregate == TRUE & is.null(association_file)) {
981
-        rlang::abort(.agg_with_null_meta_err())
982
-    }
983
-    if (!all(timepoint_column %in% key)) {
984
-        rlang::abort(.key_without_tp_err())
985
-    }
986
-    if (aggregate == FALSE) {
987
-        if (!all(key %in% colnames(x))) {
988
-            rlang::abort(.key_not_found())
989
-        }
990
-    } else {
991
-        x <- aggregate_values_by_key(
992
-            x = x,
993
-            association_file = association_file,
994
-            key = key, ...
995
-        )
996
-        if (is.numeric(association_file[[timepoint_column]]) |
997
-            is.integer(association_file[[timepoint_column]])) {
998
-            max <- max(association_file[[timepoint_column]])
999
-            digits <- floor(log10(x)) + 1
1000
-            association_file <- association_file %>%
1001
-                dplyr::mutate(
1002
-                    {{ timepoint_column }} := stringr::str_pad(
1003
-                        as.character(.data$TimePoint),
1004
-                        digits,
1005
-                        side = "left",
1006
-                        pad = "0"
1007
-                    )
1008
-                )
1009
-            zero <- paste0(rep_len("0", digits), collapse = "")
1010
-        }
1011
-    }
1012
-    if (include_tp_zero == FALSE) {
1013
-        x <- x %>%
1014
-            dplyr::filter(dplyr::across(
1015
-                dplyr::all_of(timepoint_column),
1016
-                ~ .x != zero
1017
-            ))
1018
-        if (nrow(x) == 0) {
1019
-            all_tp0_msg <- paste(
1020
-                "All time points zeros were excluded, the data",
1021
-                "frame is empty."
1022
-            )
1023
-            rlang::inform(all_tp0_msg)
1024
-            return(x)
1025
-        }
1026
-    }
1027
-    annot <- if (.is_annotated(x)) {
1028
-        annotation_IS_vars()
1029
-    } else {
1030
-        character(0)
1031
-    }
1032
-    x <- x %>% dplyr::select(dplyr::all_of(c(
1033
-        mandatory_IS_vars(),
1034
-        annot,
1035
-        key
1036
-    )))
1037
-    cols_for_join <- colnames(x)[!colnames(x) %in% timepoint_column]
1038
-    key_minus_tp <- key[!key %in% timepoint_column]
1039
-    distinct_tp_for_each <- x %>%
1040
-        dplyr::arrange(dplyr::across(dplyr::all_of(timepoint_column))) %>%
1041
-        dplyr::group_by(dplyr::across(dplyr::all_of(key_minus_tp))) %>%
1042
-        dplyr::summarise(
1043
-            Distinct_tp = unique(
1044
-                `$`(.data, !!timepoint_column)
1045
-            ),
1046
-            .groups = "drop"
1047
-        ) %>%
1048
-        dplyr::rename({{ timepoint_column }} := "Distinct_tp")
1049
-
1050
-    splitted <- x %>%
1051
-        dplyr::arrange(dplyr::across(dplyr::all_of(timepoint_column))) %>%
1052
-        dplyr::group_by(dplyr::across(dplyr::all_of(timepoint_column))) %>%
1053
-        dplyr::group_split()
1054
-
1055
-    mult_tp <- purrr::reduce(splitted, dplyr::full_join, by = cols_for_join)
1056
-    tp_indexes <- grep(timepoint_column, colnames(mult_tp))
1057
-    tp_indexes <- tp_indexes[-1]
1058
-    res <- if (!purrr::is_empty(tp_indexes)) {
1059
-        for (i in tp_indexes) {
1060
-            mod_col <- mult_tp[[i]]
1061
-            mod_col_prec <- mult_tp[[i - 1]]
1062
-            val <- dplyr::first(stats::na.omit(mod_col))
1063
-            mod_col[!is.na(mod_col_prec)] <- val
1064
-            mult_tp[i] <- mod_col
1065
-        }
1066
-        mult_tp %>%
1067
-            tidyr::pivot_longer(
1068
-                cols = dplyr::starts_with(timepoint_column),
1069
-                values_to = timepoint_column,
1070
-                values_drop_na = TRUE
1071
-            ) %>%
1072
-            dplyr::select(-c("name")) %>%
1073
-            dplyr::distinct()
1074
-    } else {
1075
-        mult_tp
1076
-    }
1077
-    res <- res %>% dplyr::semi_join(distinct_tp_for_each, by = key)
1078
-    res <- res %>%
1079
-        dplyr::group_by(dplyr::across(dplyr::all_of(key))) %>%
1080
-        dplyr::summarise(count = dplyr::n(), .groups = "drop")
1081
-    return(res)
1253
+    lifecycle::deprecate_warn(
1254
+        when = "1.5.4",
1255
+        what = "cumulative_count_union()",
1256
+        with = "cumulative_is()",
1257
+        details = c(paste(
1258
+            "Use option `counts = TRUE`.",
1259
+            "Function will be likely dropped in the",
1260
+            "next release cycle"
1261
+        ))
1262
+    )
1263
+    cumulative_is(
1264
+        x = x, timepoint_col = timepoint_column, key = key,
1265
+        include_tp_zero = include_tp_zero, counts = TRUE
1266
+    )
1082 1267
 }
1083 1268
 
1084
-#' Expands integration matrix with the cumulative is union over time.
1269
+#' Expands integration matrix with the cumulative IS union over time.
1085 1270
 #'
1086
-#' @description \lifecycle{experimental}
1271
+#' @description
1272
+#' `r lifecycle::badge("experimental")`
1087 1273
 #' Given an input integration matrix that can be grouped over time,
1088 1274
 #' this function adds integrations in groups assuming that
1089 1275
 #' if an integration is observed at time point "t" then it is also observed in
... ...
@@ -1098,6 +1284,14 @@ cumulative_count_union <- function(x,
1098 1284
 #' @param expand If `FALSE`, for each group, the set of integration sites is
1099 1285
 #' returned in a separate column as a nested table, otherwise the resulting
1100 1286
 #' column is unnested.
1287
+#' @param counts Add cumulative counts? Logical
1288
+#'
1289
+#' @section Required tags:
1290
+#' The function will explicitly check for the presence of these tags:
1291
+#'
1292
+#' * All columns declared in `mandatory_IS_vars()`
1293
+#' * Checks if the matrix is annotated by assessing presence of
1294
+#' `annotation_IS_vars()`
1101 1295
 #'
1102 1296
 #' @family Analysis functions
1103 1297
 #' @return A data frame
... ...
@@ -1125,14 +1319,17 @@ cumulative_is <- function(x,
1125 1319
     ),
1126 1320
     timepoint_col = "TimePoint",
1127 1321
     include_tp_zero = FALSE,
1128
-    keep_og_is = TRUE,
1129
-    expand = FALSE) {
1322
+    counts = TRUE,
1323
+    keep_og_is = FALSE,
1324
+    expand = TRUE) {
1130 1325
     stopifnot(is.data.frame(x))
1131 1326
     stopifnot(is.character(key))
1132 1327
     stopifnot(is.character(timepoint_col))
1133 1328
     timepoint_col <- timepoint_col[1]
1134 1329
     stopifnot(is.logical(include_tp_zero))
1135 1330
     include_tp_zero <- include_tp_zero[1]
1331
+    stopifnot(is.logical(counts))
1332
+    counts <- counts[1]
1136 1333
     stopifnot(is.logical(keep_og_is))
1137 1334
     stopifnot(is.logical(expand))
1138 1335
     if (!timepoint_col %in% key) {
... ...
@@ -1153,7 +1350,7 @@ cumulative_is <- function(x,
1153 1350
         temp <- temp %>%
1154 1351
             dplyr::filter(.data[[timepoint_col]] != 0)
1155 1352
         if (nrow(temp) == 0) {
1156
-            rlang::inform(.only_zero_tp())
1353
+            rlang::inform(.only_zero_tp(), class = "only_zero_tps")
1157 1354
             return(NULL)
1158 1355
         }
1159 1356
     }
... ...
@@ -1163,13 +1360,13 @@ cumulative_is <- function(x,
1163 1360
         dplyr::distinct(dplyr::across(dplyr::all_of(is_vars)),
1164 1361
             .keep_all = TRUE