Browse code

[MAJOR UPDATE] Major update to version 1.3.3

Giulia Pais authored on 30/07/2021 16:20:39
Showing116 changed files

... ...
@@ -10,3 +10,4 @@
10 10
 ^pkgdown$
11 11
 ^doc$
12 12
 ^Design$
13
+^sample_reports$
... ...
@@ -35,3 +35,6 @@ unsrturl.bst
35 35
 inst/doc
36 36
 ISAnalytics.Rproj
37 37
 docs
38
+sample_reports
39
+README.Rmd
40
+NEWS.Rmd
... ...
@@ -1,6 +1,6 @@
1 1
 Package: ISAnalytics
2 2
 Title: Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies
3
-Version: 1.3.2
3
+Version: 1.3.3
4 4
 Date: 2020-07-03
5 5
 Authors@R: c(
6 6
   person(given = "Andrea",
... ...
@@ -25,8 +25,6 @@ Depends:
25 25
     magrittr
26 26
 Imports: 
27 27
     utils,
28
-    reactable,
29
-    htmltools,
30 28
     dplyr,
31 29
     readr,
32 30
     tidyr,
... ...
@@ -36,20 +34,18 @@ Imports:
36 34
     BiocParallel,
37 35
     stringr,
38 36
     fs,
39
-    zip,
40 37
     lubridate,
41 38
     lifecycle,
42 39
     ggplot2,
43 40
     ggrepel,
44 41
     stats,
45
-    upsetjs,
46 42
     psych,
47
-    grDevices,
48 43
     data.table,
49 44
     readxl,
50 45
     tools,
51 46
     Rcapture,
52
-    plotly
47
+    grDevices, 
48
+    zip
53 49
 Encoding: UTF-8
54 50
 LazyData: false
55 51
 Roxygen: list(markdown = TRUE)
... ...
@@ -59,7 +55,6 @@ Suggests:
59 55
     covr,
60 56
     knitr,
61 57
     BiocStyle,
62
-    knitcitations,
63 58
     sessioninfo,
64 59
     rmarkdown,
65 60
     roxygen2,
... ...
@@ -69,7 +64,13 @@ Suggests:
69 64
     ggalluvial,
70 65
     scales,
71 66
     gridExtra,
72
-    R.utils
67
+    R.utils,
68
+    RefManageR,
69
+    flexdashboard,
70
+    DT,
71
+    circlize,
72
+    plotly,
73
+    ggvenn
73 74
 VignetteBuilder: knitr
74 75
 RdMacros: 
75 76
     lifecycle
... ...
@@ -11,6 +11,7 @@ export(as_sparse_matrix)
11 11
 export(association_file_columns)
12 12
 export(available_outlier_tests)
13 13
 export(blood_lineages_default)
14
+export(circos_genomic_density)
14 15
 export(clinical_relevant_suspicious_genes)
15 16
 export(comparison_matrix)
16 17
 export(compute_abundance)
... ...
@@ -20,11 +21,13 @@ export(date_columns_coll)
20 21
 export(date_formats)
21 22
 export(default_iss_file_prefixes)
22 23
 export(default_meta_agg)
24
+export(default_report_path)
23 25
 export(default_stats)
24 26
 export(generate_Vispa2_launch_AF)
25 27
 export(generate_blank_association_file)
26 28
 export(import_Vispa2_stats)
27 29
 export(import_association_file)
30
+export(import_parallel_Vispa2Matrices)
28 31
 export(import_parallel_Vispa2Matrices_auto)
29 32
 export(import_parallel_Vispa2Matrices_interactive)
30 33
 export(import_single_Vispa2Matrix)
... ...
@@ -38,6 +41,7 @@ export(outliers_by_pool_fragments)
38 41
 export(quantification_types)
39 42
 export(realign_after_collisions)
40 43
 export(reduced_AF_columns)
44
+export(refGene_table_cols)
41 45
 export(remove_collisions)
42 46
 export(sample_statistics)
43 47
 export(separate_quant_matrices)
... ...
@@ -46,12 +50,8 @@ export(threshold_filter)
46 50
 export(top_abund_tableGrob)
47 51
 export(top_integrations)
48 52
 export(unzip_file_system)
49
-import(BiocParallel)
50
-import(dplyr)
51 53
 import(ggplot2)
52
-import(htmltools)
53 54
 import(lifecycle)
54
-import(upsetjs)
55 55
 importFrom(BiocParallel,MulticoreParam)
56 56
 importFrom(BiocParallel,SnowParam)
57 57
 importFrom(BiocParallel,bplapply)
... ...
@@ -61,28 +61,48 @@ importFrom(BiocParallel,bptry)
61 61
 importFrom(Rcapture,closedp.0)
62 62
 importFrom(Rcapture,closedp.bc)
63 63
 importFrom(data.table,.SD)
64
+importFrom(data.table,`%chin%`)
64 65
 importFrom(data.table,fread)
65 66
 importFrom(data.table,melt.data.table)
66 67
 importFrom(data.table,rbindlist)
67 68
 importFrom(data.table,setDT)
69
+importFrom(data.table,setnames)
68 70
 importFrom(dplyr,across)
69 71
 importFrom(dplyr,all_of)
72
+importFrom(dplyr,anti_join)
70 73
 importFrom(dplyr,arrange)
71 74
 importFrom(dplyr,bind_cols)
72 75
 importFrom(dplyr,bind_rows)
76
+importFrom(dplyr,contains)
77
+importFrom(dplyr,cur_column)
78
+importFrom(dplyr,desc)
73 79
 importFrom(dplyr,distinct)
80
+importFrom(dplyr,everything)
74 81
 importFrom(dplyr,filter)
82
+importFrom(dplyr,first)
75 83
 importFrom(dplyr,full_join)
76 84
 importFrom(dplyr,group_by)
85
+importFrom(dplyr,group_keys)
86
+importFrom(dplyr,group_modify)
77 87
 importFrom(dplyr,group_split)
88
+importFrom(dplyr,if_else)
78 89
 importFrom(dplyr,inner_join)
79 90
 importFrom(dplyr,intersect)
80 91
 importFrom(dplyr,left_join)
81 92
 importFrom(dplyr,mutate)
93
+importFrom(dplyr,n)
94
+importFrom(dplyr,n_distinct)
95
+importFrom(dplyr,pull)
82 96
 importFrom(dplyr,rename)
97
+importFrom(dplyr,rename_with)
83 98
 importFrom(dplyr,select)
84 99
 importFrom(dplyr,semi_join)
85 100
 importFrom(dplyr,slice)
101
+importFrom(dplyr,slice_head)
102
+importFrom(dplyr,starts_with)
103
+importFrom(dplyr,summarise)
104
+importFrom(dplyr,transmute)
105
+importFrom(dplyr,ungroup)
86 106
 importFrom(fs,as_fs_path)
87 107
 importFrom(fs,dir_create)
88 108
 importFrom(fs,dir_exists)
... ...
@@ -92,28 +112,34 @@ importFrom(fs,is_dir)
92 112
 importFrom(fs,path)
93 113
 importFrom(fs,path_dir)
94 114
 importFrom(fs,path_ext)
115
+importFrom(fs,path_ext_set)
116
+importFrom(fs,path_home)
95 117
 importFrom(fs,path_wd)
118
+importFrom(ggplot2,aes)
96 119
 importFrom(ggplot2,aes_)
120
+importFrom(ggplot2,element_blank)
121
+importFrom(ggplot2,element_text)
122
+importFrom(ggplot2,geom_hline)
123
+importFrom(ggplot2,geom_point)
124
+importFrom(ggplot2,geom_raster)
97 125
 importFrom(ggplot2,geom_text)
98 126
 importFrom(ggplot2,ggplot)
127
+importFrom(ggplot2,ggplot_build)
128
+importFrom(ggplot2,labs)
129
+importFrom(ggplot2,rel)
130
+importFrom(ggplot2,scale_alpha_continuous)
131
+importFrom(ggplot2,scale_fill_gradientn)
99 132
 importFrom(ggplot2,scale_fill_viridis_d)
133
+importFrom(ggplot2,scale_x_continuous)
134
+importFrom(ggplot2,scale_y_continuous)
100 135
 importFrom(ggplot2,sym)
136
+importFrom(ggplot2,theme)
137
+importFrom(ggplot2,unit)
101 138
 importFrom(ggrepel,geom_label_repel)
102
-importFrom(grDevices,colorRamp)
103
-importFrom(grDevices,rgb)
104
-importFrom(htmltools,browsable)
105
-importFrom(htmltools,div)
106
-importFrom(htmltools,h1)
107
-importFrom(htmltools,h2)
108
-importFrom(htmltools,h3)
109
-importFrom(htmltools,h4)
110
-importFrom(htmltools,save_html)
111
-importFrom(htmltools,tagList)
112
-importFrom(htmltools,tags)
139
+importFrom(lifecycle,deprecate_warn)
113 140
 importFrom(lubridate,parse_date_time)
141
+importFrom(lubridate,today)
114 142
 importFrom(magrittr,`%>%`)
115
-importFrom(plotly,plot_ly)
116
-importFrom(psych,describe)
117 143
 importFrom(purrr,cross_df)
118 144
 importFrom(purrr,detect_index)
119 145
 importFrom(purrr,flatten)
... ...
@@ -129,9 +155,11 @@ importFrom(purrr,map2_lgl)
129 155
 importFrom(purrr,map_chr)
130 156
 importFrom(purrr,map_dbl)
131 157
 importFrom(purrr,map_dfr)
158
+importFrom(purrr,map_int)
132 159
 importFrom(purrr,map_lgl)
133 160
 importFrom(purrr,pmap)
134 161
 importFrom(purrr,pmap_chr)
162
+importFrom(purrr,pmap_dbl)
135 163
 importFrom(purrr,pmap_df)
136 164
 importFrom(purrr,pmap_dfr)
137 165
 importFrom(purrr,pmap_lgl)
... ...
@@ -139,22 +167,18 @@ importFrom(purrr,reduce)
139 167
 importFrom(purrr,set_names)
140 168
 importFrom(purrr,walk)
141 169
 importFrom(purrr,walk2)
142
-importFrom(reactable,colDef)
143
-importFrom(reactable,colFormat)
144
-importFrom(reactable,reactable)
145
-importFrom(reactable,reactableTheme)
146 170
 importFrom(readr,cols)
147
-importFrom(readr,parse_factor)
148 171
 importFrom(readr,problems)
149 172
 importFrom(readr,read_delim)
150 173
 importFrom(readr,write_tsv)
151
-importFrom(readxl,read_excel)
152 174
 importFrom(rlang,.data)
175
+importFrom(rlang,`!!!`)
153 176
 importFrom(rlang,`:=`)
154 177
 importFrom(rlang,abort)
155 178
 importFrom(rlang,arg_match)
156 179
 importFrom(rlang,as_function)
157 180
 importFrom(rlang,call2)
181
+importFrom(rlang,current_env)
158 182
 importFrom(rlang,dots_list)
159 183
 importFrom(rlang,enexpr)
160 184
 importFrom(rlang,env_bind)
... ...
@@ -167,11 +191,9 @@ importFrom(rlang,is_formula)
167 191
 importFrom(rlang,is_function)
168 192
 importFrom(rlang,list2)
169 193
 importFrom(rlang,parse_expr)
194
+importFrom(rlang,sym)
170 195
 importFrom(stats,dt)
171
-importFrom(stats,median)
172 196
 importFrom(stats,na.omit)
173
-importFrom(stats,p.adjust)
174
-importFrom(stats,pt)
175 197
 importFrom(stats,setNames)
176 198
 importFrom(stats,shapiro.test)
177 199
 importFrom(stringr,str_detect)
... ...
@@ -179,17 +201,14 @@ importFrom(stringr,str_pad)
179 201
 importFrom(stringr,str_replace)
180 202
 importFrom(stringr,str_replace_all)
181 203
 importFrom(stringr,str_split)
204
+importFrom(stringr,str_to_lower)
182 205
 importFrom(stringr,str_to_upper)
206
+importFrom(tibble,add_case)
183 207
 importFrom(tibble,add_column)
184
-importFrom(tibble,add_row)
185 208
 importFrom(tibble,as_tibble)
186 209
 importFrom(tibble,as_tibble_col)
187
-importFrom(tibble,as_tibble_row)
188
-importFrom(tibble,is_tibble)
189 210
 importFrom(tibble,tibble)
190
-importFrom(tibble,tibble_row)
191 211
 importFrom(tibble,tribble)
192
-importFrom(tidyr,everything)
193 212
 importFrom(tidyr,nest)
194 213
 importFrom(tidyr,pivot_longer)
195 214
 importFrom(tidyr,pivot_wider)
... ...
@@ -2,6 +2,45 @@
2 2
 title: "NEWS"
3 3
 output: github_document
4 4
 ---
5
+# ISAnalytics 1.3.3 (TBD)
6
+
7
+## MAJOR CHANGES
8
+* Completely reworked interactive HTML report system, for details take
9
+a look at the new vignette `vignette("report_system", package = "ISAnalytics")`
10
+* Old `ISAnalytics.widgets` option has been replaced by `ISAnalytics.reports`
11
+* In `remove_collisions()`, removed arguments `seq_count_col`,
12
+`max_rows_reports` and `save_widget_path`, added arguments `quant_cols`
13
+and `report_path` (see documentation for details)
14
+
15
+## MINOR CHANGES
16
+* `import_single_Vispa2Matrix()` now allows keeping additional non-standard
17
+columns
18
+* `compute_near_integrations()` is now faster on bigger data sets
19
+* Changed default values for arguments `columns` and `key` in 
20
+`compute_abundance()`
21
+* `compute_near_integrations()` now produces only re-calibration map in *.tsv
22
+format
23
+* `CIS_grubbs()` now supports calculations for each group specified in argument
24
+`by`
25
+* In `sample_statistics()` now there is the option to include the calculation
26
+of distinct integration sites for each group (if mandatory vars are present)
27
+
28
+## NEW FUNCTIONALITY
29
+* Added new plotting function `circos_genomic_density()`
30
+
31
+## FIXES
32
+* Fixed minor issue with NA values in alluvial plots
33
+
34
+## DEPRECATIONS
35
+* `import_parallel_Vispa2Matrices_interactive()` and
36
+`import_parallel_Vispa2Matrices_auto()` are officially deprecated in favor
37
+of `import_parallel_Vispa2Matrices()`
38
+
39
+## OTHER
40
+* The package has now a more complete and functional example data set 
41
+for executable examples
42
+* Reworked documentation
43
+
5 44
 # ISAnalytics 1.3.2 (2021-06-28)
6 45
 
7 46
 ## FIXES
... ...
@@ -1,6 +1,54 @@
1 1
 NEWS
2 2
 ================
3 3
 
4
+# ISAnalytics 1.3.3 (TBD)
5
+
6
+## MAJOR CHANGES
7
+
8
+-   Completely reworked interactive HTML report system, for details take
9
+    a look at the new vignette
10
+    `vignette("report_system", package = "ISAnalytics")`
11
+-   Old `ISAnalytics.widgets` option has been replaced by
12
+    `ISAnalytics.reports`
13
+-   In `remove_collisions()`, removed arguments `seq_count_col`,
14
+    `max_rows_reports` and `save_widget_path`, added arguments
15
+    `quant_cols` and `report_path` (see documentation for details)
16
+
17
+## MINOR CHANGES
18
+
19
+-   `import_single_Vispa2Matrix()` now allows keeping additional
20
+    non-standard columns
21
+-   `compute_near_integrations()` is now faster on bigger data sets
22
+-   Changed default values for arguments `columns` and `key` in
23
+    `compute_abundance()`
24
+-   `compute_near_integrations()` now produces only re-calibration map
25
+    in \*.tsv format
26
+-   `CIS_grubbs()` now supports calculations for each group specified in
27
+    argument `by`
28
+-   In `sample_statistics()` now there is the option to include the
29
+    calculation of distinct integration sites for each group (if
30
+    mandatory vars are present)
31
+
32
+## NEW FUNCTIONALITY
33
+
34
+-   Added new plotting function `circos_genomic_density()`
35
+
36
+## FIXES
37
+
38
+-   Fixed minor issue with NA values in alluvial plots
39
+
40
+## DEPRECATIONS
41
+
42
+-   `import_parallel_Vispa2Matrices_interactive()` and
43
+    `import_parallel_Vispa2Matrices_auto()` are officially deprecated in
44
+    favor of `import_parallel_Vispa2Matrices()`
45
+
46
+## OTHER
47
+
48
+-   The package has now a more complete and functional example data set
49
+    for executable examples
50
+-   Reworked documentation
51
+
4 52
 # ISAnalytics 1.3.2 (2021-06-28)
5 53
 
6 54
 ## FIXES
7 55
new file mode 100644
... ...
@@ -0,0 +1,15 @@
1
+
2
+#' @title Deprecated functions in package \pkg{ISAnalytics}.
3
+#' @description These functions are provided for compatibility
4
+#'  with older versions of \sQuote{ISAnalytics} only,
5
+#'  and will be defunct at the next release.
6
+#' @name ISAnalytics-deprecated
7
+#' @details The following functions are deprecated and will be made defunct;
8
+#' use the replacement indicated below:
9
+#'
10
+#' * import_parallel_Vispa2Matrices_auto:
11
+#'  \code{\link{import_parallel_Vispa2Matrices}}
12
+#'  * import_parallel_Vispa2Matrices_interactive:
13
+#'  \code{\link{import_parallel_Vispa2Matrices}}
14
+#' @keywords internal
15
+NULL
... ...
@@ -2,7 +2,7 @@
2 2
 #' identified from genomics next generation sequencing reads for
3 3
 #' clonal tracking studies
4 4
 #'
5
-#' @description \lifecycle{maturing}
5
+#' @description \lifecycle{stable}
6 6
 #' In gene therapy, stem cells are modified using viral
7 7
 #' vectors to deliver the therapeutic transgene and replace functional
8 8
 #' properties since the genetic modification is stable and inherited in
... ...
@@ -27,8 +27,7 @@
27 27
 #'   * \code{\link{import_single_Vispa2Matrix}}
28 28
 #'   * \code{\link{import_association_file}}
29 29
 #'   * \code{\link{import_Vispa2_stats}}
30
-#'   * \code{\link{import_parallel_Vispa2Matrices_interactive}}
31
-#'   * \code{\link{import_parallel_Vispa2Matrices_auto}}
30
+#'   * \code{\link{import_parallel_Vispa2Matrices}}
32 31
 #' * Aggregation functions:
33 32
 #'   * \code{\link{aggregate_metadata}}
34 33
 #'   * \code{\link{aggregate_values_by_key}}
... ...
@@ -58,6 +57,7 @@
58 57
 #'   * \code{\link{sharing_heatmap}}
59 58
 #'   * \code{\link{integration_alluvial_plot}}
60 59
 #'   * \code{\link{top_abund_tableGrob}}
60
+#'   * \code{\link{circos_genomic_density}}
61 61
 #' * Utility functions:
62 62
 #'   * \code{\link{generate_blank_association_file}}
63 63
 #'   * \code{\link{generate_Vispa2_launch_AF}}
... ...
@@ -65,16 +65,17 @@
65 65
 #'   * \code{\link{as_sparse_matrix}}
66 66
 #'
67 67
 #' @section Vignettes:
68
-#' * \code{vignette("How to use import functions", package = "ISAnalytics")}
69
-#' * \code{vignette("Collision removal functionality",
68
+#' * \code{vignette("how_to_import_functions", package = "ISAnalytics")}
69
+#' * \code{vignette("collision_removal",
70 70
 #' package = "ISAnalytics")}
71
-#' * \code{vignette("Working with aggregate functions",
71
+#' * \code{vignette("aggregate_function_usage",
72 72
 #' package = "ISAnalytics")}
73
-#' * \code{vignette("Using ISAnalytics without RStudio support",
73
+#' * \code{vignette("report_system",
74 74
 #' package = "ISAnalytics")}
75 75
 #'
76 76
 #' @docType package
77 77
 #' @name ISAnalytics
78
+#' @keywords internal
78 79
 NULL
79 80
 
80 81
 ## usethis namespace: start
... ...
@@ -1,17 +1,17 @@
1 1
 #------------------------------------------------------------------------------#
2 2
 # Aggregate functions
3 3
 #------------------------------------------------------------------------------#
4
-
5 4
 #' Performs aggregation on metadata contained in the association file.
6 5
 #'
7
-#' \lifecycle{maturing}
6
+#' \lifecycle{stable}
8 7
 #' Groups metadata by the specified grouping keys and returns a
9 8
 #' summary of info for each group. For more details on how to use this function:
10
-#' \code{vignette("Working with aggregate functions", package = "ISAnalytics")}
9
+#' \code{vignette("aggregate_function_usage", package = "ISAnalytics")}
11 10
 #'
12 11
 #' @param association_file The imported association file
13
-#' (via link{import_association_file})
14
-#' @param grouping_keys A character vector of column names to form a group
12
+#' (via \link{import_association_file})
13
+#' @param grouping_keys A character vector of column names to form a grouping
14
+#' operation
15 15
 #' @param aggregating_functions A data frame containing specifications
16 16
 #' of the functions to be applied to columns in the association file during
17 17
 #' aggregation. It defaults to \link{default_meta_agg}. The structure of
... ...
@@ -29,17 +29,11 @@
29 29
 #' @export
30 30
 #'
31 31
 #' @examples
32
-#' op <- options("ISAnalytics.widgets" = FALSE, "ISAnalytics.verbose" = FALSE)
33
-#' path_AF <- system.file("extdata", "ex_association_file.tsv",
34
-#'     package = "ISAnalytics"
35
-#' )
36
-#' root_correct <- system.file("extdata", "fs.zip", package = "ISAnalytics")
37
-#' root_correct <- unzip_file_system(root_correct, "fs")
38
-#' association_file <- import_association_file(path_AF, root_correct,
39
-#'     dates_format = "dmy"
32
+#' data("association_file", package = "ISAnalytics")
33
+#' aggreg_meta <- aggregate_metadata(
34
+#'     association_file = association_file
40 35
 #' )
41
-#' aggregated_meta <- aggregate_metadata(association_file)
42
-#' options(op)
36
+#' head(aggreg_meta)
43 37
 aggregate_metadata <- function(association_file,
44 38
     grouping_keys = c(
45 39
         "SubjectID",
... ...
@@ -78,11 +72,12 @@ aggregate_metadata <- function(association_file,
78 72
         function_tbl = aggregating_functions
79 73
     )
80 74
     if (is.null(aggregated)) {
81
-        rlang::inform(paste(
75
+        msg <- paste(
82 76
             "No columns in `aggregating_functions$Column`",
83 77
             "was found in column names of the association",
84 78
             "file. Nothing to return."
85
-        ))
79
+        )
80
+        rlang::inform(msg)
86 81
     }
87 82
     aggregated
88 83
 }
... ...
@@ -145,11 +140,11 @@ default_meta_agg <- function() {
145 140
 
146 141
 #' Aggregates matrices values based on specified key.
147 142
 #'
148
-#' \lifecycle{maturing}
143
+#' \lifecycle{stable}
149 144
 #' Performs aggregation on values contained in the integration matrices based
150 145
 #' on the key and the specified lambda. For more details on how to use this
151 146
 #' function:
152
-#' \code{vignette("Working with aggregate functions", package = "ISAnalytics")}
147
+#' \code{vignette("aggregate_function_usage", package = "ISAnalytics")}
153 148
 #'
154 149
 #' @details
155 150
 #' ## Setting the lambda parameter
... ...
@@ -190,8 +185,8 @@ default_meta_agg <- function() {
190 185
 #' * Function should return a single value or a list/data frame:
191 186
 #' if a list or a data frame is returned as a result, all the columns
192 187
 #' will be added to the final data frame.
193
-#' @param x A single integration matrix (tibble) or a list of imported
194
-#' integration matrices (tibble)
188
+#' @param x A single integration matrix or a list of imported
189
+#' integration matrices
195 190
 #' @param association_file The imported association file
196 191
 #' @param value_cols A character vector containing the names of the
197 192
 #' columns to apply the given lambdas. Must be numeric or integer
... ...
@@ -215,26 +210,14 @@ default_meta_agg <- function() {
215 210
 #' @export
216 211
 #'
217 212
 #' @examples
218
-#' op <- options("ISAnalytics.widgets" = FALSE, "ISAnalytics.verbose" = FALSE)
219
-#' path_AF <- system.file("extdata", "ex_association_file.tsv",
220
-#'     package = "ISAnalytics"
221
-#' )
222
-#' root_correct <- system.file("extdata", "fs.zip", package = "ISAnalytics")
223
-#' root_correct <- unzip_file_system(root_correct, "fs")
224
-#' association_file <- import_association_file(path_AF, root_correct,
225
-#'     dates_format = "dmy"
226
-#' )
227
-#' matrices <- import_parallel_Vispa2Matrices_auto(
228
-#'     association_file = association_file, root = NULL,
229
-#'     quantification_type = c("fragmentEstimate", "seqCount"),
230
-#'     matrix_type = "annotated", workers = 2, matching_opt = "ANY"
231
-#' )
232
-#' agg <- aggregate_values_by_key(
233
-#'     x = matrices,
213
+#' data("integration_matrices", package = "ISAnalytics")
214
+#' data("association_file", package = "ISAnalytics")
215
+#' aggreg <- aggregate_values_by_key(
216
+#'     x = integration_matrices,
234 217
 #'     association_file = association_file,
235
-#'     value_cols = c("fragmentEstimate", "seqCount")
218
+#'     value_cols = c("seqCount", "fragmentEstimate")
236 219
 #' )
237
-#' options(op)
220
+#' head(aggreg)
238 221
 aggregate_values_by_key <- function(x,
239 222
     association_file,
240 223
     value_cols = "Value",
... ...
@@ -255,7 +238,7 @@ aggregate_values_by_key <- function(x,
255 238
         purrr::walk(x, function(df) {
256 239
             stopifnot(is.data.frame(df))
257 240
             if (.check_mandatory_vars(df) == FALSE) {
258
-                rlang::abort(.non_ISM_error())
241
+                rlang::abort(.missing_mand_vars())
259 242
             }
260 243
             if (!all(join_af_by %in% colnames(df))) {
261 244
                 rlang::abort(c(
... ...
@@ -293,7 +276,7 @@ aggregate_values_by_key <- function(x,
293 276
         })
294 277
     } else {
295 278
         if (.check_mandatory_vars(x) == FALSE) {
296
-            rlang::abort(.non_ISM_error())
279
+            rlang::abort(.missing_mand_vars())
297 280
         }
298 281
         if (!all(join_af_by %in% colnames(x))) {
299 282
             rlang::abort(c(
... ...
@@ -4,7 +4,7 @@
4 4
 
5 5
 #' Computes the abundance for every integration event in the input data frame.
6 6
 #'
7
-#' \lifecycle{maturing}
7
+#' \lifecycle{stable}
8 8
 #' Abundance is obtained for every integration event by calculating the ratio
9 9
 #' between the single value and the total value for the given group.
10 10
 #'
... ...
@@ -30,8 +30,9 @@
30 30
 #' @family Analysis functions
31 31
 #'
32 32
 #' @importFrom magrittr `%>%`
33
-#' @import dplyr
34
-#' @importFrom rlang .data eval_tidy parse_expr
33
+#' @importFrom dplyr group_by across all_of summarise left_join mutate
34
+#' @importFrom dplyr cur_column distinct select contains rename_with
35
+#' @importFrom rlang .data eval_tidy parse_expr abort
35 36
 #' @importFrom purrr map_lgl
36 37
 #' @importFrom stringr str_replace
37 38
 #' @return Either a single data frame with computed abundance values or
... ...
@@ -39,29 +40,24 @@
39 40
 #' @export
40 41
 #'
41 42
 #' @examples
42
-#' path <- system.file("extdata", "ex_annotated_ISMatrix.tsv.xz",
43
-#'     package = "ISAnalytics"
43
+#' data("integration_matrices", package = "ISAnalytics")
44
+#' abund <- compute_abundance(
45
+#'     x = integration_matrices,
46
+#'     columns = "fragmentEstimate",
47
+#'     key = "CompleteAmplificationID"
44 48
 #' )
45
-#' matrix <- import_single_Vispa2Matrix(path)
46
-#'
47
-#' # Simple integration matrix - grouping by CompleteAmplificationID
48
-#' abundance1 <- compute_abundance(matrix)
49
-#' abundance1
50
-#'
51
-#' # Keeping totals as a separate data frame
52
-#' abundance2 <- compute_abundance(matrix, keep_totals = "df")
53
-#' abundance2
49
+#' head(abund)
54 50
 compute_abundance <- function(x,
55
-    columns = "Value",
51
+    columns = c("fragmentEstimate_sum"),
56 52
     percentage = TRUE,
57
-    key = "CompleteAmplificationID",
53
+    key = c("SubjectID", "CellMarker", "Tissue", "TimePoint"),
58 54
     keep_totals = FALSE) {
59 55
     ## Check parameters
60 56
     stopifnot(is.data.frame(x))
61 57
     stopifnot(is.character(columns))
62 58
     stopifnot(is.character(key))
63 59
     if (.check_mandatory_vars(x) == FALSE) {
64
-        stop(.non_ISM_error())
60
+        rlang::abort(.missing_mand_vars())
65 61
     }
66 62
     stopifnot(is.logical(percentage) & length(percentage) == 1)
67 63
     if (!all(columns %in% colnames(x)) | !all(key %in% colnames(x))) {
... ...
@@ -81,7 +77,7 @@ compute_abundance <- function(x,
81 77
         }
82 78
     })
83 79
     if (any(non_num_cols)) {
84
-        stop(.non_num_user_cols_error(columns[non_num_cols]))
80
+        rlang::abort(.non_num_user_cols_error(columns[non_num_cols]))
85 81
     }
86 82
     stopifnot(is.logical(keep_totals) || keep_totals == "df")
87 83
     ## Computation
... ...
@@ -139,8 +135,8 @@ compute_abundance <- function(x,
139 135
 #' obtain a single integration matrix from individual quantification
140 136
 #' matrices.
141 137
 #'
142
-#' \lifecycle{maturing}
143
-#' Takes a list of integration matrices referring to different qunatification
138
+#' \lifecycle{stable}
139
+#' Takes a list of integration matrices referring to different quantification
144 140
 #' types and merges them in a single data frame that has multiple
145 141
 #' value columns, each renamed according to their quantification type
146 142
 #' of reference.
... ...
@@ -171,21 +167,22 @@ compute_abundance <- function(x,
171 167
 #' @export
172 168
 #'
173 169
 #' @examples
174
-#' op <- options("ISAnalytics.widgets" = FALSE)
175
-#' path <- system.file("extdata", "ex_association_file.tsv",
170
+#' fs_path <- system.file("extdata", "fs.zip", package = "ISAnalytics")
171
+#' fs <- unzip_file_system(fs_path, "fs")
172
+#' af_path <- system.file("extdata", "asso.file.tsv.gz",
176 173
 #'     package = "ISAnalytics"
177 174
 #' )
178
-#' root_pth <- system.file("extdata", "fs.zip", package = "ISAnalytics")
179
-#' root <- unzip_file_system(root_pth, "fs")
180
-#' matrices <- import_parallel_Vispa2Matrices_auto(
181
-#'     association_file = path, root = root,
182
-#'     quantification_type = c("fragmentEstimate", "seqCount"),
183
-#'     matrix_type = "annotated", workers = 2, patterns = NULL,
184
-#'     matching_opt = "ANY",
185
-#'     dates_format = "dmy", multi_quant_matrix = FALSE
175
+#' af <- import_association_file(af_path,
176
+#'     root = fs,
177
+#'     import_iss = FALSE,
178
+#'     report_path = NULL
186 179
 #' )
187
-#' total_matrix <- comparison_matrix(matrices)
188
-#' options(op)
180
+#' matrices <- import_parallel_Vispa2Matrices(af,
181
+#'     c("seqCount", "fragmentEstimate"),
182
+#'     mode = "AUTO", report_path = NULL, multi_quant_matrix = FALSE
183
+#' )
184
+#' multi_quant <- comparison_matrix(matrices)
185
+#' head(multi_quant)
189 186
 comparison_matrix <- function(x,
190 187
     fragmentEstimate = "fragmentEstimate",
191 188
     seqCount = "seqCount",
... ...
@@ -226,7 +223,7 @@ comparison_matrix <- function(x,
226 223
 #' Separate a multiple-quantification matrix into single quantification
227 224
 #' matrices.
228 225
 #'
229
-#' \lifecycle{maturing}
226
+#' \lifecycle{stable}
230 227
 #' The function separates a single multi-quantification integration
231 228
 #' matrix, obtained via \link{comparison_matrix}, into single
232 229
 #' quantification matrices as a named list of tibbles.
... ...
@@ -256,24 +253,11 @@ comparison_matrix <- function(x,
256 253
 #' @export
257 254
 #'
258 255
 #' @examples
259
-#' op <- options("ISAnalytics.widgets" = FALSE)
260
-#' path <- system.file("extdata", "ex_association_file.tsv",
261
-#'     package = "ISAnalytics"
256
+#' data("integration_matrices", package = "ISAnalytics")
257
+#' separated <- separate_quant_matrices(
258
+#'     integration_matrices
262 259
 #' )
263
-#' root_pth <- system.file("extdata", "fs.zip", package = "ISAnalytics")
264
-#' root <- unzip_file_system(root_pth, "fs")
265
-#' association_file <- import_association_file(
266
-#'     path = path, root = root,
267
-#'     dates_format = "dmy"
268
-#' )
269
-#' matrices <- import_parallel_Vispa2Matrices_auto(
270
-#'     association_file = association_file,
271
-#'     quantification_type = c("seqCount", "fragmentEstimate"),
272
-#'     matrix_type = "annotated", workers = 2, patterns = NULL,
273
-#'     matching_opt = "ANY"
274
-#' )
275
-#' separated_matrix <- separate_quant_matrices(matrices)
276
-#' options(op)
260
+#' separated
277 261
 separate_quant_matrices <- function(x,
278 262
     fragmentEstimate = "fragmentEstimate",
279 263
     seqCount = "seqCount",
... ...
@@ -315,7 +299,9 @@ separate_quant_matrices <- function(x,
315 299
         rlang::inform(.non_quant_cols_msg(to_copy))
316 300
     }
317 301
     separated <- purrr::map(num_cols, function(quant) {
318
-        x[c(key, to_copy, quant)] %>% dplyr::rename(Value = quant)
302
+        x %>%
303
+            dplyr::select(dplyr::all_of(c(key, to_copy, quant))) %>%
304
+            dplyr::rename(Value = quant)
319 305
     }) %>% purrr::set_names(names(num_cols))
320 306
     separated
321 307
 }
... ...
@@ -388,12 +374,12 @@ separate_quant_matrices <- function(x,
388 374
 #' print(example_list)
389 375
 #'
390 376
 #' filtered <- threshold_filter(example_list,
391
-#'                              threshold = list(first = c(20, 60),
392
-#'                                               third = c(25)),
393
-#'                              cols_to_compare = list(first = c("a", "b"),
394
-#'                                                     third = c("a")),
395
-#'                              comparators = list(first = c(">", "<"),
396
-#'                                                 third = c(">=")))
377
+#' threshold = list(first = c(20, 60),
378
+#' third = c(25)),
379
+#' cols_to_compare = list(first = c("a", "b"),
380
+#' third = c("a")),
381
+#' comparators = list(first = c(">", "<"),
382
+#' third = c(">=")))
397 383
 #' print(filtered)
398 384
 #'
399 385
 #' ```
... ...
@@ -411,11 +397,11 @@ separate_quant_matrices <- function(x,
411 397
 #'
412 398
 #' ```r
413 399
 #' filtered <- threshold_filter(example_list,
414
-#'                              threshold = list(first = c(20, 60),
415
-#'                                               third = c(25, 65)),
416
-#'                              cols_to_compare = c("a", "b"),
417
-#'                              comparators = list(first = c(">", "<"),
418
-#'                                                 third = c(">=", "<=")))
400
+#' threshold = list(first = c(20, 60),
401
+#' third = c(25, 65)),
402
+#' cols_to_compare = c("a", "b"),
403
+#' comparators = list(first = c(">", "<"),
404
+#' third = c(">=", "<=")))
419 405
 #' ```
420 406
 #' In this example, different threshold and comparators will be applied
421 407
 #' to the same columns in all data frames.
... ...
@@ -468,6 +454,7 @@ separate_quant_matrices <- function(x,
468 454
 #'         third = c(">=")
469 455
 #'     )
470 456
 #' )
457
+#' filtered
471 458
 threshold_filter <- function(x,
472 459
     threshold,
473 460
     cols_to_compare = "Value",
... ...
@@ -510,7 +497,6 @@ threshold_filter <- function(x,
510 497
 #'
511 498
 #' @family Analysis functions
512 499
 #'
513
-#' @import dplyr
514 500
 #' @importFrom magrittr `%>%`
515 501
 #' @importFrom rlang abort
516 502
 #'
... ...
@@ -539,16 +525,18 @@ threshold_filter <- function(x,
539 525
 #'     keep = "Value2",
540 526
 #'     key = "CompleteAmplificationID"
541 527
 #' )
542
-top_integrations <- function(x, n = 50,
528
+top_integrations <- function(x,
529
+    n = 20,
543 530
     columns = "fragmentEstimate_sum_RelAbundance",
544
-    keep = "everything", key = NULL) {
531
+    keep = "everything",
532
+    key = NULL) {
545 533
     stopifnot(is.data.frame(x))
546 534
     stopifnot(is.numeric(n) & length(n) == 1 & n > 0)
547 535
     stopifnot(is.character(keep))
548 536
     stopifnot(is.character(columns))
549 537
     stopifnot(is.null(key) || is.character(key))
550 538
     if (!.check_mandatory_vars(x)) {
551
-        rlang::abort(.non_ISM_error())
539
+        rlang::abort(.missing_mand_vars())
552 540
     }
553 541
     if (!all(columns %in% colnames(x))) {
554 542
         rlang::abort(.missing_user_cols_error(
... ...
@@ -616,28 +604,18 @@ top_integrations <- function(x, n = 50,
616 604
 #' For example:
617 605
 #'
618 606
 #' ```r
619
-#' ### Importing association file and matrices
620
-#' path_AF <- system.file("extdata", "ex_association_file.tsv",
621
-#' package = "ISAnalytics")
622
-#' root_correct <- system.file("extdata", "fs.zip",
623
-#' package = "ISAnalytics")
624
-#' root_correct <- unzip_file_system(root_correct, "fs")
625
-#'
626
-#' association_file <- import_association_file(path_AF, root_correct)
627
-#' matrices <- import_parallel_Vispa2Matrices_auto(
628
-#' association_file = association_file , root = NULL,
629
-#' quantification_type = c("seqCount","fragmentEstimate"),
630
-#' matrix_type = "annotated", workers = 2, patterns = NULL,
631
-#' matching_opt = "ANY", dates_format = "dmy")
632
-#'
633
-#' ### Aggregating data (both by same key)
634
-#' aggreggated_x <- aggregate_values_by_key(matrices$seqCount,
635
-#' association_file)
636
-#' aggregated_meta <- aggregate_metadata(association_file)
637
-#'
638
-#' ### Sample statistics
639
-#' sample_stats <- sample_statistics(x = aggregated_x,
640
-#' metadata = aggregated_meta,
607
+#' data("integration_matrices", package = "ISAnalytics")
608
+#' data("association_file", package = "ISAnalytics")
609
+#' aggreg <- aggregate_values_by_key(
610
+#'  x = integration_matrices,
611
+#'  association_file = association_file,
612
+#'  value_cols = c("seqCount", "fragmentEstimate")
613
+#' )
614
+#' aggreg_meta <- aggregate_metadata(association_file = association_file)
615
+#'
616
+#' sample_stats <- sample_statistics(x = aggreg,
617
+#' metadata = aggreg_meta,
618
+#' value_columns = c("seqCount", "fragmentEstimate"),
641 619
 #' sample_key = c("SubjectID", "CellMarker","Tissue", "TimePoint"))
642 620
 #'
643 621
 #' ```
... ...
@@ -645,74 +623,64 @@ top_integrations <- function(x, n = 50,
645 623
 #' @param metadata The metadata data frame
646 624
 #' @param sample_key Character vector representing the key for identifying
647 625
 #' a sample
648
-#' @param value_columns THe name of the columns to be computed,
626
+#' @param value_columns The name of the columns to be computed,
649 627
 #' must be numeric or integer
650 628
 #' @param functions A named list of function or purrr-style lambdas
629
+#' @param add_integrations_count Add the count of distinct integration sites
630
+#' for each group? Can be computed only if `x` contains the mandatory columns
631
+#' `chr`, `integration_locus`, `strand`
651 632
 #'
652 633
 #' @family Analysis functions
653
-#' @importFrom rlang eval_tidy expr
654
-#' @importFrom purrr is_function is_formula
655
-#' @import dplyr
634
+#' @importFrom rlang eval_tidy expr abort .data sym inform
635
+#' @importFrom purrr is_function is_formula map_lgl walk map set_names
636
+#' @importFrom dplyr group_by across all_of summarise rename_with bind_cols
637
+#' @importFrom dplyr n_distinct left_join
656 638
 #' @importFrom magrittr `%>%`
657 639
 #'
658 640
 #' @return A list with modified x and metadata data frames
659 641
 #' @export
660 642
 #'
661 643
 #' @examples
662
-#' op <- options(ISAnalytics.widgets = FALSE)
663
-#'
664
-#' path_AF <- system.file("extdata", "ex_association_file.tsv",
665
-#'     package = "ISAnalytics"
666
-#' )
667
-#' root_correct <- system.file("extdata", "fs.zip",
668
-#'     package = "ISAnalytics"
644
+#' data("integration_matrices", package = "ISAnalytics")
645
+#' data("association_file", package = "ISAnalytics")
646
+#' stats <- sample_statistics(
647
+#'     x = integration_matrices,
648
+#'     metadata = association_file,
649
+#'     value_columns = c("seqCount", "fragmentEstimate")
669 650
 #' )
670
-#' root_correct <- unzip_file_system(root_correct, "fs")
671
-#'
672
-#' association_file <- import_association_file(path_AF, root_correct,
673
-#'     dates_format = "dmy"
674
-#' )
675
-#' matrices <- import_parallel_Vispa2Matrices_auto(
676
-#'     association_file = association_file, root = NULL,
677
-#'     quantification_type = c("seqCount", "fragmentEstimate"),
678
-#'     matrix_type = "annotated", workers = 2, patterns = NULL,
679
-#'     matching_opt = "ANY", multi_quant_matrix = FALSE
680
-#' )
681
-#'
682
-#' stats <- sample_statistics(matrices$seqCount, association_file)
683
-#' options(op)
651
+#' stats
684 652
 sample_statistics <- function(x, metadata,
685 653
     sample_key = "CompleteAmplificationID",
686 654
     value_columns = "Value",
687
-    functions = default_stats()) {
655
+    functions = default_stats(),
656
+    add_integrations_count = TRUE) {
688 657
     stopifnot(is.data.frame(x))
689 658
     stopifnot(is.data.frame(metadata))
690 659
     stopifnot(is.character(sample_key))
691 660
     stopifnot(is.character(value_columns))
692 661
     stopifnot(is.list(functions))
693
-    if (!all(sample_key %in% colnames(x))) {
694
-        stop(paste("Key columns not found in the data frame"))
662
+    stopifnot(is.logical(add_integrations_count))
663
+    if (!all(c(sample_key, value_columns) %in% colnames(x))) {
664
+        rlang::abort(.missing_user_cols_error(c(sample_key, value_columns)[
665
+            !c(sample_key, value_columns) %in% colnames(x)
666
+        ]))
695 667
     }
696 668
     if (!all(sample_key %in% colnames(metadata))) {
697
-        stop(paste("Key columns not found in metadata"))
669
+        rlang::abort(.missing_user_cols_meta_error(sample_key[
670
+            !sample_key %in% colnames(x)
671
+        ]))
698 672
     }
699
-    if (!all(value_columns %in% colnames(x))) {
700
-        stop(paste("Value columns not found in the data frame"))
701
-    }
702
-    purrr::walk(value_columns, function(col) {
673
+    vcols_are_numeric <- purrr::map_lgl(value_columns, function(col) {
703 674
         expr <- rlang::expr(`$`(x, !!col))
704
-        if (!is.numeric(rlang::eval_tidy(expr)) &&
705
-            !is.integer(rlang::eval_tidy(expr))) {
706
-            stop(paste("Some or all of value columns are not numeric"))
707
-        }
675
+        is.numeric(rlang::eval_tidy(expr)) ||
676
+            is.integer(rlang::eval_tidy(expr))
708 677
     })
678
+    if (any(vcols_are_numeric == FALSE)) {
679
+        rlang::abort(.non_num_user_cols_error(value_columns[!vcols_are_numeric]))
680
+    }
709 681
     purrr::walk(functions, function(f) {
710 682
         if (!(purrr::is_function(f) | purrr::is_formula(f))) {
711
-            stop(paste(
712
-                "The function parameter should contain a list",
713
-                "of either functions or formulas.",
714
-                "See ?sample_statistics for details"
715
-            ))
683
+            rlang::abort(.non_function_elem_error())
716 684
         }
717 685
     })
718 686
 
... ...
@@ -743,6 +711,24 @@ sample_statistics <- function(x, metadata,
743 711
         }
744 712
     }
745 713
 
714
+    if (add_integrations_count) {
715
+        if (all(mandatory_IS_vars() %in% colnames(x))) {
716
+            mand_sym <- purrr::map(mandatory_IS_vars(), rlang::sym)
717
+            nIS <- x %>%
718
+                dplyr::group_by(dplyr::across(dplyr::all_of(sample_key))) %>%
719
+                dplyr::summarise(
720
+                    nIS = dplyr::n_distinct(!!!mand_sym),
721
+                    .groups = "drop"
722
+                )
723
+            result <- result %>%
724
+                dplyr::left_join(nIS, by = sample_key)
725
+        } else {
726
+            if (getOption("ISAnalytics.verbose")) {
727
+                rlang::inform(.inform_skip_count_is())
728
+            }
729
+        }
730
+    }
731
+
746 732
     updated_meta <- metadata %>% dplyr::left_join(result, by = sample_key)
747 733
     return(list(x = result, metadata = updated_meta))
748 734
 }
... ...
@@ -750,7 +736,7 @@ sample_statistics <- function(x, metadata,
750 736
 
751 737
 #' Grubbs test for Common Insertion Sites (CIS).
752 738
 #'
753
-#' \lifecycle{experimental}
739
+#' \lifecycle{stable}
754 740
 #' Statistical approach for the validation of common insertion sites
755 741
 #' significance based on the comparison of the integration frequency
756 742
 #' at the CIS gene with respect to other genes contained in the
... ...
@@ -762,279 +748,145 @@ sample_statistics <- function(x, metadata,
762 748
 #' ## Genomic annotation file
763 749
 #' This file is a data base, or more simply a .tsv file to import, with
764 750
 #' genes annotation for the specific genome. The annotations for the
765
-#' human genome (hg19) is already included in this package.
751
+#' human genome (hg19) and murine genome (mm9 and mm10) are already
752
+#' included in this package: to use one of them just
753
+#' set the argument `genomic_annotation_file` to either `"hg19"`,
754
+#' `"mm9"` or `"mm10"`.
766 755
 #' If for any reason the user is performing an analysis on another genome,
767 756
 #' this file needs to be changed respecting the USCS Genome Browser
768
-#' format, meaning the input file headers should be:
769
-#'
770
-#' ```{r echo=FALSE, tidy=TRUE}
771
-#' cat(c(
772
-#'   paste(c("name2", "chrom", "strand"), collapse = ", "), "\n",
773
-#'   paste(c("min_txStart","max_txEnd", "minmax_TxLen"), collapse = ", "),
774
-#'   "\n",
775
-#'   paste(c("average_TxLen", "name", "min_cdsStart"), collapse = ", "),
776
-#'   "\n",
777
-#'   paste(c("max_cdsEnd","minmax_CdsLen", "average_CdsLen"), collapse = ", ")
778
-#' ))
757
+#' format, meaning the input file headers should include:
779 758
 #'
780
-#' ```
759
+#' `r refGene_table_cols()`
781 760
 #'
782 761
 #' @param x An integration matrix, must include the `mandatory_IS_vars()`
783 762
 #' columns and the `annotation_IS_vars()` columns
784 763
 #' @param genomic_annotation_file Database file for gene annotation,
785
-#' see details
764
+#' see details.
786 765
 #' @param grubbs_flanking_gene_bp Number of base pairs flanking a gene
787 766
 #' @param threshold_alpha Significance threshold
788
-#' @param add_standard_padjust Compute the standard padjust?
767
+#' @param by Either `NULL` or a character vector of column names. If not
768
+#' NULL, the function will perform calculations for each group and return
769
+#' a list of data frames with the results. E.g. for `by = "SubjectID"`,
770
+#' CIS will be computed for each distinct SubjectID found in the table
771
+#' (of course, "SubjectID" column must be included in the input data frame).
789 772
 #'
790 773
 #' @family Analysis functions
791 774
 #'
792
-#' @import dplyr
793 775
 #' @importFrom tibble as_tibble
794
-#' @importFrom rlang .data
776
+#' @importFrom rlang .data abort current_env eval_tidy sym
795 777
 #' @importFrom magrittr `%>%`
796
-#' @importFrom stats median pt p.adjust
797 778
 #' @importFrom utils read.csv
779
+#' @importFrom stringr str_replace_all
780
+#' @importFrom tidyr unite
781
+#' @importFrom purrr set_names map
798 782
 #'
799 783
 #' @return A data frame
800 784
 #' @export
801 785
 #'
802 786
 #' @examples
803
-#' op <- options(ISAnalytics.widgets = FALSE)
804
-#'
805
-#' path_AF <- system.file("extdata", "ex_association_file.tsv",
806
-#'     package = "ISAnalytics"
807
-#' )
808
-#' root_correct <- system.file("extdata", "fs.zip",
809
-#'     package = "ISAnalytics"
810
-#' )
811
-#' root_correct <- unzip_file_system(root_correct, "fs")
812
-#'
813
-#' matrices <- import_parallel_Vispa2Matrices_auto(
814
-#'     association_file = path_AF, root = root_correct,
815
-#'     quantification_type = c("seqCount", "fragmentEstimate"),
816
-#'     matrix_type = "annotated", workers = 2, patterns = NULL,
817
-#'     matching_opt = "ANY",
818
-#'     dates_format = "dmy"
819
-#' )
820
-#'
821
-#' cis <- CIS_grubbs(matrices)
822
-#'
823
-#' options(op)
787
+#' data("integration_matrices", package = "ISAnalytics")
788
+#' cis <- CIS_grubbs(integration_matrices)
789
+#' head(cis)
824 790
 CIS_grubbs <- function(x,
825
-    genomic_annotation_file =
826
-        system.file("extdata", "hg19.refGene.oracle.tsv.xz",
827
-            package = "ISAnalytics"
828
-        ),
791
+    genomic_annotation_file = "hg19",
829 792
     grubbs_flanking_gene_bp = 100000,
830 793
     threshold_alpha = 0.05,
831
-    add_standard_padjust = TRUE) {
794
+    by = NULL) {
832 795
     # Check x has the correct structure
833 796
     stopifnot(is.data.frame(x))
834 797
     if (!all(mandatory_IS_vars() %in% colnames(x))) {
835
-        stop(.non_ISM_error())
798
+        rlang::abort(.missing_mand_vars())
836 799
     }
837 800
     if (!.is_annotated(x)) {
838
-        stop(.missing_annot())
801
+        rlang::abort(.missing_annot())
839 802
     }
840 803
     # Check other parameters
841
-    stopifnot(is.character(genomic_annotation_file) &
842
-        length(genomic_annotation_file) == 1)
843
-    stopifnot(is.numeric(grubbs_flanking_gene_bp) |
844
-        is.integer(grubbs_flanking_gene_bp))
845
-    stopifnot(length(grubbs_flanking_gene_bp) == 1)
846
-    stopifnot(is.numeric(threshold_alpha) & length(threshold_alpha) == 1)
847
-    stopifnot(is.logical(add_standard_padjust) &
848
-        length(add_standard_padjust) == 1)
849
-    stopifnot(file.exists(genomic_annotation_file))
850
-    # Determine file extension
851
-    ext <- .check_file_extension(genomic_annotation_file)
852
-
853
-    # Try to import annotation file
854
-    if (ext == "tsv") {
855
-        refgenes <- utils::read.csv(
856
-            file = genomic_annotation_file,
857
-            header = TRUE, fill = TRUE, sep = "\t",
858
-            check.names = FALSE,
859
-            na.strings = c("NONE", "NA", "NULL", "NaN", "")
860
-        )
861
-        refgenes <- tibble::as_tibble(refgenes) %>%
862
-            dplyr::mutate(chrom = stringr::str_replace_all(
863
-                .data$chrom,
864
-                "chr", ""
865
-            ))
866
-    } else if (ext == "csv") {
867
-        refgenes <- utils::read.csv(
868
-            file = genomic_annotation_file,
869
-            header = TRUE, fill = TRUE,
870
-            check.names = FALSE,
871
-            na.strings = c("NONE", "NA", "NULL", "NaN", "")
872
-        )
873
-        refgenes <- tibble::as_tibble(refgenes) %>%
874
-            dplyr::mutate(chrom = stringr::str_replace_all(
875
-                .data$chrom,
876
-                "chr", ""
877
-            ))
804
+    stopifnot(is.character(genomic_annotation_file))
805
+    genomic_annotation_file <- genomic_annotation_file[1]
806
+    if (genomic_annotation_file %in% c("hg19", "mm9", "mm10")) {
807
+        gen_file <- paste0("refGenes_", genomic_annotation_file)
808
+        utils::data(list = gen_file, envir = rlang::current_env())
809
+        refgenes <- rlang::eval_tidy(rlang::sym(gen_file))
878 810
     } else {
879
-        stop(paste(
880
-            "The genomic annotation file must be either in",
881
-            ".tsv or .csv format (compressed or not)"
882
-        ))
883
-    }
884
-
885
-    # Check annotation file format
886
-    if (!all(c(
887
-        "name2", "chrom", "strand", "min_txStart", "max_txEnd",
888
-        "minmax_TxLen", "average_TxLen", "name", "min_cdsStart",
889
-        "max_cdsEnd", "minmax_CdsLen", "average_CdsLen"
890
-    ) %in%
891
-        colnames(refgenes))) {
892
-        stop(.non_standard_annotation_structure())
893
-    }
894
-    ### Begin - init phase
895
-    df_by_gene <- x %>%
896
-        dplyr::group_by(
897
-            .data$GeneName,
898
-            .data$GeneStrand,
899
-            .data$chr
900
-        ) %>%
901
-        dplyr::summarise(
902
-            n_IS_perGene = dplyr::n_distinct(
903
-                .data$integration_locus
904
-            ),
905
-            min_bp_integration_locus =
906
-                min(.data$integration_locus),
907
-            max_bp_integration_locus =
908
-                max(.data$integration_locus),
909
-            IS_span_bp = (max(.data$integration_locus) -
910
-                min(.data$integration_locus)),
911
-            avg_bp_integration_locus =
912
-                mean(.data$integration_locus),
913
-            median_bp_integration_locus =
914
-                stats::median(.data$integration_locus),
915
-            distinct_orientations = dplyr::n_distinct(.data$strand),
916
-            describe = list(tibble::as_tibble(
917
-                psych::describe(.data$integration_locus)
918
-            )),
919
-            .groups = "drop"
920
-        ) %>%
921
-        tidyr::unnest(.data$describe, keep_empty = TRUE, names_sep = "_")
922
-
923
-    df_bygene_withannotation <- df_by_gene %>%
924
-        dplyr::inner_join(refgenes, by = c(
925
-            "chr" = "chrom",
926
-            "GeneStrand" = "strand",
927
-            "GeneName" = "name2"
928
-        )) %>%
929
-        dplyr::select(c(
930
-            dplyr::all_of(colnames(df_by_gene)),
931
-            .data$average_TxLen
932
-        ))
933
-    n_elements <- nrow(df_bygene_withannotation)
934
-
935
-    df_bygene_withannotation <- df_bygene_withannotation %>%
936
-        dplyr::mutate(
937
-            geneIS_frequency_byHitIS = .data$n_IS_perGene / n_elements
938
-        )
939
-
940
-    ### Grubbs test
941
-    ### --- Gene Frequency
942
-    df_bygene_withannotation <- df_bygene_withannotation %>%
943
-        dplyr::mutate(
944
-            raw_gene_integration_frequency =
945
-                .data$n_IS_perGene / .data$average_TxLen,
946
-            integration_frequency_withtolerance = .data$n_IS_perGene /
947
-                (.data$average_TxLen + grubbs_flanking_gene_bp) * 1000,
948
-            minus_log2_integration_freq_withtolerance =
949
-                -log(x = .data$integration_frequency_withtolerance, base = 2)
950
-        )
951
-    ### --- z score
952
-    z_mlif <- function(x) {
953
-        sqrt((n_elements * (n_elements - 2) * x^2) /
954
-            (((n_elements - 1)^2) - (n_elements * x^2)))
955
-    }
956
-    df_bygene_withannotation <- df_bygene_withannotation %>%
957
-        dplyr::mutate(
958
-            zscore_minus_log2_int_freq_tolerance =
959
-                scale(-log(
960
-                    x = .data$integration_frequency_withtolerance,
961
-                    base = 2
962
-                )),
963
-            neg_zscore_minus_log2_int_freq_tolerance =
964
-                -.data$zscore_minus_log2_int_freq_tolerance,
965
-            t_z_mlif = z_mlif(
966
-                .data$neg_zscore_minus_log2_int_freq_tolerance
811
+        stopifnot(file.exists(genomic_annotation_file))
812
+        # Determine file extension
813
+        ext <- .check_file_extension(genomic_annotation_file)
814
+        # Try to import annotation file
815
+        if (ext == "tsv") {
816
+            refgenes <- utils::read.csv(
817
+                file = genomic_annotation_file,
818
+                header = TRUE, fill = TRUE, sep = "\t",
819
+                check.names = FALSE,
820
+                na.strings = c("NONE", "NA", "NULL", "NaN", "")
967 821
             )
968
-        )
969
-    ### --- tdist
970
-    t_dist_2t <- function(x, deg) {
971
-        return((1 - stats::pt(x, deg)) * 2)
972
-    }
973
-    df_bygene_withannotation <- df_bygene_withannotation %>%
974
-        dplyr::mutate(
975
-            tdist2t = t_dist_2t(.data$t_z_mlif, n_elements - 2),
976
-            tdist_pt = pt(
977
-                q = .data$t_z_mlif,
978
-                df = n_elements - 2
979
-            ),
980
-            tdist_bonferroni_default = ifelse(
981
-                .data$tdist2t * n_elements > 1, 1,
982
-                .data$tdist2t * n_elements
822
+            # Check annotation file format
823
+            if (!all(refGene_table_cols() %in% colnames(refgenes))) {
824
+                rlang::abort(.non_standard_annotation_structure())
825
+            }
826
+            refgenes <- tibble::as_tibble(refgenes) %>%
827
+                dplyr::mutate(chrom = stringr::str_replace_all(
828
+                    .data$chrom,
829
+                    "chr", ""
830
+                ))
831
+        } else if (ext == "csv") {
832
+            refgenes <- utils::read.csv(
833
+                file = genomic_annotation_file,
834
+                header = TRUE, fill = TRUE,
835
+                check.names = FALSE,
836
+                na.strings = c("NONE", "NA", "NULL", "NaN", "")
983 837
             )
984
-        )
985
-    if (add_standard_padjust == TRUE) {
986
-        df_bygene_withannotation <- df_bygene_withannotation %>%
987
-            dplyr::mutate(
988
-                tdist_bonferroni = stats::p.adjust(
989
-                    .data$tdist2t,
990
-                    method = "bonferroni",
991
-                    n = length(.data$tdist2t)
992
-                ),
993
-                tdist_fdr = stats::p.adjust(
994
-                    .data$tdist2t,
995
-                    method = "fdr",
996
-                    n = length(.data$tdist2t)
997
-                ),
998
-                tdist_benjamini = stats::p.adjust(
999
-                    .data$tdist2t,
1000
-                    method = "BY",
1001
-                    n = length(.data$tdist2t)
1002
-                )
838
+            # Check annotation file format
839
+            if (!all(refGene_table_cols() %in% colnames(refgenes))) {
840
+                rlang::abort(.non_standard_annotation_structure())
841
+            }
842
+            refgenes <- tibble::as_tibble(refgenes) %>%
843
+                dplyr::mutate(chrom = stringr::str_replace_all(
844
+                    .data$chrom,
845
+                    "chr", ""
846
+                ))
847
+        } else {
848
+            gen_file_err <- paste(
849
+                "The genomic annotation file must be either in",
850
+                ".tsv or .csv format (compressed or not)"
1003 851
             )
852
+            rlang::abort(gen_file_err)
853
+        }
1004 854
     }
1005
-    df_bygene_withannotation <- df_bygene_withannotation %>%
1006
-        dplyr::mutate(
1007
-            tdist_positive_and_corrected =
1008
-                ifelse(
1009
-                    (.data$tdist_bonferroni_default < threshold_alpha &
1010
-                        .data$neg_zscore_minus_log2_int_freq_tolerance > 0),
1011
-                    .data$tdist_bonferroni_default,
1012
-                    NA
1013
-                ),
1014
-            tdist_positive = ifelse(
1015
-                (.data$tdist2t < threshold_alpha &
1016
-                    .data$neg_zscore_minus_log2_int_freq_tolerance > 0),
1017
-                .data$tdist2t,
1018
-                NA
1019
-            )
1020
-        )
1021
-    EM_correction_N <- length(
1022
-        df_bygene_withannotation$tdist_positive[
1023
-            !is.na(df_bygene_withannotation$tdist_positive)
1024
-        ]
1025
-    )
1026
-    df_bygene_withannotation <- df_bygene_withannotation %>%
1027
-        dplyr::mutate(
1028
-            tdist_positive_and_correctedEM =
1029
-                ifelse(
1030
-                    (.data$tdist2t * EM_correction_N <
1031
-                        threshold_alpha &
1032
-                        .data$neg_zscore_minus_log2_int_freq_tolerance > 0),
1033
-                    .data$tdist2t * EM_correction_N,
1034
-                    NA
1035
-                )
855
+    stopifnot(is.numeric(grubbs_flanking_gene_bp) |
856
+        is.integer(grubbs_flanking_gene_bp))
857
+    grubbs_flanking_gene_bp <- grubbs_flanking_gene_bp[1]
858
+    stopifnot(is.numeric(threshold_alpha))
859
+    threshold_alpha <- threshold_alpha[1]
860
+    stopifnot(is.null(by) || is.character(by))
861
+    if (!all(by %in% colnames(x))) {
862
+        rlang::abort(.missing_user_cols_error(by[!by %in% colnames(x)]))
863
+    }
864
+    result <- if (is.null(by)) {
865
+        .cis_grubb_calc(
866
+            x = x,
867
+            refgenes = refgenes,
868
+            grubbs_flanking_gene_bp = grubbs_flanking_gene_bp,
869
+            threshold_alpha = threshold_alpha
1036 870
         )
1037
-    return(df_bygene_withannotation)
871
+    } else {
872
+        grouped <- x %>%
873
+            dplyr::group_by(dplyr::across(dplyr::all_of(by)))
874
+        group_keys <- grouped %>%
875
+            dplyr::group_keys() %>%
876
+            tidyr::unite(col = "id", dplyr::everything()) %>%
877
+            dplyr::pull(.data$id)
878
+        split <- x %>%
879
+            dplyr::group_by(dplyr::across(dplyr::all_of(by))) %>%
880
+            dplyr::group_split() %>%
881
+            purrr::set_names(group_keys)
882
+        purrr::map(split, ~ .cis_grubb_calc(
883
+            x = .x,
884
+            refgenes = refgenes,
885
+            grubbs_flanking_gene_bp = grubbs_flanking_gene_bp,
886
+            threshold_alpha = threshold_alpha
887
+        ))
888
+    }
889
+    return(result)
1038 890
 }
1039 891
 
1040 892
 #' Integrations cumulative count in time by sample
... ...
@@ -1082,9 +934,11 @@ CIS_grubbs <- function(x,
1082 934
 #'
1083 935
 #' @family Analysis functions
1084 936
 #'
1085
-#' @import dplyr
937
+#' @importFrom dplyr mutate filter across all_of select summarise group_by
938
+#' @importFrom dplyr arrange group_split first full_join starts_with distinct
939
+#' @importFrom dplyr semi_join n rename
1086 940
 #' @importFrom magrittr `%>%`
1087
-#' @importFrom rlang .data
941
+#' @importFrom rlang .data abort inform `:=`
1088 942
 #' @importFrom stringr str_pad
1089 943
 #' @importFrom purrr reduce is_empty
1090 944
 #' @importFrom tidyr pivot_longer
... ...
@@ -1094,37 +948,15 @@ CIS_grubbs <- function(x,
1094 948
 #' @export
1095 949
 #'
1096 950
 #' @examples
1097
-#' op <- options(ISAnalytics.widgets = FALSE)
1098
-#'
1099
-#' path_AF <- system.file("extdata", "ex_association_file.tsv",
1100
-#'     package = "ISAnalytics"
1101
-#' )
1102
-#' root_correct <- system.file("extdata", "fs.zip",
1103
-#'     package = "ISAnalytics"
1104
-#' )
1105
-#' root_correct <- unzip_file_system(root_correct, "fs")
1106
-#'
1107
-#' association_file <- import_association_file(path_AF, root_correct,
1108
-#'     dates_format = "dmy"
1109
-#' )
1110
-#' matrices <- import_parallel_Vispa2Matrices_auto(
1111
-#'     association_file = association_file, root = NULL,
1112
-#'     quantification_type = c("seqCount", "fragmentEstimate"),
1113
-#'     matrix_type = "annotated", workers = 2, patterns = NULL,
1114
-#'     matching_opt = "ANY", multi_quant_matrix = FALSE
1115
-#' )
1116
-#'
1117
-#' #### EXTERNAL AGGREGATION
1118
-#' aggregated <- aggregate_values_by_key(matrices$seqCount, association_file)
1119
-#' cumulative_count <- cumulative_count_union(aggregated)
1120
-#'
1121
-#' #### INTERNAL AGGREGATION
1122
-#' cumulative_count_2 <- cumulative_count_union(matrices$seqCount,
1123
-#'     association_file,
1124
-#'     aggregate = TRUE
951
+#' data("integration_matrices", package = "ISAnalytics")
952
+#' data("association_file", package = "ISAnalytics")
953
+#' aggreg <- aggregate_values_by_key(
954
+#'     x = integration_matrices,
955
+#'     association_file = association_file,
956
+#'     value_cols = c("seqCount", "fragmentEstimate")
1125 957
 #' )
1126
-#'
1127
-#' options(op)
958
+#' cumulative_count <- cumulative_count_union(aggreg)
959
+#' cumulative_count
1128 960
 cumulative_count_union <- function(x,
1129 961
     association_file = NULL,
1130 962
     timepoint_column = "TimePoint",
... ...
@@ -1146,14 +978,14 @@ cumulative_count_union <- function(x,
1146 978
     stopifnot(is.character(zero) & length(zero) == 1)
1147 979
     stopifnot(is.logical(aggregate))
1148 980
     if (aggregate == TRUE & is.null(association_file)) {
1149
-        stop(.agg_with_null_meta_err())
981
+        rlang::abort(.agg_with_null_meta_err())
1150 982
     }
1151 983
     if (!all(timepoint_column %in% key)) {
1152
-        stop(.key_without_tp_err())
984
+        rlang::abort(.key_without_tp_err())
1153 985
     }
1154 986
     if (aggregate == FALSE) {
1155 987
         if (!all(key %in% colnames(x))) {
1156
-            stop(.key_not_found())
988
+            rlang::abort(.key_not_found())
1157 989
         }
1158 990
     } else {
1159 991
         x <- aggregate_values_by_key(
... ...
@@ -1166,12 +998,14 @@ cumulative_count_union <- function(x,
1166 998
             max <- max(association_file[[timepoint_column]])
1167 999
             digits <- floor(log10(x)) + 1
1168 1000
             association_file <- association_file %>%
1169
-                dplyr::mutate({{ timepoint_column }} := stringr::str_pad(
1170
-                    as.character(.data$TimePoint),
1171
-                    digits,
1172
-                    side = "left",
1173
-                    pad = "0"
1174
-                ))
1001
+                dplyr::mutate(
1002
+                    {{ timepoint_column }} := stringr::str_pad(
1003
+                        as.character(.data$TimePoint),
1004
+                        digits,
1005
+                        side = "left",
1006
+                        pad = "0"
1007
+                    )
1008
+                )
1175 1009
             zero <- paste0(rep_len("0", digits), collapse = "")
1176 1010
         }
1177 1011
     }
... ...
@@ -1182,10 +1016,11 @@ cumulative_count_union <- function(x,
1182 1016
                 ~ .x != zero
1183 1017
             ))
1184 1018
         if (nrow(x) == 0) {
1185
-            message(paste(
1019
+            all_tp0_msg <- paste(
1186 1020
                 "All time points zeros were excluded, the data",
1187 1021
                 "frame is empty."
1188
-            ))
1022
+            )
1023
+            rlang::inform(all_tp0_msg)
1189 1024
             return(x)
1190 1025
         }
1191 1026
     }
... ...
@@ -1282,7 +1117,7 @@ cumulative_count_union <- function(x,
1282 1117
 #' function \code{\link{sharing_heatmap}}.
1283 1118
 #'
1284 1119
 #' @param x An integration matrix, aka a data frame containing the columns
1285
-#' `r mandatory_IS_vars()`. See details.
1120
+#' `mandatory_IS_vars()`. See details.
1286 1121
 #' @param group_key Character vector of column names which identify a
1287 1122
 #' single group. An associated group id will be derived by concatenating
1288 1123
 #' the values of these fields, separated by "_"
... ...
@@ -1291,16 +1126,27 @@ cumulative_count_union <- function(x,
1291 1126
 #' @param relative_is_sharing Logical, if TRUE also returns the relative
1292 1127
 #' sharing. See details.
1293 1128
 #'
1129
+#' @importFrom rlang abort .data
1130
+#' @importFrom dplyr select all_of distinct transmute inner_join filter
1131
+#' @importFrom dplyr pull mutate
1132
+#' @importFrom tidyr nest unite
1133
+#' @importFrom purrr map_int pmap_dbl
1134
+#' @importFrom tibble tibble add_case
1135
+#'
1294 1136
 #' @family Analysis functions
1295 1137
 #' @return A named list of data frames or a single data frame
1296 1138
 #' @export
1297 1139
 #'
1298 1140
 #' @examples
1299
-#' path <- system.file("extdata", "ex_annotated_ISMatrix.tsv.xz",
1300
-#'     package = "ISAnalytics"
1141
+#' data("integration_matrices", package = "ISAnalytics")
1142
+#' data("association_file", package = "ISAnalytics")
1143
+#' aggreg <- aggregate_values_by_key(
1144
+#'     x = integration_matrices,
1145
+#'     association_file = association_file,
1146
+#'     value_cols = c("seqCount", "fragmentEstimate")
1301 1147
 #' )
1302
-#' matrix <- import_single_Vispa2Matrix(path)
1303
-#' sharing <- is_sharing(matrix, group_key = "CompleteAmplificationID")
1148
+#' sharing <- is_sharing(aggreg)
1149
+#' sharing
1304 1150
 is_sharing <- function(x,
1305 1151
     group_key = c(
1306 1152
         "SubjectID",
... ...
@@ -2,250 +2,225 @@
2 2
 # Collision removal functions
3 3
 #------------------------------------------------------------------------------#
4 4
 
5
-#' Identifies and removes collisions based on the sequence count matrix.
5
+#' Identifies and removes collisions.
6 6
 #'
7
-#' \lifecycle{experimental}
7
+#' \lifecycle{stable}
8 8
 #' A collision is an integration (aka a unique combination of
9
-#' chr, integration_locus and strand) which is observed in more than one
10
-#' independent sample (a unique pair of ProjectID and SubjectID). The function
11
-#' tries to decide to which subject an integration should be assigned and if no
9
+#' `chr`, `integration_locus` and `strand`) which is observed in more than one
10
+#' independent sample (a unique pair of `ProjectID` and `SubjectID`).
11
+#' The function tries to decide to which subject an integration
12
+#' should be assigned to and, if no
12 13
 #' decision can be taken, the integration is completely removed from the data
13 14
 #' frame.
15
+#' For more details refer to the vignette "Collision removal functionality":
16
+#' \code{vignette("collision_removal", package = "ISAnalytics")}
14 17
 #'
15
-#' @details If you don't want the function to show details and messages do:
16
-#' \code{options(ISAnalitics.verbose = FALSE)}.
17
-#' To restore to the original value:
18
-#' \code{options(ISAnalitics.verbose = TRUE)}.
19
-#' For more details on how to use collision removal functionality:
20
-#' \code{vignette("Collision removal functionality", package = "ISAnalytics")}
21
-#'
22
-#' @param x A named list of matrices (names must be quantification types),
23
-#' a single integration matrix representing the sequence count matrix of
24
-#' interest or a multi-quantification matrix obtained via
25
-#' \link{comparison_matrix}
18
+#' @param x Either a multi-quantification matrix or a
19
+#' named list of matrices (names must be quantification types)
26 20
 #' @param association_file The association file imported via
27
-#' `import_association_file`
28
-#' @param date_col The date column that should be considered for the analysis.
21
+#' `import_association_file()`
22
+#' @param date_col The date column that should be considered.
29 23
 #' Must be one value in `date_columns_coll()`
30 24
 #' @param reads_ratio A single numeric value that represents the ratio that has
31
-#' to be considered when deciding between seqCount value.
32
-#' @param seq_count_col For support of multi-quantification matrix -
33
-#' the name of the sequence count values column
34
-#' @param max_rows_reports A numeric value, represents the maximum number of
35
-#' rows of the reports data frames that can be printed on console
36
-#' if the option `ISAnalytics.verbose` is active. If the data frames are too
37
-#' large they won't be printed on console - we recommend using widgets for
38
-#' detailed and more accessible info.
39
-#' @param save_widget_path Either NULL or a path where the html report file
40
-#' should be saved. If NULL the report is visualized via browser ONLY (not
41
-#' saved on disk).
25
+#' to be considered when deciding between `seqCount` value.
26
+#' @param quant_cols A named character vector where names are
27
+#' quantification types and
28
+#' values are the names of the corresponding columns. The quantification
29
+#' `seqCount` MUST be included in the vector.
30
+#' @param report_path The path where the report file should be saved.
31
+#' Can be a folder, a file or NULL if no report should be produced.
32
+#' Defaults to `{user_home}/ISAnalytics_reports`.
42 33
 #'
43 34
 #' @family Collision removal
44
-#' @importFrom dplyr bind_rows all_of select
45
-#' @importFrom tibble is_tibble
46 35
 #' @importFrom magrittr `%>%`
36
+#' @importFrom rlang inform abort exec .data
37
+#' @importFrom purrr map2
38
+#' @importFrom dplyr bind_rows select all_of group_by summarise across n
39
+#' @importFrom dplyr distinct
40
+#'
47 41
 #' @seealso \code{\link{date_columns_coll}}
48 42
 #'
49
-#' @return A list of tibbles with removed collisions
43
+#' @return Either a multi-quantification matrix or a list of data frames
50 44
 #' @export
51 45
 #'
52 46
 #' @examples
53
-#' op <- options("ISAnalytics.widgets" = FALSE)
54
-#' path <- system.file("extdata", "ex_association_file.tsv",
55
-#'     package = "ISAnalytics"
56
-#' )
57
-#' root_pth <- system.file("extdata", "fs.zip", package = "ISAnalytics")
58
-#' root <- unzip_file_system(root_pth, "fs")
59
-#' association_file <- import_association_file(path, root,
60
-#'     dates_format = "dmy"
47
+#' data("integration_matrices", package = "ISAnalytics")
48
+#' data("association_file", package = "ISAnalytics")
49
+#' no_coll <- remove_collisions(
50
+#'     x = integration_matrices,
51
+#'     association_file = association_file,
52
+#'     report_path = NULL
61 53
 #' )
62
-#' matrices <- import_parallel_Vispa2Matrices_auto(
63
-#'     association_file = association_file, root = NULL,
64
-#'     quantification_type = c("fragmentEstimate", "seqCount"),
65
-#'     matrix_type = "annotated", workers = 2,
66
-#'     patterns = NULL, matching_opt = "ANY",
67
-#'     multi_quant_matrix = FALSE
68
-#' )
69
-#' matrices <- remove_collisions(matrices, association_file)
70
-#' options(op)
54
+#' head(no_coll)
71 55
 remove_collisions <- function(x,
72 56
     association_file,
73 57
     date_col = "SequencingDate",
74 58
     reads_ratio = 10,
75
-    seq_count_col = "seqCount",
76
-    max_rows_reports = 50,
77
-    save_widget_path = NULL) {
59
+    quant_cols = c(
60
+        seqCount = "seqCount",
61
+        fragmentEstimate = "fragmentEstimate"
62
+    ),
63
+    report_path = default_report_path()) {
64
+
78 65
     # Check basic parameter correctness
79 66
     stopifnot(is.list(x) & !is.null(names(x)))
80
-
67
+    stopifnot(is.character(quant_cols) && all(!is.na(names(quant_cols))))
68
+    if (!all(names(quant_cols) %in% quantification_types())) {
69
+        rlang::abort(
70
+            .quantifications_names_err(
71
+                quant_cols[!names(quant_cols) %in% quantification_types()]
72
+            )
73
+        )
74
+    }
81 75
     mode <- NULL
82
-    quantifications_cols <- NULL
83
-    if (tibble::is_tibble(x)) {
84
-        if (.check_mandatory_vars(x) == FALSE) {
85
-            stop(.non_ISM_error())
86
-        }
87
-        if (.check_complAmpID(x) == FALSE) {
88
-            stop(.missing_complAmpID_error())
89
-        }
90
-        ### SUPPORT FOR MULTI-QUANTIFICATION MATRIX
91
-        ## Check if it contains the "Value" column. If not find all numeric
92
-        ## columns that are not default columns
93
-        quantifications_cols <- if (.check_value_col(x) == FALSE) {
94
-            found <- .find_exp_cols(x,
95
-                                    c(mandatory_IS_vars(),
96
-                                      annotation_IS_vars(),
97
-                                      "CompleteAmplificationID"))
98
-            if (purrr::is_empty(found)) {
99
-                stop(.missing_num_cols_error())
100
-            }
101
-            mode <- "MULTI"
102
-            found
103
-        } else {
104
-            mode <- "SC"
105
-            "Value"
76
+    if (!is.data.frame(x)) {
77
+        if (!all(names(x) %in% quantification_types())) {
78
+            rlang::abort(.quantifications_names_err(
79
+                names(x)[!names(x) %in% quantification_types()]
80
+            ))
106 81
         }
107
-    } else {
108
-        stopifnot(all(names(x) %in% quantification_types()))
109 82
         ## remove_collisions requires seqCount matrix, check if the list
110 83
         ## contains one
111 84
         if ((!"seqCount" %in% names(x)) ||
112 85
             nrow(x$seqCount) == 0) {
113
-            stop(paste(
114
-                "Sequence count data frame is required for collision removal",
115
-                "but none was detected in x"
116
-            ))
86
+            rlang::abort(.seqCount_df_err())
117 87
         }
118
-        all_ISm <- purrr::map_lgl(x, .check_mandatory_vars)
119
-        if (!all(all_ISm)) {
120
-            stop(.non_ISM_error())
88
+        all_correct <- purrr::map2(x, names(x), function(m, quant) {
89
+            mand_cols <- .check_mandatory_vars(m)
90
+            cAmp_col <- .check_complAmpID(m)
91
+            if (mand_cols & cAmp_col) {
92
+                return(NA_character_)
93
+            } else {
94
+                msgs <- c()
95
+                if (!mand_cols) {
96
+                    msgs <- .missing_mand_vars()
97
+                }
98
+                if (!cAmp_col) {
99
+                    msgs <- c(msgs, .missing_cAmp_sub_msg())
100
+                }
101
+                msgs <- paste0(quant, " - ", paste0(msgs, collapse = ";\n"))
102
+                return(msgs)
103
+            }
104
+        })
105
+        if (!all(is.na(all_correct))) {
106
+            message <- unlist(all_correct[!is.na(all_correct)])
107
+            rlang::abort(c("Matrices miss required info, aborting", message))
121 108
         }
122
-        all_campid <- purrr::map_lgl(x, .check_complAmpID)
123
-        if (!all(all_campid)) {
124
-            stop(.missing_complAmpID_error())
109
+        ## Transform the list in a multi-quant matrix
110
+        mode <- "LIST"
111
+        quant_cols_lst <- as.list(quant_cols)
112
+        args <- append(list(x = x), quant_cols_lst)
113
+        x <- rlang::exec(comparison_matrix, !!!args)
114
+    } else {
115
+        if (.check_mandatory_vars(x) == FALSE) {
116
+            rlang::abort(.missing_mand_vars())
125 117
         }
126
-        if (.check_value_col(x$seqCount) == FALSE) {
127
-            stop(.missing_value_col_error())
118
+        if (.check_complAmpID(x) == FALSE) {
119
+            rlang::abort(.missing_cAmp_sub_msg())
128 120
         }
129
-        quantifications_cols <- "Value"
130
-        mode <- "LIST"
131
-    }
132
-    stopifnot(tibble::is_tibble(association_file))
133
-    stopifnot(is.character(date_col) & length(date_col) == 1)
134
-    stopifnot(date_col %in% date_columns_coll())
135
-    stopifnot(is.integer(reads_ratio) | is.numeric(reads_ratio))
136
-    stopifnot(length(reads_ratio) == 1)
137
-
138
-    if (mode == "SC" || mode == "LIST") {
139
-        seq_count_col <- "Value"
140
-    }
141
-    if (mode == "MULTI") {
142
-        if (!seq_count_col %in% quantifications_cols) {
143
-            stop(paste(
144
-                "Sequence count column name not found in the data",
145
-                "frame"
121
+        if (!all(quant_cols %in% colnames(x))) {
122
+            rlang::abort(.missing_user_cols_error(
123
+                quant_cols[!quant_cols %in% colnames(x)]
146 124
             ))
147 125
         }
126
+        if (!"seqCount" %in% names(quant_cols)) {
127
+            rlang::abort(.seqCount_col_err())
128
+        }
129
+        mode <- "MULTI"
148 130
     }
131
+
132
+    stopifnot(is.data.frame(association_file))
133
+    stopifnot(is.character(date_col))
134
+    date_col <- date_col[1]
135
+    stopifnot(date_col %in% date_columns_coll())
136
+    stopifnot(is.integer(reads_ratio) || is.numeric(reads_ratio))
137
+    reads_ratio <- reads_ratio[1]
149 138
     # Check association file correctness
150 139
     needed_af_cols <- c(
151 140
         "SubjectID", "ProjectID", "PoolID",
152
-        "ReplicateNumber", date_col
141
+        "ReplicateNumber", date_col,
142
+        "CompleteAmplificationID"
153 143
     )
154
-
155 144
     if (any(!needed_af_cols %in% colnames(association_file))) {
156 145
         missing_af_cols <- needed_af_cols[!needed_af_cols %in%
157 146
             colnames(association_file)]
158
-        msg <- paste0(c(
159
-            "Missing needed columns in the association file: ",
160
-            paste0(missing_af_cols, collapse = ", ")
161
-        ))
162
-        stop(msg)
147
+        rlang::abort(.missing_af_needed_cols(missing_af_cols))
163 148
     }
149
+    seq_count_col <- quant_cols["seqCount"]
164 150
 
165 151
     # Check date_col
166
-    if (any(is.na(association_file[date_col]))) {
167
-        stop(paste(
168
-            "Selected date column contains NA values, please check",
169
-            "and fix the association file"
170
-        ))
152
+    if (any(is.na(association_file[[date_col]]))) {
153
+        rlang::abort(.na_in_date_col())
171 154
     }
172 155
 
173
-    # Check imported matrices vs association file
174
-    seq_count_df_pre <- if (mode == "LIST") {
175
-        x$seqCount
176
-    } else {
177
-        x
178
-    }
179 156
     ## Check if association file contains all info relative to content the of
180 157
     ## the matrix
181
-    missing <- numeric(0)
182 158
     verbose <- getOption("ISAnalytics.verbose")
183
-    all_contained <- all(seq_count_df_pre$CompleteAmplificationID %in%
184
-        association_file$CompleteAmplificationID)
185
-    seq_count_df <- NULL
186
-    if (!all_contained) {
187
-        missing <- which(!seq_count_df_pre$CompleteAmplificationID %in%
159
+    ### - Are all sample in the matrix present in the AF?
160
+    missing_ind <- if (!all(x$CompleteAmplificationID %in%
161
+        association_file$CompleteAmplificationID)) {
162
+        which(!x$CompleteAmplificationID %in%
188 163
             association_file$CompleteAmplificationID)
189
-        warning(paste(
190
-            "The association file is missing needed info",
191
-            "on some experiments. Missing samples will be removed",
192
-            "from the matrices."
193
-        ), immediate. = TRUE)
194
-        if (verbose == TRUE) {
195
-            if (length(missing) > max_rows_reports) {
196
-                message(paste(
197
-                    "Missing info data frame is too big",
198
-                    "to be printed, skipping"
199
-                ))
200
-            } else {
201
-                cat("Missing info for these observations: ", sep = "\n")
202
-                print(seq_count_df_pre[missing, ],
203
-                    width = Inf,
204
-                    n = nrow(seq_count_df_pre[missing, ])
205
-                )
206
-            }
207
-        }
208
-        seq_count_df <- seq_count_df_pre[-missing, ]
209 164
     } else {
210
-        seq_count_df <- seq_count_df_pre
165
+        NULL
211 166
     }
212
-    ## Check if association file contains more info than matrix
213
-    not_included <- NULL
214
-    if (verbose == TRUE) {
215
-        not_included <- .check_same_info(association_file, seq_count_df)
216
-        if (nrow(not_included) > 0) {
217
-            message(paste("Found additional data relative to some projects",
218
-                "that are not included in the imported matrices.",
219
-                "Here is a summary",
220
-                collapse = "\n"
221
-            ))
222
-            if (nrow(not_included) > max_rows_reports) {
223
-                message(paste(
224
-                    "Additional info data frame is too big",
225
-                    "to be printed, skipping"
226
-                ))
227
-            } else {
228
-                print(not_included, n = nrow(not_included), width = Inf)
229
-            }
167
+    pre_process <- if (!is.null(missing_ind)) {
168
+        rlang::inform(.missing_af_samples_msg(
169
+            length(unique(x$CompleteAmplificationID[missing_ind]))
170
+        ))
171
+        x[-missing_ind, ]
172
+    } else {
173
+        x
174
+    }
175
+
176
+    ## If after removing missing the matrix is empty, stop
177
+    if (nrow(pre_process) == 0) {
178
+        rlang::inform("Matrix is empty after removing missing samples")
179
+        to_return <- if (mode == "LIST") {
180
+            args <- append(list(x = x), as.list(quant_cols))
181
+            rlang::exec(separate_quant_matrices, !!!args)
182
+        } else {
183
+            x
230 184
         }
185
+        return(to_return)
186
+    }
187
+
188
+    ## Check if association file contains more info than matrix and
189
+    ## keep only metadata that concerns projectIDs in the matrix
190
+    add_samples <- .check_same_info(association_file, pre_process)
191
+    association_file <- add_samples$reduced_af
192
+    add_samples <- add_samples$miss
193
+    if (nrow(add_samples) > 0) {
194
+        if (verbose) {
195
+            rlang::inform(.additional_ad_samples_msg())
196
+        }
197
+    } else {
198
+        add_samples <- NULL
231 199
     }
232 200
 
233 201
     # Begin workflow
234
-    ## Join based on CompleteAmplificationID
235
-    joined <- .join_matrix_af(seq_count_df, association_file, date_col)
236
-    if (verbose == TRUE) {
237
-        message("Identifying collisions...")
202
+    ## - Join based on CompleteAmplificationID
203
+    joined <- .join_matrix_af(pre_process, association_file, date_col)
204
+    if (verbose) {
205
+        rlang::inform("Identifying collisions...")
238 206
     }
207
+    ## - Separate collisions from non-collisions
239 208
     splitted_df <- .identify_independent_samples(joined)
240 209
 
241 210
     if (nrow(splitted_df$collisions) == 0) {
242
-        message(.no_collisions_msg())
243
-        return(NULL)
211
+        rlang::inform(.no_collisions_msg())
212
+        to_return <- if (mode == "LIST") {
213
+            args <- append(list(x = x), as.list(quant_cols))
214
+            rlang::exec(separate_quant_matrices, !!!args)
215
+        } else {
216
+            x
217
+        }
218
+        return(to_return)
244 219
     }
245 220
 
246 221
     # Remove collisions