[UPDATE] Update to version 1.1.10

Giulia Pais authored on 08/04/2021 08:38:56
Showing 52 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,48 @@
1
+on:
2
+  push:
3
+    branches:
4
+      - RELEASE_3_12
5
+      - master
6
+
7
+name: pkgdown
8
+
9
+jobs:
10
+  pkgdown:
11
+    runs-on: macOS-latest
12
+    env:
13
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
14
+    steps:
15
+      - uses: actions/checkout@v2
16
+
17
+      - uses: r-lib/actions/setup-r@v1
18
+
19
+      - uses: r-lib/actions/setup-pandoc@v1
20
+
21
+      - name: Query dependencies
22
+        run: |
23
+          install.packages('remotes')
24
+          saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
25
+          writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
26
+        shell: Rscript {0}
27
+
28
+      - name: Cache R packages
29
+        uses: actions/cache@v2
30
+        with:
31
+          path: ${{ env.R_LIBS_USER }}
32
+          key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
33
+          restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-
34
+
35
+      - name: Install dependencies
36
+        run: |
37
+          remotes::install_deps(dependencies = TRUE)
38
+          install.packages("pkgdown", type = "binary")
39
+        shell: Rscript {0}
40
+
41
+      - name: Install package
42
+        run: R CMD INSTALL .
43
+
44
+      - name: Deploy package
45
+        run: |
46
+          git config --local user.email "actions@github.com"
47
+          git config --local user.name "GitHub Actions"
48
+          Rscript -e 'pkgdown::deploy_to_branch(new_process = FALSE)'
... ...
@@ -1,6 +1,6 @@
1 1
 Package: ISAnalytics
2 2
 Title: Analyze gene therapy vector insertion sites data identified from genomics next generation sequencing reads for clonal tracking studies
3
-Version: 1.1.9
3
+Version: 1.1.10
4 4
 Date: 2020-07-03
5 5
 Authors@R: c(
6 6
   person(given = "Andrea",
... ...
@@ -45,7 +45,10 @@ Imports:
45 45
     stats,
46 46
     upsetjs,
47 47
     psych,
48
-    grDevices
48
+    grDevices,
49
+    data.table,
50
+    readxl,
51
+    tools
49 52
 Encoding: UTF-8
50 53
 LazyData: false
51 54
 Roxygen: list(markdown = TRUE)
... ...
@@ -59,7 +62,9 @@ Suggests:
59 62
     sessioninfo,
60 63
     rmarkdown,
61 64
     roxygen2,
62
-    vegan
65
+    vegan,
66
+    withr
63 67
 VignetteBuilder: knitr
64 68
 RdMacros: 
65 69
     lifecycle
70
+Config/testthat/edition: 3
... ...
@@ -37,7 +37,6 @@ import(BiocParallel)
37 37
 import(dplyr)
38 38
 import(ggplot2)
39 39
 import(lifecycle)
40
-import(lubridate)
41 40
 import(upsetjs)
42 41
 importFrom(BiocParallel,MulticoreParam)
43 42
 importFrom(BiocParallel,SnowParam)
... ...
@@ -45,13 +44,15 @@ importFrom(BiocParallel,bplapply)
45 44
 importFrom(BiocParallel,bpok)
46 45
 importFrom(BiocParallel,bpstop)
47 46
 importFrom(BiocParallel,bptry)
47
+importFrom(data.table,fread)
48
+importFrom(data.table,melt.data.table)
49
+importFrom(data.table,rbindlist)
50
+importFrom(data.table,setDT)
48 51
 importFrom(dplyr,across)
49 52
 importFrom(dplyr,all_of)
50 53
 importFrom(dplyr,arrange)
51 54
 importFrom(dplyr,bind_cols)
52 55
 importFrom(dplyr,bind_rows)
53
-importFrom(dplyr,contains)
54
-importFrom(dplyr,desc)
55 56
 importFrom(dplyr,distinct)
56 57
 importFrom(dplyr,filter)
57 58
 importFrom(dplyr,full_join)
... ...
@@ -64,7 +65,6 @@ importFrom(dplyr,rename)
64 65
 importFrom(dplyr,select)
65 66
 importFrom(dplyr,semi_join)
66 67
 importFrom(dplyr,slice)
67
-importFrom(dplyr,slice_head)
68 68
 importFrom(forcats,as_factor)
69 69
 importFrom(forcats,fct_inseq)
70 70
 importFrom(fs,as_fs_path)
... ...
@@ -75,6 +75,7 @@ importFrom(fs,file_exists)
75 75
 importFrom(fs,is_dir)
76 76
 importFrom(fs,path)
77 77
 importFrom(fs,path_dir)
78
+importFrom(fs,path_ext)
78 79
 importFrom(fs,path_wd)
79 80
 importFrom(ggrepel,geom_label_repel)
80 81
 importFrom(grDevices,colorRamp)
... ...
@@ -88,6 +89,7 @@ importFrom(htmltools,h4)
88 89
 importFrom(htmltools,save_html)
89 90
 importFrom(htmltools,tagList)
90 91
 importFrom(htmltools,tags)
92
+importFrom(lubridate,parse_date_time)
91 93
 importFrom(magrittr,`%>%`)
92 94
 importFrom(psych,describe)
93 95
 importFrom(purrr,flatten)
... ...
@@ -98,6 +100,7 @@ importFrom(purrr,map)
98 100
 importFrom(purrr,map2)
99 101
 importFrom(purrr,map2_dfr)
100 102
 importFrom(purrr,map2_lgl)
103
+importFrom(purrr,map_chr)
101 104
 importFrom(purrr,map_dbl)
102 105
 importFrom(purrr,map_dfr)
103 106
 importFrom(purrr,map_lgl)
... ...
@@ -113,14 +116,23 @@ importFrom(reactable,colDef)
113 116
 importFrom(reactable,colFormat)
114 117
 importFrom(reactable,reactable)
115 118
 importFrom(reactable,reactableTheme)
119
+importFrom(readr,cols)
116 120
 importFrom(readr,parse_factor)
121
+importFrom(readr,problems)
122
+importFrom(readr,read_delim)
117 123
 importFrom(readr,write_tsv)
124
+importFrom(readxl,read_excel)
118 125
 importFrom(rlang,.data)
119 126
 importFrom(rlang,`:=`)
127
+importFrom(rlang,abort)
120 128
 importFrom(rlang,arg_match)
129
+importFrom(rlang,call2)
130
+importFrom(rlang,dots_list)
121 131
 importFrom(rlang,env_bind)
122 132
 importFrom(rlang,eval_tidy)
123 133
 importFrom(rlang,expr)
134
+importFrom(rlang,fn_fmls_names)
135
+importFrom(rlang,inform)
124 136
 importFrom(rlang,parse_expr)
125 137
 importFrom(stats,median)
126 138
 importFrom(stats,na.omit)
... ...
@@ -148,6 +160,7 @@ importFrom(tidyr,pivot_wider)
148 160
 importFrom(tidyr,separate)
149 161
 importFrom(tidyr,unite)
150 162
 importFrom(tidyr,unnest)
163
+importFrom(tools,file_path_sans_ext)
151 164
 importFrom(utils,read.csv)
152 165
 importFrom(utils,read.delim)
153 166
 importFrom(utils,tail)
... ...
@@ -1,5 +1,50 @@
1 1
 \title{ISAnalytics News}
2 2
 
3
+# ISAnalytics 1.1.10 (2021-04-08)
4
+
5
+## FIXES
6
+
7
+* Fixed issue in `compute_near_integrations`: when the recalibration map
8
+export path is provided as a folder, the function now works correctly and produces
9
+an automatically generated file name
10
+* Fixed issue in `aggregate_metadata`: the path to the folder that contains
11
+VISPA2 stats is now looked up correctly. Also, VISPA2 stats columns are aggregated
12
+if found in the input data frame, independently of the parameter
13
+`import_stats`.
14
+
15
+## IMPROVEMENTS
16
+
17
+* `compute_abundance` can now take aggregated matrices as input and has
18
+additional parameters to offer more flexibility to the user. Major updates
19
+and improvements also to the documentation and reproducible examples.
20
+* Major improvements in function `import_single_Vispa2Matrix`: import is 
21
+now preferentially carried out using `data.table::fread`, greatly speeding up
22
+the process; where that is not possible, `readr::read_delim` is used instead
23
+* Major improvements in function `import_association_file`: greatly improved
24
+parsing precision (each column has a dedicated type); the import report now
25
+signals parsing problems and their location, as well as
26
+problems in parsing dates.
27
+The report also includes potential problems in column names and signals missing
28
+data in important columns. Also added the possibility to supply various file
29
+formats as input, including `*.xls(x)`.
30
+* Function `top_integrations` can now take additional parameters to compute
31
+the top n integrations for each specified group
32
+* Removed faceting parameters in `CIS_volcano_plot` due to poor precision
33
+(easier to add faceting manually) and added parameters to return the 
34
+data frame that generated the plot as an additional result. Also, it is 
35
+now possible to specify a vector of gene names to highlight even if they're
36
+not above the annotation threshold.
37
+
38
+## MINOR
39
+
40
+* The ISAnalytics website has an improved graphic theme and an additional button
41
+on the right that leads to the devel (or release) version of the website
42
+* Updated vignettes
43
+
44
+## FOR DEVS ONLY
45
+
46
+* Complete rework of the test suite to be compliant with testthat v3
47
+
3 48
 # ISAnalytics 1.1.9 (2021-02-17)
4 49
 
5 50
 ## FIXES
... ...
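
The `compute_abundance` rework listed under IMPROVEMENTS boils down to two new arguments, `key` and `keep_totals`. A minimal sketch on a made-up integration matrix (sample IDs and counts are placeholders, not package data):

library(ISAnalytics)
library(tibble)

# Toy integration matrix containing the mandatory IS variables
ism <- tibble(
    chr = c("1", "1", "2"),
    integration_locus = c(14505, 1029, 99867),
    strand = c("+", "-", "+"),
    CompleteAmplificationID = c("ID1", "ID1", "ID2"),
    seqCount = c(10, 30, 40)
)

# Relative abundance per sample, with group totals returned separately
ab <- compute_abundance(ism,
    columns = "seqCount",
    key = "CompleteAmplificationID",
    keep_totals = "df"
)
ab$abundance_df
ab$quant_totals
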
@@ -21,7 +21,7 @@
21 21
 #' @export
22 22
 #'
23 23
 #' @examples
24
-#' op <- options("ISAnalytics.widgets" = FALSE)
24
+#' op <- options("ISAnalytics.widgets" = FALSE, "ISAnalytics.verbose" = FALSE)
25 25
 #' path_AF <- system.file("extdata", "ex_association_file.tsv",
26 26
 #'     package = "ISAnalytics"
27 27
 #' )
... ...
@@ -180,13 +180,13 @@ aggregate_metadata <- function(association_file,
180 180
 #' @family Aggregate functions
181 181
 #'
182 182
 #' @importFrom purrr walk
183
-#' @importFrom rlang expr eval_tidy
183
+#' @importFrom rlang expr eval_tidy abort
184 184
 #'
185 185
 #' @return A list of tibbles or a single tibble according to input
186 186
 #' @export
187 187
 #'
188 188
 #' @examples
189
-#' op <- options("ISAnalytics.widgets" = FALSE)
189
+#' op <- options("ISAnalytics.widgets" = FALSE, "ISAnalytics.verbose" = FALSE)
190 190
 #' path_AF <- system.file("extdata", "ex_association_file.tsv",
191 191
 #'     package = "ISAnalytics"
192 192
 #' )
... ...
@@ -201,12 +201,12 @@ aggregate_metadata <- function(association_file,
201 201
 #'     matrix_type = "annotated", workers = 2, matching_opt = "ANY"
202 202
 #' )
203 203
 #' agg <- aggregate_values_by_key(
204
-#'     x = matrices$seqCount,
205
-#'     association_file = association_file
204
+#'     x = matrices,
205
+#'     association_file = association_file,
206
+#'     value_cols = c("fragmentEstimate", "seqCount")
206 207
 #' )
207 208
 #' options(op)
208
-aggregate_values_by_key <- function(
209
-    x,
209
+aggregate_values_by_key <- function(x,
210 210
     association_file,
211 211
     value_cols = "Value",
212 212
     key = c(
... ...
@@ -225,39 +225,55 @@ aggregate_values_by_key <- function(
225 225
         purrr::walk(x, function(df) {
226 226
             stopifnot(is.data.frame(df))
227 227
             if (.check_mandatory_vars(df) == FALSE) {
228
-                stop(.non_ISM_error())
228
+                rlang::abort(.non_ISM_error())
229 229
             }
230 230
             if (.check_complAmpID(df) == FALSE) {
231
-                stop(.missing_complAmpID_error())
231
+                rlang::abort(.missing_complAmpID_error())
232 232
             }
233 233
             if (!all(value_cols %in% colnames(df))) {
234
-                stop(.missing_user_cols_error())
234
+                rlang::abort(.missing_user_cols_error(
235
+                    value_cols[!value_cols %in% colnames(df)]
236
+                ))
235 237
             }
236
-            purrr::walk(value_cols, function(col) {
237
-                expr <- rlang::expr(`$`(df, !!col))
238
-                if (!is.numeric(rlang::eval_tidy(expr)) &&
239
-                    !is.integer(rlang::eval_tidy(expr))) {
240
-                    stop(.non_num_user_cols_error())
238
+            is_numeric_col <- purrr::map_lgl(value_cols, function(col) {
239
+                if (!is.double(df[[col]]) &&
240
+                    !is.integer(df[[col]])) {
241
+                    FALSE
242
+                } else {
243
+                    TRUE
241 244
                 }
242
-            })
245
+            }) %>% purrr::set_names(value_cols)
246
+            if (any(!is_numeric_col)) {
247
+                rlang::abort(.non_num_user_cols_error(
248
+                    names(is_numeric_col)[!is_numeric_col]
249
+                ))
250
+            }
243 251
         })
244 252
     } else {
245 253
         if (.check_mandatory_vars(x) == FALSE) {
246
-            stop(.non_ISM_error())
254
+            rlang::abort(.non_ISM_error())
247 255
         }
248 256
         if (.check_complAmpID(x) == FALSE) {
249
-            stop(.missing_complAmpID_error())
257
+            rlang::abort(.missing_complAmpID_error())
250 258
         }
251 259
         if (!all(value_cols %in% colnames(x))) {
252
-            stop(.missing_user_cols_error())
260
+            rlang::abort(.missing_user_cols_error(
261
+                value_cols[!value_cols %in% colnames(x)]
262
+            ))
253 263
         }
254
-        purrr::walk(value_cols, function(col) {
255
-            expr <- rlang::expr(`$`(x, !!col))
256
-            if (!is.numeric(rlang::eval_tidy(expr)) &&
257
-                !is.integer(rlang::eval_tidy(expr))) {
258
-                stop(.non_num_user_cols_error())
264
+        is_numeric_col <- purrr::map_lgl(value_cols, function(col) {
265
+            if (!is.double(x[[col]]) &&
266
+                !is.integer(x[[col]])) {
267
+                FALSE
268
+            } else {
269
+                TRUE
259 270
             }
260
-        })
271
+        }) %>% purrr::set_names(value_cols)
272
+        if (any(!is_numeric_col)) {
273
+            rlang::abort(.non_num_user_cols_error(
274
+                names(is_numeric_col)[!is_numeric_col]
275
+            ))
276
+        }
261 277
     }
262 278
     # Check association file
263 279
     stopifnot(is.data.frame(association_file))
... ...
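
The reworked numeric-column validation in the hunk above, as a standalone sketch (the data frame and column names are made up):

library(purrr)
library(rlang)

df <- data.frame(Value = c("10", "20"), seqCount = c(1L, 2L))
value_cols <- c("Value", "seqCount")

# TRUE for double/integer columns, FALSE otherwise, named by column
is_numeric_col <- purrr::set_names(
    purrr::map_lgl(value_cols, ~ is.double(df[[.x]]) || is.integer(df[[.x]])),
    value_cols
)

if (any(!is_numeric_col)) {
    # Abort, passing along only the offending column names
    # (as the diff does via .non_num_user_cols_error())
    rlang::abort(paste(
        "Non-numeric value columns:",
        paste(names(is_numeric_col)[!is_numeric_col], collapse = ", ")
    ))
}
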
@@ -1,11 +1,12 @@
1 1
 #------------------------------------------------------------------------------#
2 2
 # Analysis functions
3 3
 #------------------------------------------------------------------------------#
4
-#' Computes the abundance of every integration in the sample.
4
+
5
+#' Computes the abundance for every integration event in the input data frame.
5 6
 #'
6 7
 #' \lifecycle{maturing}
7
-#' Abundance is obtained for every row by calculating the ratio
8
-#' between the single value and the total value for the sample.
8
+#' Abundance is obtained for every integration event by calculating the ratio
9
+#' between the single value and the total value for the given group.
9 10
 #'
10 11
 #' @details Abundance will be computed upon the user selected columns
11 12
 #' in the `columns` parameter. For each column a corresponding
... ...
@@ -13,19 +14,28 @@
13 14
 #' column) will be produced.
14 15
 #'
15 16
 #' @param x An integration matrix - aka a data frame that includes
16
-#' the `mandatory_IS_vars()` as columns
17
+#' the `mandatory_IS_vars()` as columns. The matrix can either be aggregated
18
+#' (via `aggregate_values_by_key()`) or not.
17 19
 #' @param columns A character vector of column names to process,
18 20
 #' must be numeric or integer columns
19 21
 #' @param percentage Add abundance as percentage?
22
+#' @param key The key to group by when calculating totals
23
+#' @param keep_totals One of `TRUE`, `FALSE` or `"df"`. If `TRUE`,
24
+#' the intermediate totals for each group will be kept in the output
25
+#' data frame as a dedicated column with a trailing "_tot". If `FALSE`,
26
+#' totals won't be included in the output data frame. If `"df"`, the totals
27
+#' are returned to the user as a separate data frame, together with the
28
+#' abundance data frame.
20 29
 #'
21 30
 #' @family Analysis functions
22 31
 #'
23 32
 #' @importFrom magrittr `%>%`
24
-#' @importFrom tibble is_tibble
25 33
 #' @import dplyr
26 34
 #' @importFrom rlang .data eval_tidy parse_expr
35
+#' @importFrom purrr map_lgl
27 36
 #' @importFrom stringr str_replace
28
-#' @return An integration matrix
37
+#' @return Either a single data frame with computed abundance values or
38
+#' a list of 2 data frames (abundance_df, quant_totals)
29 39
 #' @export
30 40
 #'
31 41
 #' @examples
... ...
@@ -33,54 +43,79 @@
33 43
 #'     package = "ISAnalytics"
34 44
 #' )
35 45
 #' matrix <- import_single_Vispa2Matrix(path)
36
-#' abundance <- compute_abundance(matrix)
37
-compute_abundance <- function(x, columns = "Value", percentage = TRUE) {
46
+#'
47
+#' # Simple integration matrix - grouping by CompleteAmplificationID
48
+#' abundance1 <- compute_abundance(matrix)
49
+#' abundance1
50
+#'
51
+#' # Keeping totals as a separate data frame
52
+#' abundance2 <- compute_abundance(matrix, keep_totals = "df")
53
+#' abundance2
54
+compute_abundance <- function(x,
55
+    columns = "Value",
56
+    percentage = TRUE,
57
+    key = "CompleteAmplificationID",
58
+    keep_totals = FALSE) {
38 59
     ## Check parameters
39
-    stopifnot(tibble::is_tibble(x))
60
+    stopifnot(is.data.frame(x))
40 61
     stopifnot(is.character(columns))
62
+    stopifnot(is.character(key))
41 63
     if (.check_mandatory_vars(x) == FALSE) {
42 64
         stop(.non_ISM_error())
43 65
     }
44
-    if (.check_complAmpID(x) == FALSE) {
45
-        stop(.missing_complAmpID_error())
46
-    }
47 66
     stopifnot(is.logical(percentage) & length(percentage) == 1)
48
-    if (!all(columns %in% colnames(x))) {
49
-        stop(.missing_user_cols_error())
67
+    if (!all(columns %in% colnames(x)) | !all(key %in% colnames(x))) {
68
+        missing_cols <- c(
69
+            columns[!columns %in% colnames(x)],
70
+            key[!key %in% colnames(x)]
71
+        )
72
+        rlang::abort(.missing_user_cols_error(missing_cols))
50 73
     }
51
-    purrr::walk(columns, function(col) {
74
+    non_num_cols <- purrr::map_lgl(columns, function(col) {
52 75
         expr <- rlang::expr(`$`(x, !!col))
53
-        if (!is.numeric(rlang::eval_tidy(expr)) &
54
-            !is.numeric(rlang::eval_tidy(expr))) {
55
-            stop(.non_num_user_cols_error())
76
+        if (is.numeric(rlang::eval_tidy(expr)) |
77
+            is.integer(rlang::eval_tidy(expr))) {
78
+            return(FALSE)
79
+        } else {
80
+            return(TRUE)
56 81
         }
57 82
     })
83
+    if (any(non_num_cols)) {
84
+        stop(.non_num_user_cols_error(columns[non_num_cols]))
85
+    }
86
+    stopifnot(is.logical(keep_totals) || keep_totals == "df")
58 87
     ## Computation
88
+    ### Computes totals for each group defined by key
59 89
     totals <- x %>%
60
-        dplyr::group_by(.data$CompleteAmplificationID) %>%
90
+        dplyr::group_by(dplyr::across(dplyr::all_of(key))) %>%
61 91
         dplyr::summarise(
62 92
             dplyr::across(dplyr::all_of(columns),
63 93
                 sum,
64
-                .names = "{.col}_sum"
94
+                .names = "{.col}_tot"
65 95
             ),
66 96
             .groups = "drop"
67 97
         )
98
+    ### Computes abundance as value (for each col) / total of the corresponding
99
+    ### group (defined by key)
68 100
     abundance_df <- x %>%
69
-        dplyr::left_join(totals, by = "CompleteAmplificationID") %>%
101
+        dplyr::left_join(totals, by = key) %>%
70 102
         dplyr::mutate(dplyr::across(dplyr::all_of(columns),
71 103
             list(ab = ~ .x / rlang::eval_tidy(
72 104
                 rlang::parse_expr(
73 105
                     paste(
74 106
                         dplyr::cur_column(),
75
-                        "sum",
107
+                        "tot",
76 108
                         sep = "_"
77 109
                     )
78 110
                 )
79 111
             )),
80 112
             .names = "{.col}_RelAbundance"
81 113
         )) %>%
82
-        dplyr::select(-c(dplyr::all_of(paste(columns, "sum", sep = "_")))) %>%
83 114
         dplyr::distinct()
115
+    if (keep_totals == FALSE || keep_totals == "df") {
116
+        abundance_df <- abundance_df %>%
117
+            dplyr::select(-c(dplyr::all_of(paste(columns, "tot", sep = "_"))))
118
+    }
84 119
     if (percentage == TRUE) {
85 120
         abundance_df <- abundance_df %>%
86 121
             dplyr::mutate(
... ...
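
The totals-then-ratio computation above pairs `.names = "{.col}_tot"` with `dplyr::cur_column()` to divide each value column by its own total. The same idea as a plain dplyr sketch on a toy data frame (names and values are made up):

library(dplyr)

df <- data.frame(
    id = c("S1", "S1", "S2"),
    seqCount = c(10, 30, 40)
)

totals <- df %>%
    group_by(across(all_of("id"))) %>%
    summarise(across(all_of("seqCount"), sum, .names = "{.col}_tot"),
        .groups = "drop"
    )

df %>%
    left_join(totals, by = "id") %>%
    mutate(seqCount_RelAbundance = seqCount / seqCount_tot)
#>   id seqCount seqCount_tot seqCount_RelAbundance
#> 1 S1       10           40                  0.25
#> 2 S1       30           40                  0.75
#> 3 S2       40           40                  1.00
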
@@ -93,7 +128,11 @@ compute_abundance <- function(x, columns = "Value", percentage = TRUE) {
93 128
                 dplyr::contains("PercAbundance")
94 129
             )
95 130
     }
96
-    abundance_df
131
+    if (keep_totals == "df") {
132
+        return(list(abundance_df = abundance_df, quant_totals = totals))
133
+    } else {
134
+        return(abundance_df)
135
+    }
97 136
 }
98 137
 
99 138
 
... ...
@@ -139,9 +178,11 @@ compute_abundance <- function(x, columns = "Value", percentage = TRUE) {
139 178
 #' root_pth <- system.file("extdata", "fs.zip", package = "ISAnalytics")
140 179
 #' root <- unzip_file_system(root_pth, "fs")
141 180
 #' matrices <- import_parallel_Vispa2Matrices_auto(
142
-#'     path, root,
143
-#'     c("fragmentEstimate", "seqCount"), "annotated", 2, NULL, "ANY",
144
-#'     dates_format = "dmy"
181
+#'     association_file = path, root = root,
182
+#'     quantification_type = c("fragmentEstimate", "seqCount"),
183
+#'     matrix_type = "annotated", workers = 2, patterns = NULL,
184
+#'     matching_opt = "ANY",
185
+#'     dates_format = "dmy", multi_quant_matrix = FALSE
145 186
 #' )
146 187
 #' total_matrix <- comparison_matrix(matrices)
147 188
 #' options(op)
... ...
@@ -225,13 +266,17 @@ comparison_matrix <- function(x,
225 266
 #' )
226 267
 #' root_pth <- system.file("extdata", "fs.zip", package = "ISAnalytics")
227 268
 #' root <- unzip_file_system(root_pth, "fs")
228
-#' matrices <- import_parallel_Vispa2Matrices_auto(
229
-#'     path, root,
230
-#'     c("fragmentEstimate", "seqCount"), "annotated", 2, NULL, "ANY",
269
+#' association_file <- import_association_file(
270
+#'     path = path, root = root,
231 271
 #'     dates_format = "dmy"
232 272
 #' )
233
-#' total_matrix <- comparison_matrix(matrices)
234
-#' separated_matrix <- separate_quant_matrices(total_matrix)
273
+#' matrices <- import_parallel_Vispa2Matrices_auto(
274
+#'     association_file = association_file,
275
+#'     quantification_type = c("seqCount", "fragmentEstimate"),
276
+#'     matrix_type = "annotated", workers = 2, patterns = NULL,
277
+#'     matching_opt = "ANY"
278
+#' )
279
+#' separated_matrix <- separate_quant_matrices(matrices)
235 280
 #' options(op)
236 281
 separate_quant_matrices <- function(x, fragmentEstimate = "fragmentEstimate",
237 282
     seqCount = "seqCount",
... ...
@@ -446,33 +491,40 @@ threshold_filter <- function(x,
446 491
 }
447 492
 
448 493
 
449
-#' Sorts and keeps the top n integration sites in a data frame.
494
+#' Sorts and keeps the top n integration sites based on the values
495
+#' in a given column.
450 496
 #'
451 497
 #' \lifecycle{experimental}
452 498
 #' The input data frame will be sorted by the highest values in
453 499
 #' the columns specified and the top n rows will be returned as output.
454 500
 #' The user can choose to keep additional columns in the output
455 501
 #' by passing a vector of column names or passing 2 "shortcuts":
456
-#' * `keep` = "everything" keeps all columns in the original data frame
457
-#' * `keep` = "nothing" only keeps the mandatory columns
502
+#' * `keep = "everything"` keeps all columns in the original data frame
503
+#' * `keep = "nothing"` only keeps the mandatory columns
458 504
 #' (`mandatory_IS_vars()`) plus the columns in the `columns` parameter.
459 505
 #'
460 506
 #' @param x An integration matrix (data frame containing
461 507
 #' `mandatory_IS_vars()`)
462
-#' @param n How many rows should the output have? Must be numeric
508
+#' @param n How many integrations should be sliced (in total or
509
+#'  for each group)? Must be numeric
463 510
 #' or integer and greater than 0
464 511
 #' @param columns Columns to use for the sorting. If more than a column
465 512
 #' is supplied primary ordering is done on the first column,
466 513
 #' secondary ordering on all other columns
467 514
 #' @param keep Names of the columns to keep besides `mandatory_IS_vars()`
468 515
 #' and `columns`
516
+#' @param key Either `NULL` or a character vector of column names to group
517
+#' by. If not `NULL` the input will be grouped and the top fraction will
518
+#' be extracted from each group.
469 519
 #'
470 520
 #' @family Analysis functions
471 521
 #'
472
-#' @importFrom dplyr arrange across all_of desc slice_head select
522
+#' @import dplyr
473 523
 #' @importFrom magrittr `%>%`
524
+#' @importFrom rlang abort
474 525
 #'
475
-#' @return A data frame with `n` rows
526
+#' @return Either a data frame with at most n rows or
527
+#' a data frame with at most n*(number of groups) rows.
476 528
 #' @export
477 529
 #'
478 530
 #' @examples
... ...
@@ -490,21 +542,40 @@ threshold_filter <- function(x,
490 542
 #'     columns = c("Value", "Value2"),
491 543
 #'     keep = "nothing"
492 544
 #' )
493
-top_integrations <- function(x, n = 50, columns = "RelAbundance",
494
-    keep = "everything") {
545
+#' top_key <- top_integrations(smpl,
546
+#'     n = 3,
547
+#'     columns = "Value",
548
+#'     keep = "Value2",
549
+#'     key = "CompleteAmplificationID"
550
+#' )
551
+top_integrations <- function(x, n = 50,
552
+    columns = "fragmentEstimate_sum_RelAbundance",
553
+    keep = "everything", key = NULL) {
495 554
     stopifnot(is.data.frame(x))
496 555
     stopifnot(is.numeric(n) & length(n) == 1 & n > 0)
497 556
     stopifnot(is.character(keep))
498 557
     stopifnot(is.character(columns))
558
+    stopifnot(is.null(key) || is.character(key))
499 559
     if (!.check_mandatory_vars(x)) {
500
-        stop(.non_ISM_error())
560
+        rlang::abort(.non_ISM_error())
501 561
     }
502 562
     if (!all(columns %in% colnames(x))) {
503
-        stop(.missing_user_cols_error())
563
+        rlang::abort(.missing_user_cols_error(
564
+            columns[!columns %in% colnames(x)]
565
+        ))
504 566
     }
505 567
     if (!(all(keep == "everything") || all(keep == "nothing"))) {
506 568
         if (any(!keep %in% colnames(x))) {
507
-            stop(.missing_user_cols_error())
569
+            rlang::abort(.missing_user_cols_error(
570
+                keep[!keep %in% colnames(x)]
571
+            ))
572
+        }
573
+    }
574
+    if (!is.null(key)) {
575
+        if (!all(key %in% colnames(x))) {
576
+            rlang::abort(.missing_user_cols_error(
577
+                key[!key %in% colnames(x)]
578
+            ))
508 579
         }
509 580
     }
510 581
     essential_cols <- c(mandatory_IS_vars(), columns)
... ...
@@ -515,6 +586,18 @@ top_integrations <- function(x, n = 50, columns = "RelAbundance",
515 586
     } else {
516 587
         keep[!keep %in% essential_cols]
517 588
     }
589
+    if (!is.null(key)) {
590
+        result <- x %>%
591
+            dplyr::group_by(dplyr::across(dplyr::all_of(key))) %>%
592
+            dplyr::arrange(dplyr::across(
593
+                dplyr::all_of(columns),
594
+                dplyr::desc
595
+            ), .by_group = TRUE) %>%
596
+            dplyr::slice_head(n = n) %>%
597
+            dplyr::select(dplyr::all_of(c(key, essential_cols, to_keep))) %>%
598
+            dplyr::ungroup()
599
+        return(result)
600
+    }
518 601
     result <- x %>%
519 602
         dplyr::arrange(dplyr::across(
520 603
             dplyr::all_of(columns),
... ...
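
A toy usage sketch of the per-group slicing added above (the data frame and values are made up):

library(ISAnalytics)
library(tibble)

smpl <- tibble(
    chr = c("1", "1", "2", "2"),
    integration_locus = c(100, 200, 300, 400),
    strand = c("+", "+", "-", "-"),
    CompleteAmplificationID = c("ID1", "ID1", "ID2", "ID2"),
    Value = c(50, 10, 70, 20)
)

# Highest-Value integration for each sample
top_integrations(smpl,
    n = 1,
    columns = "Value",
    key = "CompleteAmplificationID"
)
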
@@ -602,7 +685,7 @@ top_integrations <- function(x, n = 50, columns = "RelAbundance",
602 685
 #'     association_file = association_file, root = NULL,
603 686
 #'     quantification_type = c("seqCount", "fragmentEstimate"),
604 687
 #'     matrix_type = "annotated", workers = 2, patterns = NULL,
605
-#'     matching_opt = "ANY"
688
+#'     matching_opt = "ANY", multi_quant_matrix = FALSE
606 689
 #' )
607 690
 #'
608 691
 #' stats <- sample_statistics(matrices$seqCount, association_file)
... ...
@@ -719,6 +802,7 @@ sample_statistics <- function(x, metadata,
719 802
 #' @importFrom rlang .data
720 803
 #' @importFrom magrittr `%>%`
721 804
 #' @importFrom stats median pt p.adjust
805
+#' @importFrom utils read.csv
722 806
 #'
723 807
 #' @return A data frame
724 808
 #' @export
... ...
@@ -742,7 +826,7 @@ sample_statistics <- function(x, metadata,
742 826
 #'     dates_format = "dmy"
743 827
 #' )
744 828
 #'
745
-#' cis <- CIS_grubbs(matrices$seqCount)
829
+#' cis <- CIS_grubbs(matrices)
746 830
 #'
747 831
 #' options(op)
748 832
 CIS_grubbs <- function(x,
... ...
@@ -776,28 +860,34 @@ CIS_grubbs <- function(x,
776 860
 
777 861
     # Try to import annotation file
778 862
     if (ext == "tsv") {
779
-        refgenes <- read.csv(
863
+        refgenes <- utils::read.csv(
780 864
             file = genomic_annotation_file,
781 865
             header = TRUE, fill = TRUE, sep = "\t",
782 866
             check.names = FALSE,
783 867
             na.strings = c("NONE", "NA", "NULL", "NaN", "")
784 868
         )
785 869
         refgenes <- tibble::as_tibble(refgenes) %>%
786
-            dplyr::mutate(chrom = stringr::str_replace_all(.data$chrom,
787
-                                                           "chr", ""))
870
+            dplyr::mutate(chrom = stringr::str_replace_all(
871
+                .data$chrom,
872
+                "chr", ""
873
+            ))
788 874
     } else if (ext == "csv") {
789
-        refgenes <- read.csv(
875
+        refgenes <- utils::read.csv(
790 876
             file = genomic_annotation_file,
791 877
             header = TRUE, fill = TRUE,
792 878
             check.names = FALSE,
793 879
             na.strings = c("NONE", "NA", "NULL", "NaN", "")
794 880
         )
795 881
         refgenes <- tibble::as_tibble(refgenes) %>%
796
-            dplyr::mutate(chrom = stringr::str_replace_all(.data$chrom,
797
-                                                           "chr", ""))
882
+            dplyr::mutate(chrom = stringr::str_replace_all(
883
+                .data$chrom,
884
+                "chr", ""
885
+            ))
798 886
     } else {
799
-        stop(paste("The genomic annotation file must be either in",
800
-                   ".tsv or .csv format (compressed or not)"))
887
+        stop(paste(
888
+            "The genomic annotation file must be either in",
889
+            ".tsv or .csv format (compressed or not)"
890
+        ))
801 891
     }
802 892
 
803 893
     # Check annotation file format
... ...
@@ -832,7 +922,8 @@ CIS_grubbs <- function(x,
832 922
                 stats::median(.data$integration_locus),
833 923
             distinct_orientations = dplyr::n_distinct(.data$strand),
834 924
             describe = list(tibble::as_tibble(
835
-                psych::describe(.data$integration_locus))),
925
+                psych::describe(.data$integration_locus)
926
+            )),
836 927
             .groups = "drop"
837 928
         ) %>%
838 929
         tidyr::unnest(.data$describe, keep_empty = TRUE, names_sep = "_")
... ...
@@ -1028,7 +1119,7 @@ CIS_grubbs <- function(x,
1028 1119
 #'     association_file = association_file, root = NULL,
1029 1120
 #'     quantification_type = c("seqCount", "fragmentEstimate"),
1030 1121
 #'     matrix_type = "annotated", workers = 2, patterns = NULL,
1031
-#'     matching_opt = "ANY"
1122
+#'     matching_opt = "ANY", multi_quant_matrix = FALSE
1032 1123
 #' )
1033 1124
 #'
1034 1125
 #' #### EXTERNAL AGGREGATION
... ...
@@ -60,8 +60,11 @@
60 60
 #'     dates_format = "dmy"
61 61
 #' )
62 62
 #' matrices <- import_parallel_Vispa2Matrices_auto(
63
-#'     association_file, NULL,
64
-#'     c("fragmentEstimate", "seqCount"), "annotated", 2, NULL, "ANY"
63
+#'     association_file = association_file, root = NULL,
64
+#'     quantification_type = c("fragmentEstimate", "seqCount"),
65
+#'     matrix_type = "annotated", workers = 2,
66
+#'     patterns = NULL, matching_opt = "ANY",
67
+#'     multi_quant_matrix = FALSE
65 68
 #' )
66 69
 #' matrices <- remove_collisions(matrices, association_file)
67 70
 #' options(op)
... ...
@@ -406,8 +409,11 @@ remove_collisions <- function(x,
406 409
 #'     dates_format = "dmy"
407 410
 #' )
408 411
 #' matrices <- import_parallel_Vispa2Matrices_auto(
409
-#'     association_file, NULL,
410
-#'     c("fragmentEstimate", "seqCount"), "annotated", 2, NULL, "ANY"
412
+#'     association_file = association_file, root = NULL,
413
+#'     quantification_type = c("fragmentEstimate", "seqCount"),
414
+#'     matrix_type = "annotated", workers = 2,
415
+#'     patterns = NULL, matching_opt = "ANY",
416
+#'     multi_quant_matrix = FALSE
411 417
 #' )
412 418
 #' sc_matrix <- remove_collisions(matrices$seqCount, association_file)
413 419
 #' others <- matrices[!names(matrices) %in% "seqCount"]
... ...
@@ -1,3 +1,7 @@
1
+#------------------------------------------------------------------------------#
2
+# Exported/Internal variables
3
+#------------------------------------------------------------------------------#
4
+
1 5
 #' Names of mandatory variables for an integration matrix.
2 6
 #'
3 7
 #' Contains the names of the columns that need to be present in order for a
... ...
@@ -12,6 +16,25 @@ mandatory_IS_vars <- function() {
12 16
     c("chr", "integration_locus", "strand")
13 17
 }
14 18
 
19
+# Internal: associates column types with column names for a more precise
20
+# import
21
+.mandatory_IS_types <- function(mode) {
22
+    if (mode == "fread") {
23
+        return(list(
24
+            character = c("chr", "strand"),
25
+            integer = "integration_locus"
26
+        ))
27
+    } else {
28
+        return(
29
+            list(
30
+                chr = "c",
31
+                integration_locus = "i",
32
+                strand = "c"
33
+            )
34
+        )
35
+    }
36
+}
37
+
15 38
 #' Names of the annotation variables for an integration matrix.
16 39
 #'
17 40
 #' Contains the names of the columns that are present if the integration matrix
... ...
@@ -26,6 +49,19 @@ annotation_IS_vars <- function() {
26 49
     c("GeneName", "GeneStrand")
27 50
 }
28 51
 
52
+# Internal: associates column types with column names for a more precise
53
+# import
54
+.annotation_IS_types <- function(mode) {
55
+    if (mode == "fread") {
56
+        return(list(character = c("GeneName", "GeneStrand")))
57
+    } else {
58
+        return(list(
59
+            GeneName = "c",
60
+            GeneStrand = "c"
61
+        ))
62
+    }
63
+}
64
+
29 65
 #' Names of the columns in the association file.
30 66
 #'
31 67
 #' All the names of the columns present in the association file.
... ...
@@ -55,10 +91,95 @@ association_file_columns <- function() {
55 91
         "Kapa", "ulForPool", "CompleteAmplificationID", "UniqueID",
56 92
         "StudyTestID",
57 93
         "StudyTestGroup", "MouseID", "Tigroup", "Tisource",
58
-        "PathToFolderProjectID"
94
+        "PathToFolderProjectID",
95
+        "SamplesNameCheck",
96
+        "TimepointDays", "TimepointMonths",
97
+        "TimepointYears", "ng DNA corrected"
59 98
     )
60 99
 }
61 100
 
101
+.af_col_types <- function(mode) {
102
+    if (mode == "fread") {
103
+        types <- list(
104
+            character = c(
105
+                "ProjectID", "FUSIONID", "PoolID", "TagSequence",
106
+                "SubjectID", "VectorType", "VectorID", "ExperimentID",
107
+                "Tissue", "TimePoint", "DNAFragmentation",
108
+                "PCRMethod", "TagIDextended", "Keywords",
109
+                "CellMarker", "TagID", "NGSProvider", "NGSTechnology",
110
+                "ConverrtedFilesDir", "ConverrtedFilesName",
111
+                "SourceFileFolder", "SourceFileNameR1",
112
+                "SourceFileNameR2", "DNAnumber", "LinearPCRID",
113
+                "1stExpoPCRID", "2ndExpoID", "FusionPrimerPCRID",
114
+                "Genome", "Genotype", "Notes", "AddedField1",
115
+                "AddedField2", "AddedField3", "AddedField4",
116
+                "concatenatePoolIDSeqRun", "CompleteAmplificationID",
117
+                "UniqueID", "StudyTestID", "Tigroup", "Tisource",
118
+                "PathToFolderProjectID", "SamplesNameCheck",
119
+                "DNAextractionDate", "LinearPCRDate",
120
+                "SonicationDate", "LigationDate",
121
+                "FusionPrimerPCRDate", "PoolDate", "SequencingDate",
122
+                "MOI", "AddedField6_RelativeBloodPercentage",
123
+                "TestGroup"
124
+            ),
125
+            double = c(
126
+                "DNAngUsed", "VCN", "Engraftment", "Transduction",
127
+                "AddedField7_PurityTestFeasibility",
128
+                "AddedField8_FacsSeparationPurity", "Kapa",
129
+                "ulForPool", "TimepointMonths", "TimepointYears",
130
+                "ng DNA corrected"
131
+            ),
132
+            integer = c(
133
+                "ReplicateNumber", "SequencingRound",
134
+                "StudyTestGroup", "MouseID", "TimepointDays"
135
+            )
136
+        )
137
+        return(types)
138
+    }
139
+    if (mode == "readr") {
140
+        # date_format <- unlist(strsplit(date_format))
141
+        # date_format <- paste0("%", date_format)
142
+        # date_format <- paste0(date_format, collapse = "%.")
143
+        types <- list(
144
+            ProjectID = "c", FUSIONID = "c", PoolID = "c", TagSequence = "c",
145
+            SubjectID = "c", VectorType = "c", VectorID = "c",
146
+            ExperimentID = "c", Tissue = "c", TimePoint = "c",
147
+            DNAFragmentation = "c", PCRMethod = "c", TagIDextended = "c",
148
+            Keywords = "c", CellMarker = "c", TagID = "c",
149
+            NGSProvider = "c", NGSTechnology = "c",
150
+            ConverrtedFilesDir = "c", ConverrtedFilesName = "c",
151
+            SourceFileFolder = "c", SourceFileNameR1 = "c",
152
+            SourceFileNameR2 = "c", DNAnumber = "c", LinearPCRID = "c",
153
+            `1stExpoPCRID` = "c", `2ndExpoID` = "c",
154
+            FusionPrimerPCRID = "c", Genome = "c", Genotype = "c",
155
+            Notes = "c", AddedField1 = "c",
156
+            AddedField2 = "c", AddedField3 = "c", AddedField4 = "c",
157
+            concatenatePoolIDSeqRun = "c", CompleteAmplificationID = "c",
158
+            UniqueID = "c", StudyTestID = "c", Tigroup = "c", Tisource = "c",
159
+            PathToFolderProjectID = "c", SamplesNameCheck = "c",
160
+            DNAextractionDate = "c",
161
+            LinearPCRDate = "c",
162
+            SonicationDate = "c",
163
+            LigationDate = "c",
164
+            FusionPrimerPCRDate = "c",
165
+            PoolDate = "c",
166
+            SequencingDate = "c",
167
+            MOI = "c", AddedField6_RelativeBloodPercentage = "c",
168
+            DNAngUsed = "d", VCN = "d", Engraftment = "d", Transduction = "d",
169
+            AddedField7_PurityTestFeasibility = "d",
170
+            AddedField8_FacsSeparationPurity = "d", Kapa = "d",
171
+            ulForPool = "d", TimepointMonths = "d", TimepointYears = "d",
172
+            `ng DNA corrected` = "d",
173
+            ReplicateNumber = "i", SequencingRound = "i", TestGroup = "c",
174
+            MouseID = "i", TimepointDays = "i",
175
+            `1stExpoPCRDate` = "c",
176
+            `2ndExpoDate` = "c",
177
+            StudyTestGroup = "i"
178
+        )
179
+        return(types)
180
+    }
181
+}
182
+
62 183
 #' Names of the columns of the association file to consider for
63 184
 #' Vispa2 launch.
64 185
 #'
... ...
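
The `readr` branch above uses the compact collector codes ("c" = character, "d" = double, "i" = integer). A sketch of how such a map translates into a readr column specification (the file path is a placeholder):

library(readr)

# Small excerpt of the type map above
type_map <- list(ProjectID = "c", DNAngUsed = "d", ReplicateNumber = "i")

spec <- do.call(readr::cols, c(type_map, list(.default = readr::col_guess())))
af <- readr::read_delim("association_file.tsv",
    delim = "\t",
    col_types = spec
)
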
@@ -80,3 +201,23 @@ reduced_AF_columns <- function() {
80 201
         "PoolID"
81 202
     )
82 203
 }
204
+
205
+# Names of the columns of iss stats considered for aggregation
206
+# USED IN:
207
+# - .join_and_aggregate
208
+.agg_iss_cols <- function() {
209
+    c(
210
+        "BARCODE_MUX", "TRIMMING_FINAL_LTRLC",
211
+        "LV_MAPPED",
212
+        "BWA_MAPPED_OVERALL",
213
+        "ISS_MAPPED_PP"
214
+    )
215
+}
216
+
217
+.compressed_formats <- function() {
218
+    c("gz", "bz2", "xz", "zip")
219
+}
220
+
221
+.supported_fread_compression_formats <- function() {
222
+    c("gz", "bz2")
223
+}
... ...
@@ -1,20 +1,30 @@
1 1
 #------------------------------------------------------------------------------#
2 2
 # Importing functions
3 3
 #------------------------------------------------------------------------------#
4
+
4 5
 #' Import a single integration matrix from file
5 6
 #'
6 7
 #' @description \lifecycle{stable}
7 8
 #' This function allows to read and import an integration matrix
8 9
 #' produced as the output of Vispa2 pipeline and converts it to a tidy
9
-#' tibble.
10
+#' format.
10 11
 #'
11 12
 #' @param path The path to the file on disk
13
+#' @param to_exclude Either NULL or a character vector of column names that
14
+#' should be ignored when importing
15
+#' @param separator The column delimiter used
12 16
 #'
13
-#' @return A tidy tibble
17
+#' @return A data.table object in tidy format
14 18
 #' @family Import functions
19
+#' @importFrom rlang abort inform
20
+#' @importFrom fs path_ext
21
+#' @importFrom readr read_delim cols
15 22
 #' @importFrom tidyr separate
16
-#' @importFrom utils read.csv
17 23
 #' @importFrom magrittr `%>%`
24
+#' @importFrom dplyr mutate
25
+#' @importFrom stringr str_replace
26
+#' @importFrom BiocParallel SnowParam MulticoreParam bplapply bpstop
27
+#' @importFrom data.table melt.data.table rbindlist
18 28
 #' @details The import series of functions is designed to work in combination
19 29
 #' with the use of Vispa2 pipeline, please refer to this article for more
20 30
 #' details: \href{https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5702242/}{VISPA2:
... ...
@@ -29,30 +39,139 @@
29 39
 #'     package = "ISAnalytics"
30 40
 #' )
31 41
 #' isa_dataframe <- import_single_Vispa2Matrix(path_to_file)
32
-import_single_Vispa2Matrix <- function(path) {
42
+import_single_Vispa2Matrix <- function(path,
43
+    to_exclude = NULL,
44
+    separator = "\t") {
33 45
     stopifnot(!missing(path) & is.character(path))
34 46
     if (!file.exists(path)) {
35
-        stop(paste("File not found at", path))
47
+        rlang::abort(paste("File not found at", path))
48
+    }
49
+    if (!fs::is_file(path)) {
50
+        rlang::abort(paste("Path exists but is not a file"))
36 51
     }
37
-    df <- read.csv(path,
38
-        sep = "\t", header = TRUE, fill = TRUE,
39
-        check.names = FALSE, stringsAsFactors = FALSE
52
+    mode <- "fread"
53
+    ## Is the file compressed?
54
+    is_compressed <- fs::path_ext(path) %in% .compressed_formats()
55
+    if (is_compressed) {
56
+        ## The compression type is supported by data.table::fread?
57
+        compression_type <- fs::path_ext(path)
58
+        if (!compression_type %in% .supported_fread_compression_formats()) {
59
+            ### If not, switch to classic for reading
60
+            mode <- "classic"
61
+            if (getOption("ISAnalytics.verbose") == TRUE) {
62
+                rlang::inform(.unsupported_comp_format_inf(),
63
+                    class = "unsup_comp_format"
64
+                )
65
+            }
66
+        }
67
+    }
68
+    ### Peek at headers
69
+    peek_headers <- readr::read_delim(path,
70
+        delim = separator, n_max = 0,
71
+        col_types = readr::cols()
40 72
     )
41
-    df <- tibble::as_tibble(df)
42
-    df_type <- .auto_detect_type(df)
73
+    ## - Detect type
74
+    df_type <- .auto_detect_type(peek_headers)
75
+    if (df_type == "MALFORMED") {
76
+        rlang::abort(.malformed_ISmatrix_error(),
77
+            class = "malformed_ism"
78
+        )
79
+    }
80
+    is_annotated <- .is_annotated(peek_headers)
81
+    ## - Start reading
82
+    if (getOption("ISAnalytics.verbose") == TRUE) {
83
+        rlang::inform(c("Reading file...", i = paste0("Mode: ", mode)))
84
+    }
85
+    df <- if (mode == "fread") {
86
+        .read_with_fread(
87
+            path = path, to_drop = to_exclude,
88
+            df_type = df_type, annotated = is_annotated,
89
+            sep = separator
90
+        )
91
+    } else {
92
+        .read_with_readr(
93
+            path = path, to_drop = to_exclude,
94
+            df_type = df_type, annotated = is_annotated,
95
+            sep = separator
96
+        )
97
+    }
98
+    ## - Report summary
99
+    if (getOption("ISAnalytics.verbose") == TRUE) {
100
+        rlang::inform(.summary_ism_import_msg(
101
+            df_type,
102
+            .is_annotated(df),
103
+            dim(df),
104
+            mode
105
+        ),
106
+        class = "ism_import_summary"
107
+        )
108
+    }
43 109
     if (df_type == "OLD") {
44
-        df <- df %>% tidyr::separate(
45
-            col = .data$IS_genomicID,
46
-            into = mandatory_IS_vars(),
47
-            sep = "_", remove = TRUE,
48
-            convert = TRUE
110
+        df <- df %>%
111
+            tidyr::separate(
112
+                col = .data$IS_genomicID,
113
+                into = mandatory_IS_vars(),
114
+                sep = "_", remove = TRUE,
115
+                convert = TRUE
116
+            ) %>%
117
+            dplyr::mutate(chr = stringr::str_replace(
118
+                .data$chr, "chr", ""
119
+            ))
120
+    }
121
+    ## - Split in chunks
122
+    if (getOption("ISAnalytics.verbose") == TRUE) {
123
+        rlang::inform("Reshaping...")
124
+    }
125
+    chunks <- split(df,
126
+        by = c("chr"),
127
+        verbose = FALSE
128
+    )
129
+    ## - Melt in parallel
130
+    p <- if (.Platform$OS.type == "windows") {
131
+        BiocParallel::SnowParam(
132
+            tasks = length(chunks),
133
+            progressbar = getOption("ISAnalytics.verbose"),
134
+            exportglobals = FALSE,
135
+            stop.on.error = TRUE
136
+        )
137
+    } else {
138
+        BiocParallel::MulticoreParam(
139
+            tasks = length(chunks),
140
+            progressbar = getOption("ISAnalytics.verbose"),
141
+            exportglobals = FALSE,
142
+            stop.on.error = TRUE
49 143
         )
50 144
     }
51
-    if (df_type == "MALFORMED") {
52
-        warning(.malformed_ISmatrix_warning())
145
+    mt <- function(data, annot) {
146
+        id_vars <- if (annot) {
147
+            c(
148
+                mandatory_IS_vars(),
149
+                annotation_IS_vars()
150
+            )
151
+        } else {
152
+            mandatory_IS_vars()
153
+        }
154
+        data.table::melt.data.table(data,
155
+            id.vars = id_vars,
156
+            variable.name = "CompleteAmplificationID",
157
+            value.name = "Value",
158
+            na.rm = TRUE,
159
+            verbose = FALSE
160
+        )
161
+    }
162
+    tidy_chunks <- BiocParallel::bplapply(
163
+        X = chunks,
164
+        FUN = mt,
165
+        annot = is_annotated,
166
+        BPPARAM = p
167
+    )
168
+    BiocParallel::bpstop(p)
169
+    tidy <- data.table::rbindlist(tidy_chunks)
170
+    tidy <- tidy[Value > 0]
171
+    if (getOption("ISAnalytics.verbose") == TRUE) {
172
+        rlang::inform("Done!")
53 173
     }
54
-    isadf <- .messy_to_tidy(df)
55
-    isadf
174
+    return(tidy)
56 175
 }
57 176
 
58 177
 
... ...
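
A usage sketch of the extended `import_single_Vispa2Matrix()` signature introduced above (the file path is a placeholder; the shipped examples use `system.file()` paths instead):

library(ISAnalytics)

matrix_path <- "path/to/ism_annotated.tsv.gz" # placeholder

ism <- import_single_Vispa2Matrix(
    path = matrix_path,
    to_exclude = NULL, # optionally drop columns on import
    separator = "\t"
)
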
@@ -70,6 +189,13 @@ import_single_Vispa2Matrix <- function(path) {
70 189
 #' specified (ex: 1 becomes 0001 with a tp_padding of 4)
71 190
 #' @param dates_format A single string indicating how dates should be parsed.
72 191
 #' Must be a value in: \code{date_formats()}
192
+#' @param separator The column separator used in the file
193
+#' @param filter_for A named list where names represent column names that
194
+#' must be filtered. For example: `list(ProjectID = c("PROJECT1", "PROJECT2"))`
195
+#' will filter the association file so that it contains only those rows
196
+#' for which the value of the column "ProjectID" is one of the specified
197
+#' values. If multiple columns are present in the list all filtering
198
+#' conditions are applied as a logical AND.
73 199
 #' @param export_widget_path A path on disk to save produced widgets or NULL
74 200
 #' if the user doesn't wish to save the html file
75 201
 #' @family Import functions
... ...
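
A sketch of the new `filter_for` argument documented above; the ProjectID values are placeholders:

library(ISAnalytics)

af_path <- system.file("extdata", "ex_association_file.tsv",
    package = "ISAnalytics"
)

# Keep only rows whose ProjectID matches one of the listed values
af <- import_association_file(
    path = af_path,
    root = NULL,
    dates_format = "dmy",
    filter_for = list(ProjectID = c("PROJECT1", "PROJECT2"))
)
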
@@ -102,6 +228,9 @@ import_single_Vispa2Matrix <- function(path) {
102 228
 #' If 'NULL' the file system alignment step is skipped.
103 229
 #' @export
104 230
 #'
231
+#' @importFrom purrr map_lgl set_names is_empty
232
+#' @importFrom rlang inform
233
+#' @importFrom magrittr `%>%`
105 234
 #' @seealso \code{\link{date_formats}}
106 235
 #' @examples
107 236
 #' op <- options("ISAnalytics.widgets" = FALSE)
... ...
@@ -114,6 +243,8 @@ import_single_Vispa2Matrix <- function(path) {
114 243
 #' options(op)
115 244
 import_association_file <- function(path,
116 245
     root = NULL, tp_padding = 4, dates_format = "ymd",
246
+    separator = "\t",
247
+    filter_for = NULL,
117 248
     export_widget_path = NULL) {
118 249
     # Check parameters
119 250
     stopifnot(is.character(path) & length(path) == 1)
... ...
@@ -125,32 +256,80 @@ import_association_file <- function(path,
125 256
     stopifnot((is.numeric(tp_padding) |
126 257
         is.integer(tp_padding)) & length(tp_padding) == 1)
127 258
     stopifnot(length(dates_format) == 1 & dates_format %in% date_formats())
128
-
259
+    stopifnot(is.character(separator) && length(separator) == 1)
260
+    # Check filter
261
+    stopifnot(is.null(filter_for) ||
262
+        (is.list(filter_for) && !is.null(names(filter_for))))
129 263
     # Read file and check the correctness
130
-    af_checks <- .manage_association_file(path, root, tp_padding, dates_format)
264
+    af_checks <- .manage_association_file(
265
+        path, root, tp_padding, dates_format,
266
+        separator, filter_for
267
+    )
131 268
     as_file <- af_checks$af
269
+    parsing_problems <- af_checks$parsing_probs
270
+    date_problems <- af_checks$date_probs
132 271
     checks <- af_checks$check
133
-
134
-    if (!is.null(checks)) {
135
-        # Checks if the association file and the file system are aligned
272
+    col_probs <- NULL
273
+    if (!.check_af_correctness(as_file)) {
274
+        col_probs[["missing"]] <- association_file_columns()[
275
+            !association_file_columns() %in% colnames(as_file)
276
+        ]
277
+    }
278
+    non_standard <- colnames(as_file)[
279
+        !colnames(as_file) %in% c(association_file_columns(), "Path")
280
+    ]
281
+    if (!purrr::is_empty(non_standard)) {
282
+        col_probs[["non_standard"]] <- non_standard
283
+    }
284
+    missing_dates <- purrr::map_lgl(date_columns_coll(), function(date_col) {
285
+        any(is.na(as_file[[date_col]]))
286
+    }) %>% purrr::set_names(date_columns_coll())
287
+    missing_dates <- names(missing_dates)[missing_dates == TRUE]
288
+    something_to_report <- any(!is.null(c(
289
+        parsing_problems,
290
+        date_problems,
291
+        checks,
292
+        col_probs,
293
+        missing_dates
294
+    )))
295
+    if (something_to_report) {
296
+        summary_report <- .summary_af_import_msg(
297
+            pars_prob = parsing_problems, dates_prob = date_problems,
298
+            cols_prob = col_probs, crit_na = missing_dates,
299
+            checks = ifelse(is.null(checks),
300
+                yes = "skipped",
301
+                no = ifelse(any(!checks$Found),
302
+                    "problems detected",
303
+                    "no problems detected"
304
+                )
305
+            )
306
+        )
136 307
         if (getOption("ISAnalytics.widgets") == TRUE) {
137 308
             withCallingHandlers(
138 309
                 expr = {
139 310
                     withRestarts(
140 311
                         {
141
-                            widg <- .checker_widget(checks)
312
+                            widg <- .checker_widget(
313
+                                parsing_problems,
314
+                                date_problems,
315
+                                checks,
316
+                                col_probs,
317
+                                missing_dates
318
+                            )
142 319
                             print(widg)
143 320
                         },
144 321
                         print_err = function() {
145
-                            message(.widgets_error())
322
+                            rlang::inform(.widgets_error())
146 323
                             if (getOption("ISAnalytics.verbose") == TRUE) {
147
-                                print(checks, n = nrow(checks))
324
+                                rlang::inform(summary_report,
325
+                                    class = "summary_report"
326
+                                )
148 327
                             }
149 328
                         }
150 329
                     )
151 330
                 },
152 331
                 error = function(cnd) {
153
-                    message(conditionMessage(cnd))
332
+                    rlang::inform(conditionMessage(cnd))
154 333
                     invokeRestart("print_err")
155 334
                 }
156 335
             )
... ...
@@ -161,7 +340,9 @@ import_association_file <- function(path,
161 340
                 )
162 341
             }
163 342
         } else if (getOption("ISAnalytics.verbose") == TRUE) {
164
-            print(checks, n = nrow(checks), width = Inf)
343
+            rlang::inform(summary_report,
344
+                class = "summary_report"
345
+            )
165 346
         }
166 347
     }
167 348
     as_file
... ...
@@ -188,38 +369,34 @@ import_association_file <- function(path,
188 369
 #' if necessary, duplicate files. During the execution, a series of reports is
189 370
 #' shown in html format.
190 371
 #' @param association_file A single string containing the path to the
191
-#' association file on disk, or a tibble resulting from the previous call of
372
+#' association file on disk, or a data frame resulting from a previous call to
192 373
 #' `import_association_file`
193
-#' @param root A single string containing the path to the root folder containing
194
-#' Vispa2 output. Can be NULL if association_file parameter is a tibble
195 374
 #' @param quantification_type A vector of requested quantification_types. Must
196 375
 #' be one in `quantification_types()`
197 376
 #' @param matrix_type A single string representing the type of matrices to
198 377
 #' be imported. Can only be one in `"annotated"` or `"not_annotated"`
199
-#' @param workers A single number representing the number of parallel workers to
200
-#' use for the import
201
-#' @param tp_padding Timepoint padding, indicates the number of digits of the
202
-#' "Timepoint" column once imported. Fills the content with 0s up to the length
203
-#' specified (ex: 1 becomes 0001 with a tp_padding of 4)
204
-#' @param dates_format A single string indicating how dates should be parsed.
205
-#' Must be a value in: \code{date_formats()}
378
+#' @param workers A single integer representing the number
379
+#' of parallel workers to use for the import
206 380
 #' @param multi_quant_matrix If set to TRUE will produce a
207 381
 #' multi-quantification matrix (data frame) through `comparison_matrix`
208 382
 #' instead of a list.
209
-#' @param export_widget_path A path on disk to save produced widgets or NULL
210
-#' if the user doesn't wish to save the html file
211
-#' @param ... Additional arguments to pass to `comparison_matrix`
383
+#' @param export_report_path A path on disk to save produced import report
384
+#'  or NULL if the user doesn't wish to save the html file
385
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Additional named arguments
386
+#' to pass to `import_association_file` and `comparison_matrix`
212 387
 #'
213
-#' @seealso \code{\link{comparison_matrix}}
388
+#' @seealso \code{\link{comparison_matrix}},
389
+#' \code{\link{import_association_file}}
214 390
 #'
215 391
 #' @importFrom htmltools browsable tagList
216
-#' @importFrom tibble is_tibble
217
-#' @importFrom tidyr unnest
218
-#' @importFrom dplyr select
392
+#' @importFrom dplyr filter
393
+#' @importFrom rlang dots_list inform abort call2 eval_tidy fn_fmls_names
394
+#' @importFrom magrittr `%>%`
219 395
 #' @family Import functions
220 396
 #'
221
-#' @return A named list of tibbles containing data from all imported integration
222
-#' matrices, divided by quantification type
397
+#' @return A named list of data frames containing data from
398
+#' all imported integration
399
+#' matrices, divided by quantification type or a multi-quantification matrix
223 400
 #' @export
224 401
 #'
225 402
 #' @examples
... ...
@@ -227,28 +404,24 @@ import_association_file <- function(path,
227 404
 #' # Can't run because it's interactive and requires user input
228 405
 #' matrices <- import_parallel_Vispa2Matrices_interactive(
229 406
 #'     association_file,
230
-#'     root, quantification_type, matrix_type, workers,
231
-#'     dates_format = "dmy"
407
+#'     quantification_type,
408
+#'     matrix_type = "annotated",
409
+#'     workers = 2,
410
+#'     multi_quant_matrix = FALSE,
411
+#'     export_report_path = NULL,
232 412
 #' )
233 413
 #' }
234 414
 import_parallel_Vispa2Matrices_interactive <- function(association_file,
235
-    root,
236 415
     quantification_type,
237 416
     matrix_type = "annotated",
238 417
     workers = 2,
239
-    tp_padding = 4,
240
-    dates_format = "ymd",
241
-    multi_quant_matrix = FALSE,
242
-    export_widget_path = NULL,
418
+    multi_quant_matrix = TRUE,
419
+    export_report_path = NULL,
243 420
     ...) {
244 421
     # Check parameters
245
-    stopifnot(!missing(association_file))
246
-    stopifnot(is.character(association_file) |
422
+    stopifnot((is.character(association_file) &
423
+        length(association_file) == 1) ||
247 424
         is.data.frame(association_file))
248
-    stopifnot((is.character(root) && length(root) == 1) | is.null(root))
249
-    if (is.character(association_file)) {
250
-        stopifnot(length(association_file) == 1)
251
-    }
252 425
     stopifnot(is.numeric(workers) & length(workers) == 1)
253 426
     workers <- floor(workers)
254 427
     stopifnot(!missing(quantification_type))
... ...
@@ -257,18 +430,49 @@ import_parallel_Vispa2Matrices_interactive <- function(association_file,
257 430
         "annotated",
258 431
         "not_annotated"
259 432
     ))
260
-    stopifnot((is.numeric(tp_padding) |
261
-        is.integer(tp_padding)) & length(tp_padding) == 1)
262
-    stopifnot(length(dates_format) == 1 & dates_format %in% date_formats())
263 433
     stopifnot(is.logical(multi_quant_matrix) & length(multi_quant_matrix) == 1)
264
-
265
-    # Manage association file
266
-    association_file <- .manage_association_file(
267
-        association_file, root,
268
-        tp_padding, dates_format
269
-    )
270
-    checker_widg <- association_file[[2]]
271
-    association_file <- association_file[[1]]
434
+    ## Collect dot args
435
+    if (is.character(association_file) || isTRUE(multi_quant_matrix)) {
436
+        dots_args <- rlang::dots_list(..., .named = TRUE, .homonyms = "first")
437
+        if (is.character(association_file)) {
438
+            import_af_arg_names <- rlang::fn_fmls_names(import_association_file)
439
+            import_af_arg_names <- import_af_arg_names[
440
+                import_af_arg_names != "path"
441
+            ]
442
+            import_af_args <- dots_args[names(dots_args) %in%
443
+                import_af_arg_names]
444
+        }
445
+        if (isTRUE(multi_quant_matrix)) {
446
+            mult_arg_names <- rlang::fn_fmls_names(comparison_matrix)
447
+            mult_arg_names <- mult_arg_names[mult_arg_names != "x"]
448
+            mult_args <- dots_args[names(dots_args) %in%
449
+                mult_arg_names]
450
+        }
451
+    }
452
+    ## Import association file if provided a path
453
+    if (is.character(association_file)) {
454
+        association_file <- rlang::eval_tidy(
455
+            rlang::call2("import_association_file",
456
+                path = association_file,
457
+                !!!import_af_args
458
+            )
459
+        )
460
+    }