Browse code

[UPDATE] Partial update for v. 1.5.4 - major changes, switch to dynamic vars use and import functions adaptation

Giulia Pais authored on 23/03/2022 16:43:23
Showing 47 changed files

... ...
@@ -47,8 +47,14 @@ export(realign_after_collisions)
47 47
 export(reduced_AF_columns)
48 48
 export(refGene_table_cols)
49 49
 export(remove_collisions)
50
+export(reset_af_columns_def)
51
+export(reset_annotation_IS_vars)
52
+export(reset_mandatory_IS_vars)
50 53
 export(sample_statistics)
51 54
 export(separate_quant_matrices)
55
+export(set_af_columns_def)
56
+export(set_annotation_IS_vars)
57
+export(set_mandatory_IS_vars)
52 58
 export(sharing_heatmap)
53 59
 export(sharing_venn)
54 60
 export(threshold_filter)
... ...
@@ -2,183 +2,233 @@
2 2
 # Exported/Internal variables
3 3
 #------------------------------------------------------------------------------#
4 4
 
5
-#' Names of mandatory variables for an integration matrix.
6
-#'
7
-#' Contains the names of the columns that need to be present in order for a
8
-#' tibble to be considered an integration matrix.
9
-#'
10
-#' @return A character vector
11
-#' @export
12
-#'
13
-#' @examples
14
-#' mandatory_IS_vars()
15
-mandatory_IS_vars <- function() {
16
-    c("chr", "integration_locus", "strand")
5
+# Internal: default mandatory IS vars and associated column types.
6
+# The combination of these fields defines a unique integration site.
7
+.default_mandatory_IS_vars <- function() {
8
+  tibble::tribble(
9
+    ~ names, ~ types, ~ transform, ~ flag, ~ tag,
10
+    "chr", "char", NULL, "required", "chromosome",
11
+    "integration_locus", "int", NULL, "required", "locus",
12
+    "strand", "char", NULL, "required", "is_strand"
13
+  )
17 14
 }
18 15
 
19
-# Internal: associates column types with column names for a more precise
20
-# import
21
-.mandatory_IS_types <- function(mode) {
22
-    if (mode == "fread") {
23
-        return(list(
24
-            character = c("chr", "strand"),
25
-            integer = "integration_locus"
26
-        ))
27
-    } else {
28
-        return(
29
-            list(
30
-                chr = "c",
31
-                integration_locus = "i",
32
-                strand = "c"
33
-            )
34
-        )
35
-    }
16
+# Internal: default genomic annotation IS vars and associated column types.
17
+.default_annotation_IS_vars <- function() {
18
+  tibble::tribble(
19
+    ~ names, ~ types, ~ transform, ~ flag, ~ tag,
20
+    "GeneName", "char", NULL, "required", "gene_symbol",
21
+    "GeneStrand", "char", NULL, "required", "gene_strand"
22
+  )
36 23
 }
37 24
 
38
-#' Names of the annotation variables for an integration matrix.
39
-#'
40
-#' Contains the names of the columns that are present if the integration matrix
41
-#' is annotated.
42
-#'
43
-#' @return A character vector
44
-#' @export
45
-#'
46
-#' @examples
47
-#' annotation_IS_vars()
48
-annotation_IS_vars <- function() {
49
-    c("GeneName", "GeneStrand")
25
+# Internal: default association file columns and types
26
+.default_af_cols <- function() {
27
+  tibble::tribble(
28
+    ~ names, ~ types, ~ transform, ~ flag, ~ tag,
29
+    "ProjectID", "char", NULL, "required", "project_id",
30
+    "FUSIONID", "char", NULL, "optional", NA_character_,
31
+    "PoolID", "char", NULL, "required", "pool_id",
32
+    "TagSequence", "char", NULL, "required", "tag_seq",
33
+    "SubjectID", "char", NULL, "required", "subject",
34
+    "VectorType", "char", NULL, "optional", NA_character_,
35
+    "VectorID", "char", NULL, "required", NA_character_,
36
+    "ExperimentID", "char", NULL, "optional", NA_character_,
37
+    "Tissue", "char", NULL, "required", "tissue",
38
+    "TimePoint", "char", ~ stringr::str_pad(.x, 4, side = "left", pad = "0"),
39
+    "required", "tp_days",
40
+    "DNAFragmentation", "char", NULL, "optional", NA_character_,
41
+    "PCRMethod", "char", NULL, "required", "pcr_method",
42
+    "TagIDextended", "char", NULL, "optional", NA_character_,
43
+    "Keywords","char", NULL, "optional", NA_character_,
44
+    "CellMarker", "char", NULL, "required", "cell_marker",
45
+    "TagID", "char", NULL, "required", NA_character_,
46
+    "NGSProvider", "char", NULL, "optional", NA_character_,
47
+    "NGSTechnology", "char", NULL, "required", "ngs_tech",
48
+    "ConverrtedFilesDir", "char", NULL, "optional", NA_character_,
49
+    "ConverrtedFilesName", "char", NULL, "optional", NA_character_,
50
+    "SourceFileFolder", "char", NULL, "optional", NA_character_,
51
+    "SourceFileNameR1", "char", NULL, "optional", NA_character_,
52
+    "SourceFileNameR2", "char", NULL, "optional", NA_character_,
53
+    "DNAnumber", "char", NULL, "required", "dna_num",
54
+    "ReplicateNumber", "int", NULL, "required", "pcr_replicate",
55
+    "DNAextractionDate", "date", NULL, "optional", NA_character_,
56
+    "DNAngUsed", "numeric", NULL, "required", NA_character_,
57
+    "LinearPCRID", "char", NULL, "optional", NA_character_,
58
+    "LinearPCRDate", "date", NULL, "optional", NA_character_,
59
+    "SonicationDate", "date", NULL, "optional", NA_character_,
60
+    "LigationDate", "date", NULL, "optional", NA_character_,
61
+    "1stExpoPCRID", "char", NULL, "optional", NA_character_,
62
+    "1stExpoPCRDate", "date", NULL, "optional", NA_character_,
63
+    "2ndExpoID", "char", NULL, "optional", NA_character_,
64
+    "2ndExpoDate", "date", NULL, "optional", NA_character_,
65
+    "FusionPrimerPCRID", "char", NULL, "optional", NA_character_,
66
+    "FusionPrimerPCRDate", "date", NULL, "optional", NA_character_,
67
+    "PoolDate", "date", NULL, "optional", NA_character_,
68
+    "SequencingDate", "date", NULL, "required", NA_character_,
69
+    "VCN", "numeric", NULL, "required", "vcn",
70
+    "Genome", "char", NULL, "required", "genome",
71
+    "SequencingRound", "int", NULL, "optional", NA_character_,
72
+    "Genotype", "char", NULL, "optional", NA_character_,
73
+    "TestGroup", "char", NULL, "optional", NA_character_,
74
+    "MOI", "char", NULL, "optional", NA_character_,
75
+    "Engraftment", "numeric", NULL, "optional", NA_character_,
76
+    "Transduction", "numeric", NULL, "optional", NA_character_,
77
+    "Notes", "char", NULL, "optional", NA_character_,
78
+    "AddedField1", "char", NULL, "optional", NA_character_,
79
+    "AddedField2", "char", NULL, "optional", NA_character_,
80
+    "AddedField3", "char", NULL, "optional", NA_character_,
81
+    "AddedField4", "char", NULL, "optional", NA_character_,
82
+    "concatenatePoolIDSeqRun", "char", NULL,"required", "vispa_concatenate",
83
+    "AddedField6_RelativeBloodPercentage", "char", NULL, "optional",
84
+    NA_character_,
85
+    "AddedField7_PurityTestFeasibility", "char", NULL, "optional",
86
+    NA_character_,
87
+    "AddedField8_FacsSeparationPurity", "char", NULL, "optional", NA_character_,
88
+    "Kapa", "numeric", NULL, "required", NA_character_,
89
+    "ulForPool", "numeric", NULL, "required", NA_character_,
90
+    "CompleteAmplificationID", "char", NULL, "required", "pcr_repl_id",
91
+    "UniqueID", "char", NULL, "required", NA_character_,
92
+    "StudyTestID", "char", NULL, "optional", NA_character_,
93
+    "StudyTestGroup", "char", NULL, "optional", NA_character_,
94
+    "MouseID", "char", NULL, "optional", NA_character_,
95
+    "Tigroup", "char", NULL, "optional", NA_character_,
96
+    "Tisource", "char", NULL, "optional", NA_character_,
97
+    "PathToFolderProjectID", "char", NULL, "required", "proj_folder",
98
+    "SamplesNameCheck", "char", NULL, "optional", NA_character_,
99
+    "TimepointDays", "char", NULL, "optional", NA_character_,
100
+    "TimepointMonths", "char", NULL, "optional", NA_character_,
101
+    "TimepointYears", "char", NULL, "optional", NA_character_,
102
+    "ng DNA corrected", "numeric", NULL, "optional", NA_character_
103
+  )
104
+}
105
+
106
+# Internal: default columns and types of vispa2 stats cols
107
+.default_iss_stats_specs <- function() {
108
+  tibble::tribble(
109
+    ~ names, ~ types, ~ transform, ~ flag, ~ tag,
110
+    "RUN_NAME", "char", NULL, "required", NA_character_,
111
+    "POOL", "char", NULL, "required", "vispa_concatenate",
112
+    "TAG", "char", ~ stringr::str_replace_all(.x, pattern = "\\.",
113
+                                              replacement = ""), "required",
114
+    "tag_seq",
115
+    "RAW_READS", "int", NULL, "optional", NA_character_,
116
+    "QUALITY_PASSED", "int", NULL, "optional", NA_character_,
117
+    "PHIX_MAPPING", "int", NULL, "optional", NA_character_,
118
+    "PLASMID_MAPPED_BYPOOL", "int", NULL, "optional", NA_character_,
119
+    "BARCODE_MUX", "int", NULL, "required", NA_character_,
120
+    "LTR_IDENTIFIED", "int", NULL, "optional", NA_character_,
121
+    "TRIMMING_FINAL_LTRLC", "int", NULL, "optional", NA_character_,
122
+    "LV_MAPPED", "int", NULL, "optional", NA_character_,
123
+    "BWA_MAPPED_OVERALL", "int", NULL, "optional", NA_character_,
124
+    "ISS_MAPPED_OVERALL", "int", NULL, "optional", NA_character_,
125
+    "ISS_MAPPED_PP", "int", NULL, "optional", NA_character_
126
+  )
127
+}
128
+
129
+# Mappings between input format and formats requested by external parsing
130
+# functions
131
+.types_mapping <- function() {
132
+  tibble::tribble(
133
+    ~ types, ~ mapping, ~ fread,
134
+    "char", "c", "character",
135
+    "int", "i", "integer",
136
+    "logi", "l", "logical",
137
+    "numeric", "d", "numeric",
138
+    "factor", "f", "factor",
139
+    "date", "c", "charcter",
140
+    "ymd", "c", "character",
141
+    "ydm", "c", "character",
142
+    "mdy", "c", "character",
143
+    "myd", "c", "character",
144
+    "dmy", "c", "character",
145
+    "yq", "c", "character",
146
+    "ym", "c", "character",
147
+    "my", "c", "character",
148
+    "ymd_hms", "c", "character",
149
+    "ymd_hm", "c", "character",
150
+    "ymd_h", "c", "character",
151
+    "dmy_hms", "c", "character",
152
+    "dmy_hm", "c", "character",
153
+    "dmy_h", "c", "character",
154
+    "mdy_hms", "c", "character",
155
+    "mdy_hm", "c", "character",
156
+    "mdy_h", "c", "character",
157
+    "ydm_hms", "c", "character",
158
+    "ydm_hm", "c", "character",
159
+    "ydm_h", "c", "character"
160
+  )
50 161
 }
51 162
 
52 163
 # Internal: associates column types with column names for a more precise
53 164
 # import
54
-.annotation_IS_types <- function(mode) {
55
-    if (mode == "fread") {
56
-        return(list(character = c("GeneName", "GeneStrand")))
57
-    } else {
58
-        return(list(
59
-            GeneName = "c",
60
-            GeneStrand = "c"
61
-        ))
62
-    }
165
+.mandatory_IS_types <- function(mode) {
166
+  specs <- mandatory_IS_vars(include_types = TRUE)
167
+  specs_mappings <- specs %>%
168
+    dplyr::left_join(.types_mapping(), by = "types")
169
+  if (mode == "fread") {
170
+    specs_mappings <- specs_mappings %>%
171
+      dplyr::select(.data$names, .data$fread) %>%
172
+      dplyr::group_by(.data$fread)
173
+    types <- specs_mappings %>%
174
+      dplyr::group_keys() %>%
175
+      dplyr::pull(.data$fread)
176
+    specs_mappings <- specs_mappings %>%
177
+      dplyr::group_split(.keep = FALSE)
178
+    names(specs_mappings) <- types
179
+    types <- purrr::map(specs_mappings, ~ .x$names)
180
+    return(types)
181
+  }
182
+  types <- as.list(setNames(specs_mappings$mapping, specs_mappings$names))
183
+  return(types)
63 184
 }
64 185
 
65
-#' Names of the columns in the association file.
66
-#'
67
-#' All the names of the columns present in the association file.
68
-#'
69
-#' @return A character vector
70
-#' @export
71
-#'
72
-#' @examples
73
-#' association_file_columns()
74
-association_file_columns <- function() {
75
-    c(
76
-        "ProjectID", "FUSIONID", "PoolID", "TagSequence", "SubjectID",
77
-        "VectorType", "VectorID", "ExperimentID", "Tissue", "TimePoint",
78
-        "DNAFragmentation", "PCRMethod", "TagIDextended", "Keywords",
79
-        "CellMarker",
80
-        "TagID", "NGSProvider", "NGSTechnology", "ConverrtedFilesDir",
81
-        "ConverrtedFilesName", "SourceFileFolder", "SourceFileNameR1",
82
-        "SourceFileNameR2", "DNAnumber", "ReplicateNumber", "DNAextractionDate",
83
-        "DNAngUsed", "LinearPCRID", "LinearPCRDate", "SonicationDate",
84
-        "LigationDate", "1stExpoPCRID", "1stExpoPCRDate", "2ndExpoID",
85
-        "2ndExpoDate", "FusionPrimerPCRID", "FusionPrimerPCRDate",
86
-        "PoolDate", "SequencingDate", "VCN", "Genome", "SequencingRound",
87
-        "Genotype", "TestGroup", "MOI", "Engraftment", "Transduction", "Notes",
88
-        "AddedField1", "AddedField2", "AddedField3", "AddedField4",
89
-        "concatenatePoolIDSeqRun", "AddedField6_RelativeBloodPercentage",
90
-        "AddedField7_PurityTestFeasibility", "AddedField8_FacsSeparationPurity",
91
-        "Kapa", "ulForPool", "CompleteAmplificationID", "UniqueID",
92
-        "StudyTestID",
93
-        "StudyTestGroup", "MouseID", "Tigroup", "Tisource",
94
-        "PathToFolderProjectID",
95
-        "SamplesNameCheck",
96
-        "TimepointDays", "TimepointMonths",
97
-        "TimepointYears", "ng DNA corrected"
98
-    )
186
+# Internal: associates column types with column names for a more precise
187
+# import
188
+.annotation_IS_types <- function(mode) {
189
+  specs <- annotation_IS_vars(include_types = TRUE)
190
+  specs_mappings <- specs %>%
191
+    dplyr::left_join(.types_mapping(), by = "types")
192
+  if (mode == "fread") {
193
+    specs_mappings <- specs_mappings %>%
194
+      dplyr::select(.data$names, .data$fread) %>%
195
+      dplyr::group_by(.data$fread)
196
+    types <- specs_mappings %>%
197
+      dplyr::group_keys() %>%
198
+      dplyr::pull(.data$fread)
199
+    specs_mappings <- specs_mappings %>%
200
+      dplyr::group_split(.keep = FALSE)
201
+    names(specs_mappings) <- types
202
+    types <- purrr::map(specs_mappings, ~ .x$names)
203
+    return(types)
204
+  }
205
+  types <- as.list(setNames(specs_mappings$mapping, specs_mappings$names))
206
+  return(types)
99 207
 }
100 208
 
101 209
 # Internal: associates column types with column names for a more precise
102 210
 # import
103 211
 .af_col_types <- function(mode) {
104
-    if (mode == "fread") {
105
-        types <- list(
106
-            character = c(
107
-                "ProjectID", "FUSIONID", "PoolID", "TagSequence",
108
-                "SubjectID", "VectorType", "VectorID", "ExperimentID",
109
-                "Tissue", "TimePoint", "DNAFragmentation",
110
-                "PCRMethod", "TagIDextended", "Keywords",
111
-                "CellMarker", "TagID", "NGSProvider", "NGSTechnology",
112
-                "ConverrtedFilesDir", "ConverrtedFilesName",
113
-                "SourceFileFolder", "SourceFileNameR1",
114
-                "SourceFileNameR2", "DNAnumber", "LinearPCRID",
115
-                "1stExpoPCRID", "2ndExpoID", "FusionPrimerPCRID",
116
-                "Genome", "Genotype", "Notes", "AddedField1",
117
-                "AddedField2", "AddedField3", "AddedField4",
118
-                "concatenatePoolIDSeqRun", "CompleteAmplificationID",
119
-                "UniqueID", "StudyTestID", "Tigroup", "Tisource",
120
-                "PathToFolderProjectID", "SamplesNameCheck",
121
-                "DNAextractionDate", "LinearPCRDate",
122
-                "SonicationDate", "LigationDate",
123
-                "FusionPrimerPCRDate", "PoolDate", "SequencingDate",
124
-                "MOI", "AddedField6_RelativeBloodPercentage",
125
-                "TestGroup"
126
-            ),
127
-            double = c(
128
-                "DNAngUsed", "VCN", "Engraftment", "Transduction",
129
-                "AddedField7_PurityTestFeasibility",
130
-                "AddedField8_FacsSeparationPurity", "Kapa",
131
-                "ulForPool", "TimepointMonths", "TimepointYears",
132
-                "ng DNA corrected"
133
-            ),
134
-            integer = c(
135
-                "ReplicateNumber", "SequencingRound",
136
-                "StudyTestGroup", "MouseID", "TimepointDays"
137
-            )
138
-        )
139
-        return(types)
140
-    }
141
-    if (mode == "readr") {
142
-        types <- list(
143
-            ProjectID = "c", FUSIONID = "c", PoolID = "c", TagSequence = "c",
144
-            SubjectID = "c", VectorType = "c", VectorID = "c",
145
-            ExperimentID = "c", Tissue = "c", TimePoint = "c",
146
-            DNAFragmentation = "c", PCRMethod = "c", TagIDextended = "c",
147
-            Keywords = "c", CellMarker = "c", TagID = "c",
148
-            NGSProvider = "c", NGSTechnology = "c",
149
-            ConverrtedFilesDir = "c", ConverrtedFilesName = "c",
150
-            SourceFileFolder = "c", SourceFileNameR1 = "c",
151
-            SourceFileNameR2 = "c", DNAnumber = "c", LinearPCRID = "c",
152
-            `1stExpoPCRID` = "c", `2ndExpoID` = "c",
153
-            FusionPrimerPCRID = "c", Genome = "c", Genotype = "c",
154
-            Notes = "c", AddedField1 = "c",
155
-            AddedField2 = "c", AddedField3 = "c", AddedField4 = "c",
156
-            concatenatePoolIDSeqRun = "c", CompleteAmplificationID = "c",
157
-            UniqueID = "c", StudyTestID = "c", Tigroup = "c", Tisource = "c",
158
-            PathToFolderProjectID = "c", SamplesNameCheck = "c",
159
-            DNAextractionDate = "c",
160
-            LinearPCRDate = "c",
161
-            SonicationDate = "c",
162
-            LigationDate = "c",
163
-            FusionPrimerPCRDate = "c",
164
-            PoolDate = "c",
165
-            SequencingDate = "c",
166
-            MOI = "c", AddedField6_RelativeBloodPercentage = "c",
167
-            DNAngUsed = "d", VCN = "d", Engraftment = "d", Transduction = "d",
168
-            AddedField7_PurityTestFeasibility = "d",
169
-            AddedField8_FacsSeparationPurity = "d", Kapa = "d",
170
-            ulForPool = "d", TimepointMonths = "d", TimepointYears = "d",
171
-            `ng DNA corrected` = "d",
172
-            ReplicateNumber = "i", SequencingRound = "i", TestGroup = "c",
173
-            MouseID = "i", TimepointDays = "i",
174
-            `1stExpoPCRDate` = "c",
175
-            `2ndExpoDate` = "c",
176
-            StudyTestGroup = "i"
177
-        )
178
-        return(types)
179
-    }
212
+  specs <- association_file_columns(include_types = TRUE)
213
+  specs_mappings <- specs %>%
214
+    dplyr::left_join(.types_mapping(), by = "types")
215
+  if (mode == "fread") {
216
+    specs_mappings <- specs_mappings %>%
217
+      dplyr::select(.data$names, .data$fread) %>%
218
+      dplyr::group_by(.data$fread)
219
+    types <- specs_mappings %>%
220
+      dplyr::group_keys() %>%
221
+      dplyr::pull(.data$fread)
222
+    specs_mappings <- specs_mappings %>%
223
+      dplyr::group_split(.keep = FALSE)
224
+    names(specs_mappings) <- types
225
+    types <- purrr::map(specs_mappings, ~ .x$names)
226
+    return(types)
227
+  }
228
+  types <- as.list(setNames(specs_mappings$mapping, specs_mappings$names))
180 229
 }
181 230
 
231
+
182 232
 # Internal: used for file system alignment in import_association_file,
183 233
 # gives the names of the columns that respectively contain:
184 234
 # - the absolute path on disk of the project
... ...
@@ -188,6 +238,11 @@ association_file_columns <- function() {
188 238
     list(project = "Path", quant = "Path_quant", iss = "Path_iss")
189 239
 }
190 240
 
241
+
242
+.default_matrix_suffixes <- function() {
243
+
244
+}
245
+
191 246
 .matrix_annotated_suffixes <- function() {
192 247
     c(".no0.annotated")
193 248
 }
... ...
@@ -277,3 +332,22 @@ refGene_table_cols <- function() {
277 332
         "max_cdsEnd", "minmax_CdsLen", "average_CdsLen"
278 333
     )
279 334
 }
335
+
336
+
337
+available_column_tags <- function() {
338
+    list(
339
+        critical = list(af = c(
340
+            "project_id", "pool_id", "tag_seq", "subject", "tissue",
341
+            "cell_marker", "pcr_replicate", "vispa_concatenate",
342
+            "pcr_repl_id", "proj_folder"
343
+        ),
344
+        matrix = c("chromosome", "locus", "is_strand", "gene_symbol"),
345
+        stats = c("vispa_concatenate", "tag_seq")),
346
+        optional = list(af = c(
347
+            "tp_days", "pcr_method", "ngs_tech", "dna_num",
348
+            "vcn", "genome"
349
+        ),
350
+        matrix = c("gene_strand"),
351
+        stats = c())
352
+    )
353
+}
... ...
@@ -5,18 +5,24 @@
5 5
 #'
6 6
 #' @description \lifecycle{stable}
7 7
 #' This function allows to read and import an integration matrix
8
-#' produced as the output of Vispa2 pipeline and converts it to a tidy
8
+#' (ideally produced by VISPA2) and converts it to a tidy
9 9
 #' format.
10 10
 #'
11 11
 #' @param path The path to the file on disk
12
-#' @param to_exclude Either NULL or a character vector of column names that
12
+#' @param to_exclude Either `NULL` or a character vector of column names that
13 13
 #' should be ignored when importing
14 14
 #' @param keep_excluded Keep the columns in `to_exclude` as additional
15
-#' id columns?
15
+#' columns? (also as a vector)
16 16
 #' @param separator The column delimiter used, defaults to `\t`
17 17
 #'
18
+#' @section Required vars and tags:
19
+#' ## Required vars
20
+#' TODO
21
+#'
18 22
 #' @return A data.table object in tidy format
23
+#'
19 24
 #' @family Import functions
25
+#'
20 26
 #' @importFrom rlang abort inform
21 27
 #' @importFrom fs path_ext
22 28
 #' @importFrom readr read_delim cols
... ...
@@ -45,12 +51,15 @@
45 51
 #' matrix <- import_single_Vispa2Matrix(matrix_path)
46 52
 #' head(matrix)
47 53
 import_single_Vispa2Matrix <- function(path,
48
-    to_exclude = NULL,
49
-    keep_excluded = FALSE,
50
-    separator = "\t") {
54
+    separator = "\t",
55
+    additional_cols = NULL,
56
+    transformations = NULL,
57
+    sample_names_to = pcr_id_column(),
58
+    values_to = "Value",
59
+    to_exclude = lifecycle::deprecated(),
60
+    keep_excluded = lifecycle::deprecated()
61
+    ) {
51 62
     stopifnot(!missing(path) & is.character(path))
52
-    stopifnot(is.null(to_exclude) || is.character(to_exclude))
53
-    stopifnot(is.logical(keep_excluded))
54 63
     stopifnot(is.character(separator))
55 64
     if (!file.exists(path)) {
56 65
         not_found_msg <- paste("File not found at", path)
... ...
@@ -59,150 +68,42 @@ import_single_Vispa2Matrix <- function(path,
59 68
     if (!fs::is_file(path)) {
60 69
         rlang::abort("Path exists but is not a file")
61 70
     }
62
-    mode <- "fread"
63
-    ## Is the file compressed?
64
-    is_compressed <- fs::path_ext(path) %in% .compressed_formats()
65
-    if (is_compressed) {
66
-        ## The compression type is supported by data.table::fread?
67
-        compression_type <- fs::path_ext(path)
68
-        if (!compression_type %in% .supported_fread_compression_formats()) {
69
-            ### If not, switch to classic for reading
70
-            mode <- "classic"
71
-            if (getOption("ISAnalytics.verbose") == TRUE) {
72
-                rlang::inform(.unsupported_comp_format_inf(),
73
-                    class = "unsup_comp_format"
74
-                )
75
-            }
76
-        }
71
+    stopifnot(is.null(transformations) ||
72
+                (is.list(transformations) && !is.null(names(transformations))))
73
+    stopifnot(is.character(sample_names_to))
74
+    sample_names_to <- sample_names_to[1]
75
+    stopifnot(is.character(values_to))
76
+    values_to <- values_to[1]
77
+    deprecation_details <- paste("Arguments 'to_exclude' and 'keep_excluded'",
78
+                                 "are deprecated in favor of a single argument",
79
+                                 "which allows a more refined tuning. See",
80
+                                 "`?import_single_Vispa2Matrix` for details")
81
+    if (lifecycle::is_present(to_exclude)) {
82
+      lifecycle::deprecate_warn(
83
+        when = "1.5.4",
84
+        what = "import_single_Vispa2Matrix(to_exclude)",
85
+        with = "import_single_Vispa2Matrix(additional_cols)",
86
+        details = deprecation_details
87
+      )
88
+      return(NULL)
77 89
     }
78
-    ### Peak headers
79
-    peek_headers <- readr::read_delim(path,
80
-        delim = separator, n_max = 0,
81
-        col_types = readr::cols()
82
-    )
83
-    ## - Detect type
84
-    df_type <- .auto_detect_type(peek_headers)
85
-    if (df_type == "MALFORMED") {
86
-        rlang::abort(.malformed_ISmatrix_error(),
87
-            class = "malformed_ism"
88
-        )
89
-    }
90
-    is_annotated <- .is_annotated(peek_headers)
91
-    ## - Start reading
92
-    if (getOption("ISAnalytics.verbose") == TRUE) {
93
-        rlang::inform(c("Reading file...", i = paste0("Mode: ", mode)))
90
+    if (lifecycle::is_present(keep_excluded)) {
91
+      lifecycle::deprecate_warn(
92
+        when = "1.5.4",
93
+        what = "import_single_Vispa2Matrix(keep_excluded)",
94
+        with = "import_single_Vispa2Matrix(additional_cols)",
95
+        details = deprecation_details
96
+      )
97
+      return(NULL)
94 98
     }
95
-    df <- if (mode == "fread") {
96
-        if (!keep_excluded) {
97
-            .read_with_fread(
98
-                path = path, to_drop = to_exclude,
99
-                df_type = df_type, annotated = is_annotated,
100
-                sep = separator
101
-            )
102
-        } else {
103
-            .read_with_fread(
104
-                path = path, to_drop = NULL,
105
-                df_type = df_type, annotated = is_annotated,
106
-                sep = separator
107
-            )
108
-        }
109
-    } else {
110
-        if (!keep_excluded) {
111
-            .read_with_readr(
112
-                path = path, to_drop = to_exclude,
113
-                df_type = df_type, annotated = is_annotated,
114
-                sep = separator
115
-            )
116
-        } else {
117
-            .read_with_readr(
118
-                path = path, to_drop = NULL,
119
-                df_type = df_type, annotated = is_annotated,
120
-                sep = separator
121
-            )
122
-        }
123
-    }
124
-    ## - Report summary
125
-    if (getOption("ISAnalytics.verbose") == TRUE) {
126
-        rlang::inform(.summary_ism_import_msg(
127
-            df_type,
128
-            .is_annotated(df),
129
-            dim(df),
130
-            mode
131
-        ),
132
-        class = "ism_import_summary"
133
-        )
134
-    }
135
-    if (df_type == "OLD") {
136
-        df <- df %>%
137
-            tidyr::separate(
138
-                col = .data$IS_genomicID,
139
-                into = mandatory_IS_vars(),
140
-                sep = "_", remove = TRUE,
141
-                convert = TRUE
142
-            ) %>%
143
-            dplyr::mutate(chr = stringr::str_replace(
144
-                .data$chr, "chr", ""
145
-            ))
146
-    }
147
-    ## - Split in chunks
148
-    if (getOption("ISAnalytics.verbose") == TRUE) {
149
-        rlang::inform("Reshaping...")
150
-    }
151
-    chunks <- split(df,
152
-        by = c("chr"),
153
-        verbose = FALSE
154
-    )
155
-    ## - Melt in parallel
156
-    p <- if (.Platform$OS.type == "windows") {
157
-        BiocParallel::SnowParam(
158
-            tasks = length(chunks),
159
-            progressbar = getOption("ISAnalytics.verbose"),
160
-            exportglobals = TRUE,
161
-            stop.on.error = TRUE
162
-        )
163
-    } else {
164
-        BiocParallel::MulticoreParam(
165
-            tasks = length(chunks),
166
-            progressbar = getOption("ISAnalytics.verbose"),
167
-            exportglobals = FALSE,
168
-            stop.on.error = TRUE
169
-        )
170
-    }
171
-    mt <- function(data, annot) {
172
-        id_vars <- if (annot) {
173
-            c(
174
-                mandatory_IS_vars(),
175
-                annotation_IS_vars()
176
-            )
177
-        } else {
178
-            mandatory_IS_vars()
179
-        }
180
-        if (!is.null(to_exclude)) {
181
-            if (length(to_exclude) > 0 && keep_excluded) {
182
-                id_vars <- c(id_vars, to_exclude)
183
-            }
184
-        }
185
-        data.table::melt.data.table(data,
186
-            id.vars = id_vars,
187
-            variable.name = "CompleteAmplificationID",
188
-            value.name = "Value",
189
-            na.rm = TRUE,
190
-            verbose = FALSE
191
-        )
192
-    }
193
-    tidy_chunks <- BiocParallel::bplapply(
194
-        X = chunks,
195
-        FUN = mt,
196
-        annot = is_annotated,
197
-        BPPARAM = p
198
-    )
199
-    BiocParallel::bpstop(p)
200
-    tidy <- data.table::rbindlist(tidy_chunks)
201
-    tidy <- tidy["Value" > 0]
202
-    if (getOption("ISAnalytics.verbose") == TRUE) {
203
-        rlang::inform("Done!")
204
-    }
205
-    return(tidy)
99
+
100
+    tidy_df <- .import_single_matrix(path = path, separator = separator,
101
+                          additional_cols = additional_cols,
102
+                          transformations = transformations,
103
+                          call_mode = "EXTERNAL",
104
+                          id_col_name = sample_names_to,
105
+                          val_col_name = values_to)
106
+    return(tidy_df)
206 107
 }
207 108
 
208 109
 
... ...
@@ -256,41 +157,122 @@ import_single_Vispa2Matrix <- function(path,
256 157
 #' @examples
257 158
 #' fs_path <- system.file("extdata", "fs.zip", package = "ISAnalytics")
258 159
 #' fs <- unzip_file_system(fs_path, "fs")
259
-#' af_path <- system.file("extdata", "asso.file.tsv.gz", package = "ISAnalytics")
160
+#' af_path <- system.file("extdata", "asso.file.tsv.gz",
161
+#'     package = "ISAnalytics"
162
+#' )
260 163
 #' af <- import_association_file(af_path, root = fs, report_path = NULL)
261 164
 #' head(af)
262 165
 import_association_file <- function(path,
263 166
     root = NULL,
264
-    tp_padding = 4,
265 167
     dates_format = "ymd",
266 168
     separator = "\t",
267 169
     filter_for = NULL,
268 170
     import_iss = FALSE,
269 171
     convert_tp = TRUE,
270 172
     report_path = default_report_path(),
173
+    transformations = default_af_transform(convert_tp),
174
+    tp_padding = lifecycle::deprecated(),
271 175
     ...) {
272 176
     # Check parameters
273
-    stopifnot(is.character(path) & length(path) == 1)
177
+    stopifnot(is.character(path))
178
+    path <- path[1]
274 179
     stopifnot((is.character(root) & length(root) == 1) || (is.null(root)))
275 180
     stopifnot(file.exists(path))
276 181
     if (!is.null(root) && root != "") {
277 182
         stopifnot(file.exists(root))
278 183
     }
279
-    stopifnot((is.numeric(tp_padding) |
280
-        is.integer(tp_padding)) & length(tp_padding) == 1)
281 184
     stopifnot(length(dates_format) == 1 & dates_format %in% date_formats())
282
-    stopifnot(is.character(separator) && length(separator) == 1)
283
-    stopifnot(is.logical(import_iss) && length(import_iss) == 1)
185
+    stopifnot(is.character(separator))
186
+    separator <- separator[1]
187
+    stopifnot(is.logical(import_iss))
188
+    import_iss <- import_iss[1]
284 189
     if (import_iss & is.null(root)) {
285 190
         rlang::abort(.no_stats_import_err())
286 191
     }
192
+    stopifnot(is.null(transformations) ||
193
+        (is.list(transformations) && !is.null(names(transformations))))
194
+    if (lifecycle::is_present(tp_padding)) {
195
+        lifecycle::deprecate_warn(
196
+            when = "1.5.4",
197
+            what = "import_association_file(tp_padding)",
198
+            details = c(paste(
199
+                "The argument is now deprecated in favor of custom",
200
+                "column transformations"
201
+            ),
202
+            i = paste(
203
+                "See the documentation of `transform_columns`",
204
+                "or browse the package vignettes for more details"
205
+            )
206
+            )
207
+        )
208
+    }
287 209
     # Check filter
288 210
     stopifnot(is.null(filter_for) ||
289 211
         (is.list(filter_for) && !is.null(names(filter_for))))
212
+
213
+    # Check presence of required tags
214
+    required_tags <- list()
215
+    req_tags_politic <- c()
216
+    if (!is.null(root)) {
217
+        ### Tags required for file system alignment
218
+        required_tags <- append(
219
+            required_tags,
220
+            list(
221
+                "project_id" = "char",
222
+                "proj_folder" = "char",
223
+                "vispa_concatenate" = "char"
224
+            )
225
+        )
226
+        req_tags_politic <- c(req_tags_politic,
227
+            "project_id" = "error",
228
+            "proj_folder" = "error",
229
+            "vispa_concatenate" = "error"
230
+        )
231
+    }
232
+    if (convert_tp) {
233
+        ### tags required for time point conversion
234
+        required_tags <- append(
235
+            required_tags,
236
+            list("tp_days" = c("char", "int", "numeric"))
237
+        )
238
+        req_tags_politic <- c(req_tags_politic,
239
+            "tp_days" = "first"
240
+        )
241
+    }
242
+    tags_to_cols <- if (!purrr::is_empty(required_tags)) {
243
+        .check_required_cols(
244
+            required_tags = required_tags,
245
+            vars_df = association_file_columns(TRUE),
246
+            duplicate_politic = req_tags_politic
247
+        )
248
+    } else {
249
+        NULL
250
+    }
290 251
     # Read file and check the correctness
291 252
     af_checks <- .manage_association_file(
292
-        path, root, tp_padding, dates_format,
293
-        separator, filter_for
253
+        af_path = path,
254
+        root = root,
255
+        format = dates_format,
256
+        delimiter = separator,
257
+        filter = filter_for,
258
+        proj_fold_col = dplyr::if_else(!is.null(tags_to_cols),
259
+            tags_to_cols %>%
260
+                dplyr::filter(.data$tag == "proj_folder") %>%
261
+                dplyr::pull(.data$names),
262
+            NULL
263
+        ),
264
+        concat_pool_col = dplyr::if_else(!is.null(tags_to_cols),
265
+            tags_to_cols %>%
266
+                dplyr::filter(.data$tag == "vispa_concatenate") %>%
267
+                dplyr::pull(.data$names),
268
+            NULL
269
+        ),
270
+        project_id_col = dplyr::if_else(!is.null(tags_to_cols),
271
+            tags_to_cols %>%
272
+                dplyr::filter(.data$tag == "project_id") %>%
273
+                dplyr::pull(.data$names),
274
+            NULL
275
+        )
294 276
     )
295 277
     as_file <- af_checks$af
296 278
     parsing_problems <- af_checks$parsing_probs
... ...
@@ -299,68 +281,35 @@ import_association_file <- function(path,
299 281
     if (nrow(parsing_problems) == 0) {
300 282
         parsing_problems <- NULL
301 283
     }
302
-    if (nrow(date_problems) == 0) {
284
+    if (is.null(date_problems) || nrow(date_problems) == 0) {
303 285
         date_problems <- NULL
304 286
     }
305 287
     col_probs <- list(missing = NULL, non_standard = NULL)
306 288
     if (!.check_af_correctness(as_file)) {
307
-        col_probs[["missing"]] <- association_file_columns()[
308
-            !association_file_columns() %in% colnames(as_file)
289
+        min_required_cols <- association_file_columns(TRUE) %>%
290
+            dplyr::filter(.data$flag == "required") %>%
291
+            dplyr::pull(.data$names)
292
+        col_probs[["missing"]] <- min_required_cols[
293
+            !min_required_cols %in% colnames(as_file)
309 294
         ]
310 295
     }
311 296
     non_standard <- colnames(as_file)[
312 297
         !colnames(as_file) %in% c(
313
-            association_file_columns(), "Path",
314
-            "Path_quant", "Path_iss"
298
+            association_file_columns(), .path_cols_names()
315 299
         )
316 300
     ]
317 301
     if (!purrr::is_empty(non_standard)) {
318 302
         col_probs[["non_standard"]] <- non_standard
319 303
     }
320
-    missing_dates <- purrr::map_lgl(date_columns_coll(), function(date_col) {
321
-        any(is.na(as_file[[date_col]]))
322
-    }) %>% purrr::set_names(date_columns_coll())
323
-    missing_dates <- names(missing_dates)[missing_dates == TRUE]
324
-    if (length(missing_dates) == 0) {
325
-        missing_dates <- NULL
326
-    }
327 304
     ## Fix timepoints
328 305
     if (convert_tp) {
329
-        if (!"TimepointMonths" %in% colnames(as_file)) {
330
-            as_file <- as_file %>%
331
-                tibble::add_column(TimepointMonths = NA_real_)
332
-        }
333
-        if (!"TimepointYears" %in% colnames(as_file)) {
334
-            as_file <- as_file %>%
335
-                tibble::add_column(TimepointYears = NA_real_)
336
-        }
306
+        tp_col <- tags_to_cols %>%
307
+            dplyr::filter(.data$tag == "tp_days") %>%
308
+            dplyr::pull(.data$names)
337 309
         as_file <- as_file %>%
338 310
             dplyr::mutate(
339
-                TimepointMonths = dplyr::if_else(
340
-                    condition = as.numeric(.data$TimePoint) == 0,
341
-                    true = 0,
342
-                    false = dplyr::if_else(
343
-                        condition = as.numeric(.data$TimePoint) > 0 &
344
-                            as.numeric(.data$TimePoint) < 30,
345
-                        true = ceiling(as.numeric(.data$TimePoint) / 30),
346
-                        false = round(as.numeric(.data$TimePoint) / 30)
347
-                    )
348
-                ),
349
-                TimepointYears = dplyr::if_else(
350
-                    condition = as.numeric(.data$TimePoint) == 0,
351
-                    true = 0,
352
-                    false = ceiling(as.numeric(.data$TimePoint) / 360)
353
-                )
354
-            ) %>%
355
-            dplyr::mutate(
356
-                TimepointMonths = stringr::str_pad(
357
-                    as.character(.data$TimepointMonths),
358
-                    pad = "0", side = "left", width = 2
359
-                ),
360
-                TimepointYears = stringr::str_pad(
361
-                    as.character(.data$TimepointYears),
362
-                    pad = "0", side = "left", width = 2
363
-                ),
311
+                TimepointMonths = .timepoint_to_months(.data[[tp_col]]),
312
+                TimepointYears = .timepoint_to_years(.data[[tp_col]])
364 313
             )
365 314
     }
366 315
     import_stats_rep <- NULL
... ...
@@ -406,6 +355,29 @@ import_association_file <- function(path,
406 355
             }
407 356
         }
408 357
     }
358
+
359
+    if (!is.null(transformations)) {
360
+        as_file <- transform_columns(as_file, transformations)
361
+    }
362
+
363
+    crit_colnames <- association_file_columns(TRUE) %>%
364
+        dplyr::filter(.data$tag %in% available_column_tags()$critical$af) %>%
365
+        dplyr::pull(.data$names)
366
+    crit_colnames <- colnames(as_file)[colnames(as_file) %in% crit_colnames]
367
+    crit_nas <- if (length(crit_colnames) > 0) {
368
+        nas_crit <- purrr::map_lgl(crit_colnames, ~ {
369
+            any(is.na(as_file[[.x]]))
370
+        }) %>%
371
+            purrr::set_names(crit_colnames)
372
+        nas_crit <- names(purrr::keep(nas_crit, ~ .x == TRUE))
373
+        if (length(nas_crit) == 0) {
374
+            NULL
375
+        } else {
376
+            nas_crit
377
+        }
378
+    } else {
379
+        NULL
380
+    }
409 381
     withCallingHandlers(
410 382
         {
411 383
             .produce_report("asso_file",
... ...
@@ -413,7 +385,7 @@ import_association_file <- function(path,
413 385
                     parsing_prob = parsing_problems,
414 386
                     dates_prob = date_problems,
415 387
                     col_prob = col_probs,
416
-                    crit_nas = missing_dates,
388
+                    crit_nas = crit_nas,
417 389
                     fs_align = checks,
418 390
                     iss_stats = import_stats_rep,
419 391
                     iss_stats_miss = missing_stats_rep
... ...
@@ -430,7 +402,7 @@ import_association_file <- function(path,
430 402
         summary_report <- .summary_af_import_msg(
431 403
             pars_prob = parsing_problems, dates_prob = date_problems,
432 404
             cols_prob = col_probs[[!is.null(col_probs)]],
433
-            crit_na = missing_dates,
405
+            crit_na = crit_nas,
434 406
             checks = ifelse(is.null(checks),
435 407
                 yes = "skipped",
436 408
                 no = ifelse(any(!checks$Found),
... ...
@@ -503,13 +475,22 @@ import_Vispa2_stats <- function(association_file,
503 475
     if (!path_cols$iss %in% colnames(association_file)) {
504 476
         rlang::abort(.af_not_aligned_err())
505 477
     }
506
-    min_cols <- c(
507
-        "ProjectID",
508
-        "CompleteAmplificationID",
509
-        pool_col,
510
-        path_cols$iss,
511
-        "TagSequence"
478
+    required_tags <- list(
479
+        "project_id" = "char",
480
+        "tag_seq" = "char",
481
+        "pcr_repl_id" = "char"
482
+    )
483
+    tag_politics <- list(
484
+        "project_id" = "error",
485
+        "tag_seq" = "error",
486
+        "pcr_repl_id" = "error"
487
+    )
488
+    tags_to_cols <- .check_required_cols(
489
+        required_tags = required_tags,
490
+        vars_df = association_file_columns(TRUE),
491
+        duplicate_politic = tag_politics
512 492
     )
493
+    min_cols <- c(tags_to_cols$names, pool_col, path_cols$iss)
513 494
     if (!all(min_cols %in% colnames(association_file))) {
514 495
         rlang::abort(
515 496
             .missing_needed_cols(
... ...
@@ -527,7 +508,10 @@ import_Vispa2_stats <- function(association_file,
527 508
     ## Import
528 509
     stats <- .import_stats_iss(
529 510
         association_file = association_file,
530
-        prefixes = file_prefixes
511
+        prefixes = file_prefixes,
512
+        pool_col = pool_col,
513
+        path_iss_col = path_cols$iss,
514
+        tags = tags_to_cols
531 515
     )
532 516
     report <- stats$report
533 517
     stats <- stats$stats
... ...
@@ -536,7 +520,7 @@ import_Vispa2_stats <- function(association_file,
536 520
         if (getOption("ISAnalytics.verbose") == TRUE) {
537 521
             rlang::inform(.no_stat_files_imported())
538 522
         }
539
-        if (report_path == "INTERNAL") {
523
+        if (!is.null(report_path) && report_path == "INTERNAL") {
540 524
             ## If function was called from import_association_file
541 525
             return(stats = association_file, report = report)
542 526
         }
... ...
@@ -564,32 +548,77 @@ import_Vispa2_stats <- function(association_file,
564 548
     ## - IF STATS NOT NULL
565 549
     ## Merge if requested
566 550
     if (join_with_af) {
551
+        required_tags_for_join <- list(
552
+            "tag_seq" = "char",
553
+            "vispa_concatenate" = "char"
554
+        )
555
+        iss_tags_to_cols <- .check_required_cols(required_tags_for_join,
556
+            iss_stats_specs(TRUE),
557
+            duplicate_politic = "error"
558
+        )
559
+        if (any(!iss_tags_to_cols$names %in% colnames(stats))) {
560
+            msg <- c("Error in joining VISPA2 stats with AF - skipping",
561
+                .missing_needed_cols(iss_tags_to_cols$names[
562
+                    !iss_tags_to_cols$names %in% colnames(stats)
563
+                ]),
564
+                i = paste(
565
+                    "Needed columns for join were not found in",
566
+                    "imported stats. Check your iss stats specs",
567
+                    "with `iss_stats_specs(TRUE)` and check the files",
568
+                    "are not malformed."
569
+                ),
570
+                i = paste("Returning imported data only")
571
+            )
572
+            rlang::inform(msg, class = "iss_join_missing")
573
+            return(stats)
574
+        }
575
+        iss_pool_col <- iss_tags_to_cols %>%
576
+            dplyr::filter(.data$tag == "vispa_concatenate") %>%
577
+            dplyr::pull(.data$names)
578
+        iss_tag_col <- iss_tags_to_cols %>%
579
+            dplyr::filter(.data$tag == "tag_seq") %>%
580
+            dplyr::pull(.data$names)
567 581
         association_file <- association_file %>%
568 582
             dplyr::left_join(stats, by = c(
569
-                stats::setNames("POOL", pool_col),
570
-                "TagSequence" = "TAG"
583
+                stats::setNames(iss_pool_col, pool_col),
584
+                stats::setNames(iss_tag_col, tags_to_cols %>%
585
+                    dplyr::filter(.data$tag == "tag_seq") %>%
586
+                    dplyr::pull(.data$names))
571 587
             ))
572 588
         ## Detect potential problems
573
-        addit_columns <- c("SubjectID", "CellMarker", "Tissue", "TimePoint")
574
-        addit_columns <- addit_columns[addit_columns %in%
589
+        addit_columns <- association_file_columns(TRUE) %>%
590
+            dplyr::filter(.data$tag %in% c(
591
+                "subject",
592
+                "tissue",
593
+                "cell_marker",
594
+                "tp_days"
595
+            ))
596
+        addit_columns_names <- addit_columns %>%
597
+            dplyr::pull(.data$names)
598
+        addit_columns_names <- addit_columns_names[addit_columns_names %in%
575 599
             colnames(association_file)]
600
+        iss_cols_in_af <- colnames(association_file)[
601
+            colnames(association_file) %in% iss_stats_specs()
602
+        ]
603
+
576 604
         missing_stats <- association_file %>%
577
-            dplyr::filter(is.na(.data$RUN_NAME)) %>%
578
-            dplyr::select(
579
-                .data$ProjectID,
580
-                dplyr::all_of(pool_col),
581
-                .data$CompleteAmplificationID,
582
-                .data$TagSequence,
583
-                dplyr::all_of(addit_columns)
584
-            ) %>%
605
+            dplyr::filter(dplyr::if_all(dplyr::all_of(iss_cols_in_af), is.na)) %>%
606
+            dplyr::select(dplyr::all_of(c(
607
+                tags_to_cols$names,
608
+                pool_col,
609
+                addit_columns_names
610
+            ))) %>%
585 611
             dplyr::distinct()
586
-        if (report_path == "INTERNAL") {
612
+        all_af_tags <- tags_to_cols %>%
613
+            dplyr::bind_rows(addit_columns)
614
+        if (!is.null(report_path) && report_path == "INTERNAL") {
587 615
             ## If function was called from import_association_file
588 616
             return(list(
589 617
                 stats = association_file,
590 618
                 report = list(
591 619
                     import = report,
592
-                    miss = missing_stats
620
+                    miss = missing_stats,
621
+                    af_tag_map = all_af_tags
593 622
                 )
594 623
             ))
595 624
         }
... ...
@@ -662,7 +691,8 @@ import_Vispa2_stats <- function(association_file,
662 691
 #' INTERACTIVE, the function will ask for input from the user on console,
663 692
 #' otherwise the process is fully automated (with limitations, see vignette).
664 693
 #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Additional named arguments
665
-#' to pass to `ìmport_association_file` and `comparison_matrix`
694
+#' to pass to `ìmport_association_file`, `comparison_matrix` and
695
+#' `import_single_Vispa2_matrix`
666 696
 #'
667 697
 #' @importFrom rlang fn_fmls_names dots_list arg_match inform abort
668 698
 #' @importFrom rlang eval_tidy call2
... ...
@@ -689,8 +719,8 @@ import_Vispa2_stats <- function(association_file,
689 719
 #' )
690 720
 #' head(matrices)
691 721
 import_parallel_Vispa2Matrices <- function(association_file,
692
-    quantification_type,
693
-    matrix_type = "annotated",
722
+    quantification_type = c("seqCount", "fragmentEstimate"),
723
+    matrix_type = c("annotated", "not_annotated"),
694 724
     workers = 2,
695 725
     multi_quant_matrix = TRUE,
696 726
     report_path = default_report_path(),
... ...
@@ -702,14 +732,15 @@ import_parallel_Vispa2Matrices <- function(association_file,
702 732
         association_file, quantification_type, matrix_type,
703 733
         workers, multi_quant_matrix
704 734
     )
705
-    rlang::arg_match(mode)
735
+    matrix_type <- rlang::arg_match(matrix_type)
736
+    mode <- rlang::arg_match(mode)
706 737
     ## Collect dot args
707 738
     if (is.character(association_file) || isTRUE(multi_quant_matrix)) {
708 739
         dots_args <- rlang::dots_list(..., .named = TRUE, .homonyms = "first")
709 740
         if (is.character(association_file)) {
710 741
             import_af_arg_names <- rlang::fn_fmls_names(import_association_file)
711 742
             import_af_arg_names <- import_af_arg_names[
712
-                import_af_arg_names != "path"
743
+                !import_af_arg_names %in% c("path", "report_path")
713 744
             ]
714 745
             import_af_args <- dots_args[names(dots_args) %in%
715 746
                 import_af_arg_names]
... ...
@@ -721,27 +752,51 @@ import_parallel_Vispa2Matrices <- function(association_file,
721 752
                 mult_arg_names]
722 753
         }
723 754
     }
724
-    association_file <- .pre_manage_af(association_file, import_af_args)
755
+    import_matrix_arg_names <- rlang::fn_fmls_names(
756
+      import_single_Vispa2Matrix)
757
+    import_matrix_arg_names <- import_matrix_arg_names[
758
+      !import_matrix_arg_names %in% c("path", "to_exclude",
759
+                                      "keep_excluded")]
760
+    import_matrix_args <- dots_args[names(dots_args) %in%
761
+                                      import_matrix_arg_names]
762
+    association_file <- .pre_manage_af(
763
+        association_file,
764
+        import_af_args,
765
+        report_path
766
+    )
725 767
     if (nrow(association_file) == 0) {
726 768
         rlang::inform(.af_empty_msg())
727 769
         return(NULL)
728 770
     }
729 771
     ## Workflows
772
+    af_tags <- association_file_columns(TRUE)
773
+    proj_col <- af_tags %>%
774
+        dplyr::filter(.data$tag == "project_id") %>%
775
+        dplyr::pull(.data$names)
776
+    pool_col <- af_tags %>%
777
+        dplyr::filter(.data$tag == "vispa_concatenate") %>%
778
+        dplyr::pull(.data$names)
730 779
     ### --- Interactive
731 780
     if (mode == "INTERACTIVE") {
732 781
         ## User selects projects to keep
733 782
         association_file <- .interactive_select_projects_import(
734
-            association_file
783
+            association_file,
784
+            proj_col = proj_col
735 785
         )
736 786
         ## User selects pools to keep
737
-        association_file <- .interactive_select_pools_import(association_file)
787
+        association_file <- .interactive_select_pools_import(association_file,
788
+            proj_col = proj_col,
789
+            pool_col = pool_col
790
+        )
738 791
         ## Scan the appropriate file system paths and look for files
739 792
         files_found <- .lookup_matrices(
740 793
             association_file, quantification_type,
741
-            matrix_type
794
+            matrix_type, proj_col, pool_col
742 795
         )
743 796
         ## Manage missing files and duplicates
744
-        files_to_import <- .manage_anomalies_interactive(files_found)
797
+        files_to_import <- .manage_anomalies_interactive(files_found,
798
+                                                         proj_col,
799
+                                                         pool_col)
745 800
     } else {
746 801
         ### --- Auto
747 802
         ## In automatic workflow all projects and pools contained
... ...
@@ -757,27 +812,30 @@ import_parallel_Vispa2Matrices <- function(association_file,
757 812
             stopifnot(is.character(patterns))
758 813
         }
759 814
         ### Evaluate matching_opt
760
-        matching_option <- match.arg(matching_opt)
815
+        matching_option <- rlang::arg_match(matching_opt)
761 816
         stopifnot(is.character(matching_option))
762 817
         ## Scan the appropriate file system paths and look for files
763 818
         files_found <- .lookup_matrices_auto(
764 819
             association_file, quantification_type,
765
-            matrix_type, patterns, matching_option
820
+            matrix_type, patterns, matching_option,
821
+            proj_col, pool_col
766 822
         )
767 823
         ## Manage missing files and duplicates
768
-        files_to_import <- .manage_anomalies_auto(files_found)
824
+        files_to_import <- .manage_anomalies_auto(files_found,
825
+                                                  proj_col, pool_col)
769 826
     }
770 827
     ## If files to import are 0 just terminate
771 828
     if (nrow(files_to_import) == 0) {
772 829
         rlang::abort("No files to import")
773 830
     }
774 831
     ## Import
775
-    matrices <- .parallel_import_merge(files_to_import, workers)
776
-    fimported <- matrices[[2]]
832
+    matrices <- .parallel_import_merge(files_to_import, workers,
833
+                                       import_matrix_args)
834
+    fimported <- matrices$summary
777 835
     if (nrow(fimported) == 0) {
778 836
         fimported <- NULL
779 837
     }
780
-    matrices <- matrices[[1]]
838
+    matrices <- matrices$matrix
781 839
     if (multi_quant_matrix == TRUE) {
782 840
         matrices <- rlang::eval_tidy(rlang::call2(comparison_matrix,
783 841
             x = matrices,
... ...
@@ -795,10 +853,17 @@ import_parallel_Vispa2Matrices <- function(association_file,
795 853
     } else {
796 854
         NULL
797 855
     }
856
+    launch_params <- list()
857
+    if (!is.null(patterns)) {
858
+      launch_params[["patterns"]] <- patterns
859
+      launch_params[["matching_opt"]] <- matching_option
860
+    }
798 861
     withCallingHandlers(
799 862
         {
800 863
             .produce_report("matrix_imp",
801 864
                 params = list(
865
+                  launch_params = launch_params,
866
+                  set_vars = list(proj_col = proj_col, pool_col = pool_col),
802 867
                     files_found = files_found,
803 868
                     files_imp = fimported,
804 869
                     annot_prob = annotation_problems
... ...
@@ -1043,7 +1108,11 @@ matching_options <- function() {
1043 1108
 #' @examples
1044 1109
 #' date_formats()
1045 1110
 date_formats <- function() {
1046
-    c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "yq")
1111
+    c(
1112
+        "ymd", "ydm", "mdy", "myd", "dmy", "dym", "yq", "ym", "my",
1113
+        "ymd_hms", "ymd_hm", "ymd_h", "dmy_hms", "dmy_hm",
1114
+        "dmy_h", "mdy_hms", "mdy_hm", "mdy_h", "ydm_hms", "ydm_hm", "ydm_h"
1115
+    )
1047 1116
 }
1048 1117
 
1049 1118
 
... ...
@@ -1059,3 +1128,18 @@ date_formats <- function() {
1059 1128
 default_iss_file_prefixes <- function() {
1060 1129
     c("stats\\.sequence.", "stats\\.matrix.")
1061 1130
 }
1131
+
1132
+default_af_transform <- function(convert_tp) {
1133
+    if (convert_tp) {
1134
+        return(list(
1135
+            TimepointMonths = ~ stringr::str_pad(
1136
+                as.character(.x),
1137
+                pad = "0", side = "left", width = 2
1138
+            ),
1139
+            TimepointYears = ~ stringr::str_pad(as.character(.x),
1140
+                pad = "0", side = "left", width = 2
1141
+            )
1142
+        ))
1143
+    }
1144
+    return(NULL)
1145
+}
... ...
@@ -3,7 +3,507 @@
3 3
 #------------------------------------------------------------------------------#
4 4
 ## All functions in this file are NOT exported, to be used internally only.
5 5
 
6
+#### ---- Internals for dynamic variables  ----####
7
+# Internal for setting mandatory or annotation is vars
8
+.new_IS_vars_checks <- function(specs, err) {
9
+    new_vars <- if (is.data.frame(specs)) {
10
+        # if data frame supplied
11
+        colnames_ok <- all(c("names", "types", "transform", "flag", "tag")
12
+        %in% colnames(specs))
13
+        col_types_ok <- if (colnames_ok) {
14
+            all(purrr::map_lgl(
15
+                specs[c("names", "types", "flag", "tag")],
16
+                is.character
17
+            ))
18
+        } else {
19
+            FALSE
20
+        }
21
+        types_ok <- if (colnames_ok & col_types_ok) {
22
+            all(specs$types %in% .types_mapping()[["types"]])
23
+        } else {
24
+            FALSE
25
+        }
26
+        flags_ok <- if (colnames_ok & col_types_ok) {
27
+            all(specs$flag %in% c("required", "optional"))
28
+        } else {
29
+            FALSE
30
+        }
31
+        transform_col_ok <- if (colnames_ok & col_types_ok) {
32
+            is.list(specs$transform) &&
33
+                all(purrr::map_lgl(
34
+                    specs$transform,
35
+                    ~ {
36
+                        is.null(.x) || rlang::is_formula(.x) || is.function(.x)
37
+                    }
38
+                ))
39
+        } else {
40
+            FALSE
41
+        }
42
+        if (!(colnames_ok & col_types_ok & types_ok & transform_col_ok &
43
+            flags_ok)) {
44
+            rlang::abort(err)
45
+        }
46
+        specs
47
+    } else {
48
+        # if vector supplied
49
+        if (is.null(names(specs)) || !all(specs %in% .types_mapping()[["types"]])) {
50
+            rlang::abort(err)
51
+        }
52
+        purrr::map2_dfr(
53
+            names(specs), specs,
54
+            ~ tibble::tibble_row(
55
+                names = .x,
56
+                types = .y,
57
+                transform = list(NULL),
58
+                flag = "required",
59
+                tag = NA_character_
60
+            )
61
+        )
62
+    }
63
+    new_vars
64
+}
65
+
66
+# Applies transformations on columns as specified in variables specs
67
+# expects specs to be in data frame format
68
+.apply_col_transform <- function(df, specs) {
69
+    # Extract and associate names and transf
70
+    non_null <- purrr::pmap(specs, function(names, types,
71
+                                            transform, flags, tag) {
72
+        if (is.null(transform)) {
73
+            return(NULL)
74
+        }
75
+        return(transform)
76
+    }) %>%
77
+        purrr::set_names(specs$names)
78
+    # Retain only non-null transform
79
+    non_null <- non_null[purrr::map_lgl(non_null, ~ !is.null(.x))]
80
+    if (length(non_null) > 0) {
81
+        # if there are transf to apply
82
+        apply_transform <- function(col, col_name) {
83
+            if (!col_name %in% names(non_null)) {
84
+                return(col)
85
+            }
86
+            transformation <- non_null[[col_name]]
87
+            t <- if (rlang::is_formula(transformation)) {
88
+                unlist(purrr::map(col, transformation))
89
+            } else {
90
+                # if it is a function
91
+                do.call(what = transformation, args = list(col))
92
+            }
93
+        }
94
+        df <- purrr::map2_dfc(df, colnames(df), apply_transform)
95
+    }
96
+    df
97
+}
98
+
99
+# Internal to quickly convert columns marked as dates with lubridate
100
+.convert_dates <- function(date_vec, format) {
101
+    if (format %in% date_formats()) {
102
+        parsed <- lubridate::parse_date_time(date_vec, format)
103
+        if (format %in% c(
104
+            "ymd_hms", "ymd_hm", "ymd_h", "dmy_hms", "dmy_hm",
105
+            "dmy_h", "mdy_hms", "mdy_hm", "mdy_h",
106
+            "ydm_hms", "ydm_hm", "ydm_h"
107
+        )) {
108
+            return(parsed)
109
+        } else {
110
+            return(lubridate::as_date(parsed))
111
+        }
112
+    }
113
+    if (format == "date") {
114
+        # Guesses the format
115
+        return(lubridate::as_date(as.character(date_vec)))
116
+    }
117
+}