... | ... |
@@ -47,8 +47,14 @@ export(realign_after_collisions) |
47 | 47 |
export(reduced_AF_columns) |
48 | 48 |
export(refGene_table_cols) |
49 | 49 |
export(remove_collisions) |
50 |
+export(reset_af_columns_def) |
|
51 |
+export(reset_annotation_IS_vars) |
|
52 |
+export(reset_mandatory_IS_vars) |
|
50 | 53 |
export(sample_statistics) |
51 | 54 |
export(separate_quant_matrices) |
55 |
+export(set_af_columns_def) |
|
56 |
+export(set_annotation_IS_vars) |
|
57 |
+export(set_mandatory_IS_vars) |
|
52 | 58 |
export(sharing_heatmap) |
53 | 59 |
export(sharing_venn) |
54 | 60 |
export(threshold_filter) |
... | ... |
@@ -2,183 +2,233 @@ |
2 | 2 |
# Exported/Internal variables |
3 | 3 |
#------------------------------------------------------------------------------# |
4 | 4 |
|
5 |
-#' Names of mandatory variables for an integration matrix. |
|
6 |
-#' |
|
7 |
-#' Contains the names of the columns that need to be present in order for a |
|
8 |
-#' tibble to be considered an integration matrix. |
|
9 |
-#' |
|
10 |
-#' @return A character vector |
|
11 |
-#' @export |
|
12 |
-#' |
|
13 |
-#' @examples |
|
14 |
-#' mandatory_IS_vars() |
|
15 |
-mandatory_IS_vars <- function() { |
|
16 |
- c("chr", "integration_locus", "strand") |
|
5 |
+# Internal: default mandatory IS vars and associated column types. |
|
6 |
+# The combination of these fields defines a unique integration site. |
|
7 |
+.default_mandatory_IS_vars <- function() { |
|
8 |
+ tibble::tribble( |
|
9 |
+ ~ names, ~ types, ~ transform, ~ flag, ~ tag, |
|
10 |
+ "chr", "char", NULL, "required", "chromosome", |
|
11 |
+ "integration_locus", "int", NULL, "required", "locus", |
|
12 |
+ "strand", "char", NULL, "required", "is_strand" |
|
13 |
+ ) |
|
17 | 14 |
} |
18 | 15 |
|
19 |
-# Internal: associates column types with column names for a more precise |
|
20 |
-# import |
|
21 |
-.mandatory_IS_types <- function(mode) { |
|
22 |
- if (mode == "fread") { |
|
23 |
- return(list( |
|
24 |
- character = c("chr", "strand"), |
|
25 |
- integer = "integration_locus" |
|
26 |
- )) |
|
27 |
- } else { |
|
28 |
- return( |
|
29 |
- list( |
|
30 |
- chr = "c", |
|
31 |
- integration_locus = "i", |
|
32 |
- strand = "c" |
|
33 |
- ) |
|
34 |
- ) |
|
35 |
- } |
|
16 |
+# Internal: default genomic annotation IS vars and associated column types. |
|
17 |
+.default_annotation_IS_vars <- function() { |
|
18 |
+ tibble::tribble( |
|
19 |
+ ~ names, ~ types, ~ transform, ~ flag, ~ tag, |
|
20 |
+ "GeneName", "char", NULL, "required", "gene_symbol", |
|
21 |
+ "GeneStrand", "char", NULL, "required", "gene_strand" |
|
22 |
+ ) |
|
36 | 23 |
} |
37 | 24 |
|
38 |
-#' Names of the annotation variables for an integration matrix. |
|
39 |
-#' |
|
40 |
-#' Contains the names of the columns that are present if the integration matrix |
|
41 |
-#' is annotated. |
|
42 |
-#' |
|
43 |
-#' @return A character vector |
|
44 |
-#' @export |
|
45 |
-#' |
|
46 |
-#' @examples |
|
47 |
-#' annotation_IS_vars() |
|
48 |
-annotation_IS_vars <- function() { |
|
49 |
- c("GeneName", "GeneStrand") |
|
25 |
+# Internal: default association file columns and types |
|
26 |
+.default_af_cols <- function() { |
|
27 |
+ tibble::tribble( |
|
28 |
+ ~ names, ~ types, ~ transform, ~ flag, ~ tag, |
|
29 |
+ "ProjectID", "char", NULL, "required", "project_id", |
|
30 |
+ "FUSIONID", "char", NULL, "optional", NA_character_, |
|
31 |
+ "PoolID", "char", NULL, "required", "pool_id", |
|
32 |
+ "TagSequence", "char", NULL, "required", "tag_seq", |
|
33 |
+ "SubjectID", "char", NULL, "required", "subject", |
|
34 |
+ "VectorType", "char", NULL, "optional", NA_character_, |
|
35 |
+ "VectorID", "char", NULL, "required", NA_character_, |
|
36 |
+ "ExperimentID", "char", NULL, "optional", NA_character_, |
|
37 |
+ "Tissue", "char", NULL, "required", "tissue", |
|
38 |
+ "TimePoint", "char", ~ stringr::str_pad(.x, 4, side = "left", pad = "0"), |
|
39 |
+ "required", "tp_days", |
|
40 |
+ "DNAFragmentation", "char", NULL, "optional", NA_character_, |
|
41 |
+ "PCRMethod", "char", NULL, "required", "pcr_method", |
|
42 |
+ "TagIDextended", "char", NULL, "optional", NA_character_, |
|
43 |
+ "Keywords","char", NULL, "optional", NA_character_, |
|
44 |
+ "CellMarker", "char", NULL, "required", "cell_marker", |
|
45 |
+ "TagID", "char", NULL, "required", NA_character_, |
|
46 |
+ "NGSProvider", "char", NULL, "optional", NA_character_, |
|
47 |
+ "NGSTechnology", "char", NULL, "required", "ngs_tech", |
|
48 |
+ "ConverrtedFilesDir", "char", NULL, "optional", NA_character_, |
|
49 |
+ "ConverrtedFilesName", "char", NULL, "optional", NA_character_, |
|
50 |
+ "SourceFileFolder", "char", NULL, "optional", NA_character_, |
|
51 |
+ "SourceFileNameR1", "char", NULL, "optional", NA_character_, |
|
52 |
+ "SourceFileNameR2", "char", NULL, "optional", NA_character_, |
|
53 |
+ "DNAnumber", "char", NULL, "required", "dna_num", |
|
54 |
+ "ReplicateNumber", "int", NULL, "required", "pcr_replicate", |
|
55 |
+ "DNAextractionDate", "date", NULL, "optional", NA_character_, |
|
56 |
+ "DNAngUsed", "numeric", NULL, "required", NA_character_, |
|
57 |
+ "LinearPCRID", "char", NULL, "optional", NA_character_, |
|
58 |
+ "LinearPCRDate", "date", NULL, "optional", NA_character_, |
|
59 |
+ "SonicationDate", "date", NULL, "optional", NA_character_, |
|
60 |
+ "LigationDate", "date", NULL, "optional", NA_character_, |
|
61 |
+ "1stExpoPCRID", "char", NULL, "optional", NA_character_, |
|
62 |
+ "1stExpoPCRDate", "date", NULL, "optional", NA_character_, |
|
63 |
+ "2ndExpoID", "char", NULL, "optional", NA_character_, |
|
64 |
+ "2ndExpoDate", "date", NULL, "optional", NA_character_, |
|
65 |
+ "FusionPrimerPCRID", "char", NULL, "optional", NA_character_, |
|
66 |
+ "FusionPrimerPCRDate", "date", NULL, "optional", NA_character_, |
|
67 |
+ "PoolDate", "date", NULL, "optional", NA_character_, |
|
68 |
+ "SequencingDate", "date", NULL, "required", NA_character_, |
|
69 |
+ "VCN", "numeric", NULL, "required", "vcn", |
|
70 |
+ "Genome", "char", NULL, "required", "genome", |
|
71 |
+ "SequencingRound", "int", NULL, "optional", NA_character_, |
|
72 |
+ "Genotype", "char", NULL, "optional", NA_character_, |
|
73 |
+ "TestGroup", "char", NULL, "optional", NA_character_, |
|
74 |
+ "MOI", "char", NULL, "optional", NA_character_, |
|
75 |
+ "Engraftment", "numeric", NULL, "optional", NA_character_, |
|
76 |
+ "Transduction", "numeric", NULL, "optional", NA_character_, |
|
77 |
+ "Notes", "char", NULL, "optional", NA_character_, |
|
78 |
+ "AddedField1", "char", NULL, "optional", NA_character_, |
|
79 |
+ "AddedField2", "char", NULL, "optional", NA_character_, |
|
80 |
+ "AddedField3", "char", NULL, "optional", NA_character_, |
|
81 |
+ "AddedField4", "char", NULL, "optional", NA_character_, |
|
82 |
+ "concatenatePoolIDSeqRun", "char", NULL,"required", "vispa_concatenate", |
|
83 |
+ "AddedField6_RelativeBloodPercentage", "char", NULL, "optional", |
|
84 |
+ NA_character_, |
|
85 |
+ "AddedField7_PurityTestFeasibility", "char", NULL, "optional", |
|
86 |
+ NA_character_, |
|
87 |
+ "AddedField8_FacsSeparationPurity", "char", NULL, "optional", NA_character_, |
|
88 |
+ "Kapa", "numeric", NULL, "required", NA_character_, |
|
89 |
+ "ulForPool", "numeric", NULL, "required", NA_character_, |
|
90 |
+ "CompleteAmplificationID", "char", NULL, "required", "pcr_repl_id", |
|
91 |
+ "UniqueID", "char", NULL, "required", NA_character_, |
|
92 |
+ "StudyTestID", "char", NULL, "optional", NA_character_, |
|
93 |
+ "StudyTestGroup", "char", NULL, "optional", NA_character_, |
|
94 |
+ "MouseID", "char", NULL, "optional", NA_character_, |
|
95 |
+ "Tigroup", "char", NULL, "optional", NA_character_, |
|
96 |
+ "Tisource", "char", NULL, "optional", NA_character_, |
|
97 |
+ "PathToFolderProjectID", "char", NULL, "required", "proj_folder", |
|
98 |
+ "SamplesNameCheck", "char", NULL, "optional", NA_character_, |
|
99 |
+ "TimepointDays", "char", NULL, "optional", NA_character_, |
|
100 |
+ "TimepointMonths", "char", NULL, "optional", NA_character_, |
|
101 |
+ "TimepointYears", "char", NULL, "optional", NA_character_, |
|
102 |
+ "ng DNA corrected", "numeric", NULL, "optional", NA_character_ |
|
103 |
+ ) |
|
104 |
+} |
|
105 |
+ |
|
106 |
+# Internal: default columns and types of vispa2 stats cols |
|
107 |
+.default_iss_stats_specs <- function() { |
|
108 |
+ tibble::tribble( |
|
109 |
+ ~ names, ~ types, ~ transform, ~ flag, ~ tag, |
|
110 |
+ "RUN_NAME", "char", NULL, "required", NA_character_, |
|
111 |
+ "POOL", "char", NULL, "required", "vispa_concatenate", |
|
112 |
+ "TAG", "char", ~ stringr::str_replace_all(.x, pattern = "\\.", |
|
113 |
+ replacement = ""), "required", |
|
114 |
+ "tag_seq", |
|
115 |
+ "RAW_READS", "int", NULL, "optional", NA_character_, |
|
116 |
+ "QUALITY_PASSED", "int", NULL, "optional", NA_character_, |
|
117 |
+ "PHIX_MAPPING", "int", NULL, "optional", NA_character_, |
|
118 |
+ "PLASMID_MAPPED_BYPOOL", "int", NULL, "optional", NA_character_, |
|
119 |
+ "BARCODE_MUX", "int", NULL, "required", NA_character_, |
|
120 |
+ "LTR_IDENTIFIED", "int", NULL, "optional", NA_character_, |
|
121 |
+ "TRIMMING_FINAL_LTRLC", "int", NULL, "optional", NA_character_, |
|
122 |
+ "LV_MAPPED", "int", NULL, "optional", NA_character_, |
|
123 |
+ "BWA_MAPPED_OVERALL", "int", NULL, "optional", NA_character_, |
|
124 |
+ "ISS_MAPPED_OVERALL", "int", NULL, "optional", NA_character_, |
|
125 |
+ "ISS_MAPPED_PP", "int", NULL, "optional", NA_character_ |
|
126 |
+ ) |
|
127 |
+} |
|
128 |
+ |
|
129 |
+# Mappings between input format and formats requested by external parsing |
|
130 |
+# functions |
|
131 |
+.types_mapping <- function() { |
|
132 |
+ tibble::tribble( |
|
133 |
+ ~ types, ~ mapping, ~ fread, |
|
134 |
+ "char", "c", "character", |
|
135 |
+ "int", "i", "integer", |
|
136 |
+ "logi", "l", "logical", |
|
137 |
+ "numeric", "d", "numeric", |
|
138 |
+ "factor", "f", "factor", |
|
139 |
+ "date", "c", "charcter", |
|
140 |
+ "ymd", "c", "character", |
|
141 |
+ "ydm", "c", "character", |
|
142 |
+ "mdy", "c", "character", |
|
143 |
+ "myd", "c", "character", |
|
144 |
+ "dmy", "c", "character", |
|
145 |
+ "yq", "c", "character", |
|
146 |
+ "ym", "c", "character", |
|
147 |
+ "my", "c", "character", |
|
148 |
+ "ymd_hms", "c", "character", |
|
149 |
+ "ymd_hm", "c", "character", |
|
150 |
+ "ymd_h", "c", "character", |
|
151 |
+ "dmy_hms", "c", "character", |
|
152 |
+ "dmy_hm", "c", "character", |
|
153 |
+ "dmy_h", "c", "character", |
|
154 |
+ "mdy_hms", "c", "character", |
|
155 |
+ "mdy_hm", "c", "character", |
|
156 |
+ "mdy_h", "c", "character", |
|
157 |
+ "ydm_hms", "c", "character", |
|
158 |
+ "ydm_hm", "c", "character", |
|
159 |
+ "ydm_h", "c", "character" |
|
160 |
+ ) |
|
50 | 161 |
} |
51 | 162 |
|
52 | 163 |
# Internal: associates column types with column names for a more precise |
53 | 164 |
# import |
54 |
-.annotation_IS_types <- function(mode) { |
|
55 |
- if (mode == "fread") { |
|
56 |
- return(list(character = c("GeneName", "GeneStrand"))) |
|
57 |
- } else { |
|
58 |
- return(list( |
|
59 |
- GeneName = "c", |
|
60 |
- GeneStrand = "c" |
|
61 |
- )) |
|
62 |
- } |
|
165 |
+.mandatory_IS_types <- function(mode) { |
|
166 |
+ specs <- mandatory_IS_vars(include_types = TRUE) |
|
167 |
+ specs_mappings <- specs %>% |
|
168 |
+ dplyr::left_join(.types_mapping(), by = "types") |
|
169 |
+ if (mode == "fread") { |
|
170 |
+ specs_mappings <- specs_mappings %>% |
|
171 |
+ dplyr::select(.data$names, .data$fread) %>% |
|
172 |
+ dplyr::group_by(.data$fread) |
|
173 |
+ types <- specs_mappings %>% |
|
174 |
+ dplyr::group_keys() %>% |
|
175 |
+ dplyr::pull(.data$fread) |
|
176 |
+ specs_mappings <- specs_mappings %>% |
|
177 |
+ dplyr::group_split(.keep = FALSE) |
|
178 |
+ names(specs_mappings) <- types |
|
179 |
+ types <- purrr::map(specs_mappings, ~ .x$names) |
|
180 |
+ return(types) |
|
181 |
+ } |
|
182 |
+ types <- as.list(setNames(specs_mappings$mapping, specs_mappings$names)) |
|
183 |
+ return(types) |
|
63 | 184 |
} |
64 | 185 |
|
65 |
-#' Names of the columns in the association file. |
|
66 |
-#' |
|
67 |
-#' All the names of the columns present in the association file. |
|
68 |
-#' |
|
69 |
-#' @return A character vector |
|
70 |
-#' @export |
|
71 |
-#' |
|
72 |
-#' @examples |
|
73 |
-#' association_file_columns() |
|
74 |
-association_file_columns <- function() { |
|
75 |
- c( |
|
76 |
- "ProjectID", "FUSIONID", "PoolID", "TagSequence", "SubjectID", |
|
77 |
- "VectorType", "VectorID", "ExperimentID", "Tissue", "TimePoint", |
|
78 |
- "DNAFragmentation", "PCRMethod", "TagIDextended", "Keywords", |
|
79 |
- "CellMarker", |
|
80 |
- "TagID", "NGSProvider", "NGSTechnology", "ConverrtedFilesDir", |
|
81 |
- "ConverrtedFilesName", "SourceFileFolder", "SourceFileNameR1", |
|
82 |
- "SourceFileNameR2", "DNAnumber", "ReplicateNumber", "DNAextractionDate", |
|
83 |
- "DNAngUsed", "LinearPCRID", "LinearPCRDate", "SonicationDate", |
|
84 |
- "LigationDate", "1stExpoPCRID", "1stExpoPCRDate", "2ndExpoID", |
|
85 |
- "2ndExpoDate", "FusionPrimerPCRID", "FusionPrimerPCRDate", |
|
86 |
- "PoolDate", "SequencingDate", "VCN", "Genome", "SequencingRound", |
|
87 |
- "Genotype", "TestGroup", "MOI", "Engraftment", "Transduction", "Notes", |
|
88 |
- "AddedField1", "AddedField2", "AddedField3", "AddedField4", |
|
89 |
- "concatenatePoolIDSeqRun", "AddedField6_RelativeBloodPercentage", |
|
90 |
- "AddedField7_PurityTestFeasibility", "AddedField8_FacsSeparationPurity", |
|
91 |
- "Kapa", "ulForPool", "CompleteAmplificationID", "UniqueID", |
|
92 |
- "StudyTestID", |
|
93 |
- "StudyTestGroup", "MouseID", "Tigroup", "Tisource", |
|
94 |
- "PathToFolderProjectID", |
|
95 |
- "SamplesNameCheck", |
|
96 |
- "TimepointDays", "TimepointMonths", |
|
97 |
- "TimepointYears", "ng DNA corrected" |
|
98 |
- ) |
|
186 |
+# Internal: associates column types with column names for a more precise |
|
187 |
+# import |
|
188 |
+.annotation_IS_types <- function(mode) { |
|
189 |
+ specs <- annotation_IS_vars(include_types = TRUE) |
|
190 |
+ specs_mappings <- specs %>% |
|
191 |
+ dplyr::left_join(.types_mapping(), by = "types") |
|
192 |
+ if (mode == "fread") { |
|
193 |
+ specs_mappings <- specs_mappings %>% |
|
194 |
+ dplyr::select(.data$names, .data$fread) %>% |
|
195 |
+ dplyr::group_by(.data$fread) |
|
196 |
+ types <- specs_mappings %>% |
|
197 |
+ dplyr::group_keys() %>% |
|
198 |
+ dplyr::pull(.data$fread) |
|
199 |
+ specs_mappings <- specs_mappings %>% |
|
200 |
+ dplyr::group_split(.keep = FALSE) |
|
201 |
+ names(specs_mappings) <- types |
|
202 |
+ types <- purrr::map(specs_mappings, ~ .x$names) |
|
203 |
+ return(types) |
|
204 |
+ } |
|
205 |
+ types <- as.list(setNames(specs_mappings$mapping, specs_mappings$names)) |
|
206 |
+ return(types) |
|
99 | 207 |
} |
100 | 208 |
|
101 | 209 |
# Internal: associates column types with column names for a more precise |
102 | 210 |
# import |
103 | 211 |
.af_col_types <- function(mode) { |
104 |
- if (mode == "fread") { |
|
105 |
- types <- list( |
|
106 |
- character = c( |
|
107 |
- "ProjectID", "FUSIONID", "PoolID", "TagSequence", |
|
108 |
- "SubjectID", "VectorType", "VectorID", "ExperimentID", |
|
109 |
- "Tissue", "TimePoint", "DNAFragmentation", |
|
110 |
- "PCRMethod", "TagIDextended", "Keywords", |
|
111 |
- "CellMarker", "TagID", "NGSProvider", "NGSTechnology", |
|
112 |
- "ConverrtedFilesDir", "ConverrtedFilesName", |
|
113 |
- "SourceFileFolder", "SourceFileNameR1", |
|
114 |
- "SourceFileNameR2", "DNAnumber", "LinearPCRID", |
|
115 |
- "1stExpoPCRID", "2ndExpoID", "FusionPrimerPCRID", |
|
116 |
- "Genome", "Genotype", "Notes", "AddedField1", |
|
117 |
- "AddedField2", "AddedField3", "AddedField4", |
|
118 |
- "concatenatePoolIDSeqRun", "CompleteAmplificationID", |
|
119 |
- "UniqueID", "StudyTestID", "Tigroup", "Tisource", |
|
120 |
- "PathToFolderProjectID", "SamplesNameCheck", |
|
121 |
- "DNAextractionDate", "LinearPCRDate", |
|
122 |
- "SonicationDate", "LigationDate", |
|
123 |
- "FusionPrimerPCRDate", "PoolDate", "SequencingDate", |
|
124 |
- "MOI", "AddedField6_RelativeBloodPercentage", |
|
125 |
- "TestGroup" |
|
126 |
- ), |
|
127 |
- double = c( |
|
128 |
- "DNAngUsed", "VCN", "Engraftment", "Transduction", |
|
129 |
- "AddedField7_PurityTestFeasibility", |
|
130 |
- "AddedField8_FacsSeparationPurity", "Kapa", |
|
131 |
- "ulForPool", "TimepointMonths", "TimepointYears", |
|
132 |
- "ng DNA corrected" |
|
133 |
- ), |
|
134 |
- integer = c( |
|
135 |
- "ReplicateNumber", "SequencingRound", |
|
136 |
- "StudyTestGroup", "MouseID", "TimepointDays" |
|
137 |
- ) |
|
138 |
- ) |
|
139 |
- return(types) |
|
140 |
- } |
|
141 |
- if (mode == "readr") { |
|
142 |
- types <- list( |
|
143 |
- ProjectID = "c", FUSIONID = "c", PoolID = "c", TagSequence = "c", |
|
144 |
- SubjectID = "c", VectorType = "c", VectorID = "c", |
|
145 |
- ExperimentID = "c", Tissue = "c", TimePoint = "c", |
|
146 |
- DNAFragmentation = "c", PCRMethod = "c", TagIDextended = "c", |
|
147 |
- Keywords = "c", CellMarker = "c", TagID = "c", |
|
148 |
- NGSProvider = "c", NGSTechnology = "c", |
|
149 |
- ConverrtedFilesDir = "c", ConverrtedFilesName = "c", |
|
150 |
- SourceFileFolder = "c", SourceFileNameR1 = "c", |
|
151 |
- SourceFileNameR2 = "c", DNAnumber = "c", LinearPCRID = "c", |
|
152 |
- `1stExpoPCRID` = "c", `2ndExpoID` = "c", |
|
153 |
- FusionPrimerPCRID = "c", Genome = "c", Genotype = "c", |
|
154 |
- Notes = "c", AddedField1 = "c", |
|
155 |
- AddedField2 = "c", AddedField3 = "c", AddedField4 = "c", |
|
156 |
- concatenatePoolIDSeqRun = "c", CompleteAmplificationID = "c", |
|
157 |
- UniqueID = "c", StudyTestID = "c", Tigroup = "c", Tisource = "c", |
|
158 |
- PathToFolderProjectID = "c", SamplesNameCheck = "c", |
|
159 |
- DNAextractionDate = "c", |
|
160 |
- LinearPCRDate = "c", |
|
161 |
- SonicationDate = "c", |
|
162 |
- LigationDate = "c", |
|
163 |
- FusionPrimerPCRDate = "c", |
|
164 |
- PoolDate = "c", |
|
165 |
- SequencingDate = "c", |
|
166 |
- MOI = "c", AddedField6_RelativeBloodPercentage = "c", |
|
167 |
- DNAngUsed = "d", VCN = "d", Engraftment = "d", Transduction = "d", |
|
168 |
- AddedField7_PurityTestFeasibility = "d", |
|
169 |
- AddedField8_FacsSeparationPurity = "d", Kapa = "d", |
|
170 |
- ulForPool = "d", TimepointMonths = "d", TimepointYears = "d", |
|
171 |
- `ng DNA corrected` = "d", |
|
172 |
- ReplicateNumber = "i", SequencingRound = "i", TestGroup = "c", |
|
173 |
- MouseID = "i", TimepointDays = "i", |
|
174 |
- `1stExpoPCRDate` = "c", |
|
175 |
- `2ndExpoDate` = "c", |
|
176 |
- StudyTestGroup = "i" |
|
177 |
- ) |
|
178 |
- return(types) |
|
179 |
- } |
|
212 |
+ specs <- association_file_columns(include_types = TRUE) |
|
213 |
+ specs_mappings <- specs %>% |
|
214 |
+ dplyr::left_join(.types_mapping(), by = "types") |
|
215 |
+ if (mode == "fread") { |
|
216 |
+ specs_mappings <- specs_mappings %>% |
|
217 |
+ dplyr::select(.data$names, .data$fread) %>% |
|
218 |
+ dplyr::group_by(.data$fread) |
|
219 |
+ types <- specs_mappings %>% |
|
220 |
+ dplyr::group_keys() %>% |
|
221 |
+ dplyr::pull(.data$fread) |
|
222 |
+ specs_mappings <- specs_mappings %>% |
|
223 |
+ dplyr::group_split(.keep = FALSE) |
|
224 |
+ names(specs_mappings) <- types |
|
225 |
+ types <- purrr::map(specs_mappings, ~ .x$names) |
|
226 |
+ return(types) |
|
227 |
+ } |
|
228 |
+ types <- as.list(setNames(specs_mappings$mapping, specs_mappings$names)) |
|
180 | 229 |
} |
181 | 230 |
|
231 |
+ |
|
182 | 232 |
# Internal: used for file system alignment in import_association_file, |
183 | 233 |
# gives the names of the columns that respectively contain: |
184 | 234 |
# - the absolute path on disk of the project |
... | ... |
@@ -188,6 +238,11 @@ association_file_columns <- function() { |
188 | 238 |
list(project = "Path", quant = "Path_quant", iss = "Path_iss") |
189 | 239 |
} |
190 | 240 |
|
241 |
+ |
|
242 |
+.default_matrix_suffixes <- function() { |
|
243 |
+ |
|
244 |
+} |
|
245 |
+ |
|
191 | 246 |
.matrix_annotated_suffixes <- function() { |
192 | 247 |
c(".no0.annotated") |
193 | 248 |
} |
... | ... |
@@ -277,3 +332,22 @@ refGene_table_cols <- function() { |
277 | 332 |
"max_cdsEnd", "minmax_CdsLen", "average_CdsLen" |
278 | 333 |
) |
279 | 334 |
} |
335 |
+ |
|
336 |
+ |
|
337 |
+available_column_tags <- function() { |
|
338 |
+ list( |
|
339 |
+ critical = list(af = c( |
|
340 |
+ "project_id", "pool_id", "tag_seq", "subject", "tissue", |
|
341 |
+ "cell_marker", "pcr_replicate", "vispa_concatenate", |
|
342 |
+ "pcr_repl_id", "proj_folder" |
|
343 |
+ ), |
|
344 |
+ matrix = c("chromosome", "locus", "is_strand", "gene_symbol"), |
|
345 |
+ stats = c("vispa_concatenate", "tag_seq")), |
|
346 |
+ optional = list(af = c( |
|
347 |
+ "tp_days", "pcr_method", "ngs_tech", "dna_num", |
|
348 |
+ "vcn", "genome" |
|
349 |
+ ), |
|
350 |
+ matrix = c("gene_strand"), |
|
351 |
+ stats = c()) |
|
352 |
+ ) |
|
353 |
+} |
... | ... |
@@ -5,18 +5,24 @@ |
5 | 5 |
#' |
6 | 6 |
#' @description \lifecycle{stable} |
7 | 7 |
#' This function allows to read and import an integration matrix |
8 |
-#' produced as the output of Vispa2 pipeline and converts it to a tidy |
|
8 |
+#' (ideally produced by VISPA2) and converts it to a tidy |
|
9 | 9 |
#' format. |
10 | 10 |
#' |
11 | 11 |
#' @param path The path to the file on disk |
12 |
-#' @param to_exclude Either NULL or a character vector of column names that |
|
12 |
+#' @param to_exclude Either `NULL` or a character vector of column names that |
|
13 | 13 |
#' should be ignored when importing |
14 | 14 |
#' @param keep_excluded Keep the columns in `to_exclude` as additional |
15 |
-#' id columns? |
|
15 |
+#' columns? (also as a vector) |
|
16 | 16 |
#' @param separator The column delimiter used, defaults to `\t` |
17 | 17 |
#' |
18 |
+#' @section Required vars and tags: |
|
19 |
+#' ## Required vars |
|
20 |
+#' TODO |
|
21 |
+#' |
|
18 | 22 |
#' @return A data.table object in tidy format |
23 |
+#' |
|
19 | 24 |
#' @family Import functions |
25 |
+#' |
|
20 | 26 |
#' @importFrom rlang abort inform |
21 | 27 |
#' @importFrom fs path_ext |
22 | 28 |
#' @importFrom readr read_delim cols |
... | ... |
@@ -45,12 +51,15 @@ |
45 | 51 |
#' matrix <- import_single_Vispa2Matrix(matrix_path) |
46 | 52 |
#' head(matrix) |
47 | 53 |
import_single_Vispa2Matrix <- function(path, |
48 |
- to_exclude = NULL, |
|
49 |
- keep_excluded = FALSE, |
|
50 |
- separator = "\t") { |
|
54 |
+ separator = "\t", |
|
55 |
+ additional_cols = NULL, |
|
56 |
+ transformations = NULL, |
|
57 |
+ sample_names_to = pcr_id_column(), |
|
58 |
+ values_to = "Value", |
|
59 |
+ to_exclude = lifecycle::deprecated(), |
|
60 |
+ keep_excluded = lifecycle::deprecated() |
|
61 |
+ ) { |
|
51 | 62 |
stopifnot(!missing(path) & is.character(path)) |
52 |
- stopifnot(is.null(to_exclude) || is.character(to_exclude)) |
|
53 |
- stopifnot(is.logical(keep_excluded)) |
|
54 | 63 |
stopifnot(is.character(separator)) |
55 | 64 |
if (!file.exists(path)) { |
56 | 65 |
not_found_msg <- paste("File not found at", path) |
... | ... |
@@ -59,150 +68,42 @@ import_single_Vispa2Matrix <- function(path, |
59 | 68 |
if (!fs::is_file(path)) { |
60 | 69 |
rlang::abort("Path exists but is not a file") |
61 | 70 |
} |
62 |
- mode <- "fread" |
|
63 |
- ## Is the file compressed? |
|
64 |
- is_compressed <- fs::path_ext(path) %in% .compressed_formats() |
|
65 |
- if (is_compressed) { |
|
66 |
- ## The compression type is supported by data.table::fread? |
|
67 |
- compression_type <- fs::path_ext(path) |
|
68 |
- if (!compression_type %in% .supported_fread_compression_formats()) { |
|
69 |
- ### If not, switch to classic for reading |
|
70 |
- mode <- "classic" |
|
71 |
- if (getOption("ISAnalytics.verbose") == TRUE) { |
|
72 |
- rlang::inform(.unsupported_comp_format_inf(), |
|
73 |
- class = "unsup_comp_format" |
|
74 |
- ) |
|
75 |
- } |
|
76 |
- } |
|
71 |
+ stopifnot(is.null(transformations) || |
|
72 |
+ (is.list(transformations) && !is.null(names(transformations)))) |
|
73 |
+ stopifnot(is.character(sample_names_to)) |
|
74 |
+ sample_names_to <- sample_names_to[1] |
|
75 |
+ stopifnot(is.character(values_to)) |
|
76 |
+ values_to <- values_to[1] |
|
77 |
+ deprecation_details <- paste("Arguments 'to_exclude' and 'keep_excluded'", |
|
78 |
+ "are deprecated in favor of a single argument", |
|
79 |
+ "which allows a more refined tuning. See", |
|
80 |
+ "`?import_single_Vispa2Matrix` for details") |
|
81 |
+ if (lifecycle::is_present(to_exclude)) { |
|
82 |
+ lifecycle::deprecate_warn( |
|
83 |
+ when = "1.5.4", |
|
84 |
+ what = "import_single_Vispa2Matrix(to_exclude)", |
|
85 |
+ with = "import_single_Vispa2Matrix(additional_cols)", |
|
86 |
+ details = deprecation_details |
|
87 |
+ ) |
|
88 |
+ return(NULL) |
|
77 | 89 |
} |
78 |
- ### Peak headers |
|
79 |
- peek_headers <- readr::read_delim(path, |
|
80 |
- delim = separator, n_max = 0, |
|
81 |
- col_types = readr::cols() |
|
82 |
- ) |
|
83 |
- ## - Detect type |
|
84 |
- df_type <- .auto_detect_type(peek_headers) |
|
85 |
- if (df_type == "MALFORMED") { |
|
86 |
- rlang::abort(.malformed_ISmatrix_error(), |
|
87 |
- class = "malformed_ism" |
|
88 |
- ) |
|
89 |
- } |
|
90 |
- is_annotated <- .is_annotated(peek_headers) |
|
91 |
- ## - Start reading |
|
92 |
- if (getOption("ISAnalytics.verbose") == TRUE) { |
|
93 |
- rlang::inform(c("Reading file...", i = paste0("Mode: ", mode))) |
|
90 |
+ if (lifecycle::is_present(keep_excluded)) { |
|
91 |
+ lifecycle::deprecate_warn( |
|
92 |
+ when = "1.5.4", |
|
93 |
+ what = "import_single_Vispa2Matrix(keep_excluded)", |
|
94 |
+ with = "import_single_Vispa2Matrix(additional_cols)", |
|
95 |
+ details = deprecation_details |
|
96 |
+ ) |
|
97 |
+ return(NULL) |
|
94 | 98 |
} |
95 |
- df <- if (mode == "fread") { |
|
96 |
- if (!keep_excluded) { |
|
97 |
- .read_with_fread( |
|
98 |
- path = path, to_drop = to_exclude, |
|
99 |
- df_type = df_type, annotated = is_annotated, |
|
100 |
- sep = separator |
|
101 |
- ) |
|
102 |
- } else { |
|
103 |
- .read_with_fread( |
|
104 |
- path = path, to_drop = NULL, |
|
105 |
- df_type = df_type, annotated = is_annotated, |
|
106 |
- sep = separator |
|
107 |
- ) |
|
108 |
- } |
|
109 |
- } else { |
|
110 |
- if (!keep_excluded) { |
|
111 |
- .read_with_readr( |
|
112 |
- path = path, to_drop = to_exclude, |
|
113 |
- df_type = df_type, annotated = is_annotated, |
|
114 |
- sep = separator |
|
115 |
- ) |
|
116 |
- } else { |
|
117 |
- .read_with_readr( |
|
118 |
- path = path, to_drop = NULL, |
|
119 |
- df_type = df_type, annotated = is_annotated, |
|
120 |
- sep = separator |
|
121 |
- ) |
|
122 |
- } |
|
123 |
- } |
|
124 |
- ## - Report summary |
|
125 |
- if (getOption("ISAnalytics.verbose") == TRUE) { |
|
126 |
- rlang::inform(.summary_ism_import_msg( |
|
127 |
- df_type, |
|
128 |
- .is_annotated(df), |
|
129 |
- dim(df), |
|
130 |
- mode |
|
131 |
- ), |
|
132 |
- class = "ism_import_summary" |
|
133 |
- ) |
|
134 |
- } |
|
135 |
- if (df_type == "OLD") { |
|
136 |
- df <- df %>% |
|
137 |
- tidyr::separate( |
|
138 |
- col = .data$IS_genomicID, |
|
139 |
- into = mandatory_IS_vars(), |
|
140 |
- sep = "_", remove = TRUE, |
|
141 |
- convert = TRUE |
|
142 |
- ) %>% |
|
143 |
- dplyr::mutate(chr = stringr::str_replace( |
|
144 |
- .data$chr, "chr", "" |
|
145 |
- )) |
|
146 |
- } |
|
147 |
- ## - Split in chunks |
|
148 |
- if (getOption("ISAnalytics.verbose") == TRUE) { |
|
149 |
- rlang::inform("Reshaping...") |
|
150 |
- } |
|
151 |
- chunks <- split(df, |
|
152 |
- by = c("chr"), |
|
153 |
- verbose = FALSE |
|
154 |
- ) |
|
155 |
- ## - Melt in parallel |
|
156 |
- p <- if (.Platform$OS.type == "windows") { |
|
157 |
- BiocParallel::SnowParam( |
|
158 |
- tasks = length(chunks), |
|
159 |
- progressbar = getOption("ISAnalytics.verbose"), |
|
160 |
- exportglobals = TRUE, |
|
161 |
- stop.on.error = TRUE |
|
162 |
- ) |
|
163 |
- } else { |
|
164 |
- BiocParallel::MulticoreParam( |
|
165 |
- tasks = length(chunks), |
|
166 |
- progressbar = getOption("ISAnalytics.verbose"), |
|
167 |
- exportglobals = FALSE, |
|
168 |
- stop.on.error = TRUE |
|
169 |
- ) |
|
170 |
- } |
|
171 |
- mt <- function(data, annot) { |
|
172 |
- id_vars <- if (annot) { |
|
173 |
- c( |
|
174 |
- mandatory_IS_vars(), |
|
175 |
- annotation_IS_vars() |
|
176 |
- ) |
|
177 |
- } else { |
|
178 |
- mandatory_IS_vars() |
|
179 |
- } |
|
180 |
- if (!is.null(to_exclude)) { |
|
181 |
- if (length(to_exclude) > 0 && keep_excluded) { |
|
182 |
- id_vars <- c(id_vars, to_exclude) |
|
183 |
- } |
|
184 |
- } |
|
185 |
- data.table::melt.data.table(data, |
|
186 |
- id.vars = id_vars, |
|
187 |
- variable.name = "CompleteAmplificationID", |
|
188 |
- value.name = "Value", |
|
189 |
- na.rm = TRUE, |
|
190 |
- verbose = FALSE |
|
191 |
- ) |
|
192 |
- } |
|
193 |
- tidy_chunks <- BiocParallel::bplapply( |
|
194 |
- X = chunks, |
|
195 |
- FUN = mt, |
|
196 |
- annot = is_annotated, |
|
197 |
- BPPARAM = p |
|
198 |
- ) |
|
199 |
- BiocParallel::bpstop(p) |
|
200 |
- tidy <- data.table::rbindlist(tidy_chunks) |
|
201 |
- tidy <- tidy["Value" > 0] |
|
202 |
- if (getOption("ISAnalytics.verbose") == TRUE) { |
|
203 |
- rlang::inform("Done!") |
|
204 |
- } |
|
205 |
- return(tidy) |
|
99 |
+ |
|
100 |
+ tidy_df <- .import_single_matrix(path = path, separator = separator, |
|
101 |
+ additional_cols = additional_cols, |
|
102 |
+ transformations = transformations, |
|
103 |
+ call_mode = "EXTERNAL", |
|
104 |
+ id_col_name = sample_names_to, |
|
105 |
+ val_col_name = values_to) |
|
106 |
+ return(tidy_df) |
|
206 | 107 |
} |
207 | 108 |
|
208 | 109 |
|
... | ... |
@@ -256,41 +157,122 @@ import_single_Vispa2Matrix <- function(path, |
256 | 157 |
#' @examples |
257 | 158 |
#' fs_path <- system.file("extdata", "fs.zip", package = "ISAnalytics") |
258 | 159 |
#' fs <- unzip_file_system(fs_path, "fs") |
259 |
-#' af_path <- system.file("extdata", "asso.file.tsv.gz", package = "ISAnalytics") |
|
160 |
+#' af_path <- system.file("extdata", "asso.file.tsv.gz", |
|
161 |
+#' package = "ISAnalytics" |
|
162 |
+#' ) |
|
260 | 163 |
#' af <- import_association_file(af_path, root = fs, report_path = NULL) |
261 | 164 |
#' head(af) |
262 | 165 |
import_association_file <- function(path, |
263 | 166 |
root = NULL, |
264 |
- tp_padding = 4, |
|
265 | 167 |
dates_format = "ymd", |
266 | 168 |
separator = "\t", |
267 | 169 |
filter_for = NULL, |
268 | 170 |
import_iss = FALSE, |
269 | 171 |
convert_tp = TRUE, |
270 | 172 |
report_path = default_report_path(), |
173 |
+ transformations = default_af_transform(convert_tp), |
|
174 |
+ tp_padding = lifecycle::deprecated(), |
|
271 | 175 |
...) { |
272 | 176 |
# Check parameters |
273 |
- stopifnot(is.character(path) & length(path) == 1) |
|
177 |
+ stopifnot(is.character(path)) |
|
178 |
+ path <- path[1] |
|
274 | 179 |
stopifnot((is.character(root) & length(root) == 1) || (is.null(root))) |
275 | 180 |
stopifnot(file.exists(path)) |
276 | 181 |
if (!is.null(root) && root != "") { |
277 | 182 |
stopifnot(file.exists(root)) |
278 | 183 |
} |
279 |
- stopifnot((is.numeric(tp_padding) | |
|
280 |
- is.integer(tp_padding)) & length(tp_padding) == 1) |
|
281 | 184 |
stopifnot(length(dates_format) == 1 & dates_format %in% date_formats()) |
282 |
- stopifnot(is.character(separator) && length(separator) == 1) |
|
283 |
- stopifnot(is.logical(import_iss) && length(import_iss) == 1) |
|
185 |
+ stopifnot(is.character(separator)) |
|
186 |
+ separator <- separator[1] |
|
187 |
+ stopifnot(is.logical(import_iss)) |
|
188 |
+ import_iss <- import_iss[1] |
|
284 | 189 |
if (import_iss & is.null(root)) { |
285 | 190 |
rlang::abort(.no_stats_import_err()) |
286 | 191 |
} |
192 |
+ stopifnot(is.null(transformations) || |
|
193 |
+ (is.list(transformations) && !is.null(names(transformations)))) |
|
194 |
+ if (lifecycle::is_present(tp_padding)) { |
|
195 |
+ lifecycle::deprecate_warn( |
|
196 |
+ when = "1.5.4", |
|
197 |
+ what = "import_association_file(tp_padding)", |
|
198 |
+ details = c(paste( |
|
199 |
+ "The argument is now deprecated in favor of custom", |
|
200 |
+ "column transformations" |
|
201 |
+ ), |
|
202 |
+ i = paste( |
|
203 |
+ "See the documentation of `transform_columns`", |
|
204 |
+ "or browse the package vignettes for more details" |
|
205 |
+ ) |
|
206 |
+ ) |
|
207 |
+ ) |
|
208 |
+ } |
|
287 | 209 |
# Check filter |
288 | 210 |
stopifnot(is.null(filter_for) || |
289 | 211 |
(is.list(filter_for) && !is.null(names(filter_for)))) |
212 |
+ |
|
213 |
+ # Check presence of required tags |
|
214 |
+ required_tags <- list() |
|
215 |
+ req_tags_politic <- c() |
|
216 |
+ if (!is.null(root)) { |
|
217 |
+ ### Tags required for file system alignment |
|
218 |
+ required_tags <- append( |
|
219 |
+ required_tags, |
|
220 |
+ list( |
|
221 |
+ "project_id" = "char", |
|
222 |
+ "proj_folder" = "char", |
|
223 |
+ "vispa_concatenate" = "char" |
|
224 |
+ ) |
|
225 |
+ ) |
|
226 |
+ req_tags_politic <- c(req_tags_politic, |
|
227 |
+ "project_id" = "error", |
|
228 |
+ "proj_folder" = "error", |
|
229 |
+ "vispa_concatenate" = "error" |
|
230 |
+ ) |
|
231 |
+ } |
|
232 |
+ if (convert_tp) { |
|
233 |
+ ### tags required for time point conversion |
|
234 |
+ required_tags <- append( |
|
235 |
+ required_tags, |
|
236 |
+ list("tp_days" = c("char", "int", "numeric")) |
|
237 |
+ ) |
|
238 |
+ req_tags_politic <- c(req_tags_politic, |
|
239 |
+ "tp_days" = "first" |
|
240 |
+ ) |
|
241 |
+ } |
|
242 |
+ tags_to_cols <- if (!purrr::is_empty(required_tags)) { |
|
243 |
+ .check_required_cols( |
|
244 |
+ required_tags = required_tags, |
|
245 |
+ vars_df = association_file_columns(TRUE), |
|
246 |
+ duplicate_politic = req_tags_politic |
|
247 |
+ ) |
|
248 |
+ } else { |
|
249 |
+ NULL |
|
250 |
+ } |
|
290 | 251 |
# Read file and check the correctness |
291 | 252 |
af_checks <- .manage_association_file( |
292 |
- path, root, tp_padding, dates_format, |
|
293 |
- separator, filter_for |
|
253 |
+ af_path = path, |
|
254 |
+ root = root, |
|
255 |
+ format = dates_format, |
|
256 |
+ delimiter = separator, |
|
257 |
+ filter = filter_for, |
|
258 |
+ proj_fold_col = dplyr::if_else(!is.null(tags_to_cols), |
|
259 |
+ tags_to_cols %>% |
|
260 |
+ dplyr::filter(.data$tag == "proj_folder") %>% |
|
261 |
+ dplyr::pull(.data$names), |
|
262 |
+ NULL |
|
263 |
+ ), |
|
264 |
+ concat_pool_col = dplyr::if_else(!is.null(tags_to_cols), |
|
265 |
+ tags_to_cols %>% |
|
266 |
+ dplyr::filter(.data$tag == "vispa_concatenate") %>% |
|
267 |
+ dplyr::pull(.data$names), |
|
268 |
+ NULL |
|
269 |
+ ), |
|
270 |
+ project_id_col = dplyr::if_else(!is.null(tags_to_cols), |
|
271 |
+ tags_to_cols %>% |
|
272 |
+ dplyr::filter(.data$tag == "project_id") %>% |
|
273 |
+ dplyr::pull(.data$names), |
|
274 |
+ NULL |
|
275 |
+ ) |
|
294 | 276 |
) |
295 | 277 |
as_file <- af_checks$af |
296 | 278 |
parsing_problems <- af_checks$parsing_probs |
... | ... |
@@ -299,68 +281,35 @@ import_association_file <- function(path, |
299 | 281 |
if (nrow(parsing_problems) == 0) { |
300 | 282 |
parsing_problems <- NULL |
301 | 283 |
} |
302 |
- if (nrow(date_problems) == 0) { |
|
284 |
+ if (is.null(date_problems) || nrow(date_problems) == 0) { |
|
303 | 285 |
date_problems <- NULL |
304 | 286 |
} |
305 | 287 |
col_probs <- list(missing = NULL, non_standard = NULL) |
306 | 288 |
if (!.check_af_correctness(as_file)) { |
307 |
- col_probs[["missing"]] <- association_file_columns()[ |
|
308 |
- !association_file_columns() %in% colnames(as_file) |
|
289 |
+ min_required_cols <- association_file_columns(TRUE) %>% |
|
290 |
+ dplyr::filter(.data$flag == "required") %>% |
|
291 |
+ dplyr::pull(.data$names) |
|
292 |
+ col_probs[["missing"]] <- min_required_cols[ |
|
293 |
+ !min_required_cols %in% colnames(as_file) |
|
309 | 294 |
] |
310 | 295 |
} |
311 | 296 |
non_standard <- colnames(as_file)[ |
312 | 297 |
!colnames(as_file) %in% c( |
313 |
- association_file_columns(), "Path", |
|
314 |
- "Path_quant", "Path_iss" |
|
298 |
+ association_file_columns(), .path_cols_names() |
|
315 | 299 |
) |
316 | 300 |
] |
317 | 301 |
if (!purrr::is_empty(non_standard)) { |
318 | 302 |
col_probs[["non_standard"]] <- non_standard |
319 | 303 |
} |
320 |
- missing_dates <- purrr::map_lgl(date_columns_coll(), function(date_col) { |
|
321 |
- any(is.na(as_file[[date_col]])) |
|
322 |
- }) %>% purrr::set_names(date_columns_coll()) |
|
323 |
- missing_dates <- names(missing_dates)[missing_dates == TRUE] |
|
324 |
- if (length(missing_dates) == 0) { |
|
325 |
- missing_dates <- NULL |
|
326 |
- } |
|
327 | 304 |
## Fix timepoints |
328 | 305 |
if (convert_tp) { |
329 |
- if (!"TimepointMonths" %in% colnames(as_file)) { |
|
330 |
- as_file <- as_file %>% |
|
331 |
- tibble::add_column(TimepointMonths = NA_real_) |
|
332 |
- } |
|
333 |
- if (!"TimepointYears" %in% colnames(as_file)) { |
|
334 |
- as_file <- as_file %>% |
|
335 |
- tibble::add_column(TimepointYears = NA_real_) |
|
336 |
- } |
|
306 |
+ tp_col <- tags_to_cols %>% |
|
307 |
+ dplyr::filter(.data$tag == "tp_days") %>% |
|
308 |
+ dplyr::pull(.data$names) |
|
337 | 309 |
as_file <- as_file %>% |
338 | 310 |
dplyr::mutate( |
339 |
- TimepointMonths = dplyr::if_else( |
|
340 |
- condition = as.numeric(.data$TimePoint) == 0, |
|
341 |
- true = 0, |
|
342 |
- false = dplyr::if_else( |
|
343 |
- condition = as.numeric(.data$TimePoint) > 0 & |
|
344 |
- as.numeric(.data$TimePoint) < 30, |
|
345 |
- true = ceiling(as.numeric(.data$TimePoint) / 30), |
|
346 |
- false = round(as.numeric(.data$TimePoint) / 30) |
|
347 |
- ) |
|
348 |
- ), |
|
349 |
- TimepointYears = dplyr::if_else( |
|
350 |
- condition = as.numeric(.data$TimePoint) == 0, |
|
351 |
- true = 0, |
|
352 |
- false = ceiling(as.numeric(.data$TimePoint) / 360) |
|
353 |
- ) |
|
354 |
- ) %>% |
|
355 |
- dplyr::mutate( |
|
356 |
- TimepointMonths = stringr::str_pad( |
|
357 |
- as.character(.data$TimepointMonths), |
|
358 |
- pad = "0", side = "left", width = 2 |
|
359 |
- ), |
|
360 |
- TimepointYears = stringr::str_pad( |
|
361 |
- as.character(.data$TimepointYears), |
|
362 |
- pad = "0", side = "left", width = 2 |
|
363 |
- ), |
|
311 |
+ TimepointMonths = .timepoint_to_months(.data[[tp_col]]), |
|
312 |
+ TimepointYears = .timepoint_to_years(.data[[tp_col]]) |
|
364 | 313 |
) |
365 | 314 |
} |
366 | 315 |
import_stats_rep <- NULL |
... | ... |
@@ -406,6 +355,29 @@ import_association_file <- function(path, |
406 | 355 |
} |
407 | 356 |
} |
408 | 357 |
} |
358 |
+ |
|
359 |
+ if (!is.null(transformations)) { |
|
360 |
+ as_file <- transform_columns(as_file, transformations) |
|
361 |
+ } |
|
362 |
+ |
|
363 |
+ crit_colnames <- association_file_columns(TRUE) %>% |
|
364 |
+ dplyr::filter(.data$tag %in% available_column_tags()$critical$af) %>% |
|
365 |
+ dplyr::pull(.data$names) |
|
366 |
+ crit_colnames <- colnames(as_file)[colnames(as_file) %in% crit_colnames] |
|
367 |
+ crit_nas <- if (length(crit_colnames) > 0) { |
|
368 |
+ nas_crit <- purrr::map_lgl(crit_colnames, ~ { |
|
369 |
+ any(is.na(as_file[[.x]])) |
|
370 |
+ }) %>% |
|
371 |
+ purrr::set_names(crit_colnames) |
|
372 |
+ nas_crit <- names(purrr::keep(nas_crit, ~ .x == TRUE)) |
|
373 |
+ if (length(nas_crit) == 0) { |
|
374 |
+ NULL |
|
375 |
+ } else { |
|
376 |
+ nas_crit |
|
377 |
+ } |
|
378 |
+ } else { |
|
379 |
+ NULL |
|
380 |
+ } |
|
409 | 381 |
withCallingHandlers( |
410 | 382 |
{ |
411 | 383 |
.produce_report("asso_file", |
... | ... |
@@ -413,7 +385,7 @@ import_association_file <- function(path, |
413 | 385 |
parsing_prob = parsing_problems, |
414 | 386 |
dates_prob = date_problems, |
415 | 387 |
col_prob = col_probs, |
416 |
- crit_nas = missing_dates, |
|
388 |
+ crit_nas = crit_nas, |
|
417 | 389 |
fs_align = checks, |
418 | 390 |
iss_stats = import_stats_rep, |
419 | 391 |
iss_stats_miss = missing_stats_rep |
... | ... |
@@ -430,7 +402,7 @@ import_association_file <- function(path, |
430 | 402 |
summary_report <- .summary_af_import_msg( |
431 | 403 |
pars_prob = parsing_problems, dates_prob = date_problems, |
432 | 404 |
cols_prob = col_probs[[!is.null(col_probs)]], |
433 |
- crit_na = missing_dates, |
|
405 |
+ crit_na = crit_nas, |
|
434 | 406 |
checks = ifelse(is.null(checks), |
435 | 407 |
yes = "skipped", |
436 | 408 |
no = ifelse(any(!checks$Found), |
... | ... |
@@ -503,13 +475,22 @@ import_Vispa2_stats <- function(association_file, |
503 | 475 |
if (!path_cols$iss %in% colnames(association_file)) { |
504 | 476 |
rlang::abort(.af_not_aligned_err()) |
505 | 477 |
} |
506 |
- min_cols <- c( |
|
507 |
- "ProjectID", |
|
508 |
- "CompleteAmplificationID", |
|
509 |
- pool_col, |
|
510 |
- path_cols$iss, |
|
511 |
- "TagSequence" |
|
478 |
+ required_tags <- list( |
|
479 |
+ "project_id" = "char", |
|
480 |
+ "tag_seq" = "char", |
|
481 |
+ "pcr_repl_id" = "char" |
|
482 |
+ ) |
|
483 |
+ tag_politics <- list( |
|
484 |
+ "project_id" = "error", |
|
485 |
+ "tag_seq" = "error", |
|
486 |
+ "pcr_repl_id" = "error" |
|
487 |
+ ) |
|
488 |
+ tags_to_cols <- .check_required_cols( |
|
489 |
+ required_tags = required_tags, |
|
490 |
+ vars_df = association_file_columns(TRUE), |
|
491 |
+ duplicate_politic = tag_politics |
|
512 | 492 |
) |
493 |
+ min_cols <- c(tags_to_cols$names, pool_col, path_cols$iss) |
|
513 | 494 |
if (!all(min_cols %in% colnames(association_file))) { |
514 | 495 |
rlang::abort( |
515 | 496 |
.missing_needed_cols( |
... | ... |
@@ -527,7 +508,10 @@ import_Vispa2_stats <- function(association_file, |
527 | 508 |
## Import |
528 | 509 |
stats <- .import_stats_iss( |
529 | 510 |
association_file = association_file, |
530 |
- prefixes = file_prefixes |
|
511 |
+ prefixes = file_prefixes, |
|
512 |
+ pool_col = pool_col, |
|
513 |
+ path_iss_col = path_cols$iss, |
|
514 |
+ tags = tags_to_cols |
|
531 | 515 |
) |
532 | 516 |
report <- stats$report |
533 | 517 |
stats <- stats$stats |
... | ... |
@@ -536,7 +520,7 @@ import_Vispa2_stats <- function(association_file, |
536 | 520 |
if (getOption("ISAnalytics.verbose") == TRUE) { |
537 | 521 |
rlang::inform(.no_stat_files_imported()) |
538 | 522 |
} |
539 |
- if (report_path == "INTERNAL") { |
|
523 |
+ if (!is.null(report_path) && report_path == "INTERNAL") { |
|
540 | 524 |
## If function was called from import_association_file |
541 | 525 |
return(stats = association_file, report = report) |
542 | 526 |
} |
... | ... |
@@ -564,32 +548,77 @@ import_Vispa2_stats <- function(association_file, |
564 | 548 |
## - IF STATS NOT NULL |
565 | 549 |
## Merge if requested |
566 | 550 |
if (join_with_af) { |
551 |
+ required_tags_for_join <- list( |
|
552 |
+ "tag_seq" = "char", |
|
553 |
+ "vispa_concatenate" = "char" |
|
554 |
+ ) |
|
555 |
+ iss_tags_to_cols <- .check_required_cols(required_tags_for_join, |
|
556 |
+ iss_stats_specs(TRUE), |
|
557 |
+ duplicate_politic = "error" |
|
558 |
+ ) |
|
559 |
+ if (any(!iss_tags_to_cols$names %in% colnames(stats))) { |
|
560 |
+ msg <- c("Error in joining VISPA2 stats with AF - skipping", |
|
561 |
+ .missing_needed_cols(iss_tags_to_cols$names[ |
|
562 |
+ !iss_tags_to_cols$names %in% colnames(stats) |
|
563 |
+ ]), |
|
564 |
+ i = paste( |
|
565 |
+ "Needed columns for join were not found in", |
|
566 |
+ "imported stats. Check your iss stats specs", |
|
567 |
+ "with `iss_stats_specs(TRUE)` and check the files", |
|
568 |
+ "are not malformed." |
|
569 |
+ ), |
|
570 |
+ i = paste("Returning imported data only") |
|
571 |
+ ) |
|
572 |
+ rlang::inform(msg, class = "iss_join_missing") |
|
573 |
+ return(stats) |
|
574 |
+ } |
|
575 |
+ iss_pool_col <- iss_tags_to_cols %>% |
|
576 |
+ dplyr::filter(.data$tag == "vispa_concatenate") %>% |
|
577 |
+ dplyr::pull(.data$names) |
|
578 |
+ iss_tag_col <- iss_tags_to_cols %>% |
|
579 |
+ dplyr::filter(.data$tag == "tag_seq") %>% |
|
580 |
+ dplyr::pull(.data$names) |
|
567 | 581 |
association_file <- association_file %>% |
568 | 582 |
dplyr::left_join(stats, by = c( |
569 |
- stats::setNames("POOL", pool_col), |
|
570 |
- "TagSequence" = "TAG" |
|
583 |
+ stats::setNames(iss_pool_col, pool_col), |
|
584 |
+ stats::setNames(iss_tag_col, tags_to_cols %>% |
|
585 |
+ dplyr::filter(.data$tag == "tag_seq") %>% |
|
586 |
+ dplyr::pull(.data$names)) |
|
571 | 587 |
)) |
572 | 588 |
## Detect potential problems |
573 |
- addit_columns <- c("SubjectID", "CellMarker", "Tissue", "TimePoint") |
|
574 |
- addit_columns <- addit_columns[addit_columns %in% |
|
589 |
+ addit_columns <- association_file_columns(TRUE) %>% |
|
590 |
+ dplyr::filter(.data$tag %in% c( |
|
591 |
+ "subject", |
|
592 |
+ "tissue", |
|
593 |
+ "cell_marker", |
|
594 |
+ "tp_days" |
|
595 |
+ )) |
|
596 |
+ addit_columns_names <- addit_columns %>% |
|
597 |
+ dplyr::pull(.data$names) |
|
598 |
+ addit_columns_names <- addit_columns_names[addit_columns_names %in% |
|
575 | 599 |
colnames(association_file)] |
600 |
+ iss_cols_in_af <- colnames(association_file)[ |
|
601 |
+ colnames(association_file) %in% iss_stats_specs() |
|
602 |
+ ] |
|
603 |
+ |
|
576 | 604 |
missing_stats <- association_file %>% |
577 |
- dplyr::filter(is.na(.data$RUN_NAME)) %>% |
|
578 |
- dplyr::select( |
|
579 |
- .data$ProjectID, |
|
580 |
- dplyr::all_of(pool_col), |
|
581 |
- .data$CompleteAmplificationID, |
|
582 |
- .data$TagSequence, |
|
583 |
- dplyr::all_of(addit_columns) |
|
584 |
- ) %>% |
|
605 |
+ dplyr::filter(dplyr::if_all(dplyr::all_of(iss_cols_in_af), is.na)) %>% |
|
606 |
+ dplyr::select(dplyr::all_of(c( |
|
607 |
+ tags_to_cols$names, |
|
608 |
+ pool_col, |
|
609 |
+ addit_columns_names |
|
610 |
+ ))) %>% |
|
585 | 611 |
dplyr::distinct() |
586 |
- if (report_path == "INTERNAL") { |
|
612 |
+ all_af_tags <- tags_to_cols %>% |
|
613 |
+ dplyr::bind_rows(addit_columns) |
|
614 |
+ if (!is.null(report_path) && report_path == "INTERNAL") { |
|
587 | 615 |
## If function was called from import_association_file |
588 | 616 |
return(list( |
589 | 617 |
stats = association_file, |
590 | 618 |
report = list( |
591 | 619 |
import = report, |
592 |
- miss = missing_stats |
|
620 |
+ miss = missing_stats, |
|
621 |
+ af_tag_map = all_af_tags |
|
593 | 622 |
) |
594 | 623 |
)) |
595 | 624 |
} |
... | ... |
@@ -662,7 +691,8 @@ import_Vispa2_stats <- function(association_file, |
662 | 691 |
#' INTERACTIVE, the function will ask for input from the user on console, |
663 | 692 |
#' otherwise the process is fully automated (with limitations, see vignette). |
664 | 693 |
#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Additional named arguments |
665 |
-#' to pass to `ìmport_association_file` and `comparison_matrix` |
|
694 |
+#' to pass to `ìmport_association_file`, `comparison_matrix` and |
|
695 |
+#' `import_single_Vispa2_matrix` |
|
666 | 696 |
#' |
667 | 697 |
#' @importFrom rlang fn_fmls_names dots_list arg_match inform abort |
668 | 698 |
#' @importFrom rlang eval_tidy call2 |
... | ... |
@@ -689,8 +719,8 @@ import_Vispa2_stats <- function(association_file, |
689 | 719 |
#' ) |
690 | 720 |
#' head(matrices) |
691 | 721 |
import_parallel_Vispa2Matrices <- function(association_file, |
692 |
- quantification_type, |
|
693 |
- matrix_type = "annotated", |
|
722 |
+ quantification_type = c("seqCount", "fragmentEstimate"), |
|
723 |
+ matrix_type = c("annotated", "not_annotated"), |
|
694 | 724 |
workers = 2, |
695 | 725 |
multi_quant_matrix = TRUE, |
696 | 726 |
report_path = default_report_path(), |
... | ... |
@@ -702,14 +732,15 @@ import_parallel_Vispa2Matrices <- function(association_file, |
702 | 732 |
association_file, quantification_type, matrix_type, |
703 | 733 |
workers, multi_quant_matrix |
704 | 734 |
) |
705 |
- rlang::arg_match(mode) |
|
735 |
+ matrix_type <- rlang::arg_match(matrix_type) |
|
736 |
+ mode <- rlang::arg_match(mode) |
|
706 | 737 |
## Collect dot args |
707 | 738 |
if (is.character(association_file) || isTRUE(multi_quant_matrix)) { |
708 | 739 |
dots_args <- rlang::dots_list(..., .named = TRUE, .homonyms = "first") |
709 | 740 |
if (is.character(association_file)) { |
710 | 741 |
import_af_arg_names <- rlang::fn_fmls_names(import_association_file) |
711 | 742 |
import_af_arg_names <- import_af_arg_names[ |
712 |
- import_af_arg_names != "path" |
|
743 |
+ !import_af_arg_names %in% c("path", "report_path") |
|
713 | 744 |
] |
714 | 745 |
import_af_args <- dots_args[names(dots_args) %in% |
715 | 746 |
import_af_arg_names] |
... | ... |
@@ -721,27 +752,51 @@ import_parallel_Vispa2Matrices <- function(association_file, |
721 | 752 |
mult_arg_names] |
722 | 753 |
} |
723 | 754 |
} |
724 |
- association_file <- .pre_manage_af(association_file, import_af_args) |
|
755 |
+ import_matrix_arg_names <- rlang::fn_fmls_names( |
|
756 |
+ import_single_Vispa2Matrix) |
|
757 |
+ import_matrix_arg_names <- import_matrix_arg_names[ |
|
758 |
+ !import_matrix_arg_names %in% c("path", "to_exclude", |
|
759 |
+ "keep_excluded")] |
|
760 |
+ import_matrix_args <- dots_args[names(dots_args) %in% |
|
761 |
+ import_matrix_arg_names] |
|
762 |
+ association_file <- .pre_manage_af( |
|
763 |
+ association_file, |
|
764 |
+ import_af_args, |
|
765 |
+ report_path |
|
766 |
+ ) |
|
725 | 767 |
if (nrow(association_file) == 0) { |
726 | 768 |
rlang::inform(.af_empty_msg()) |
727 | 769 |
return(NULL) |
728 | 770 |
} |
729 | 771 |
## Workflows |
772 |
+ af_tags <- association_file_columns(TRUE) |
|
773 |
+ proj_col <- af_tags %>% |
|
774 |
+ dplyr::filter(.data$tag == "project_id") %>% |
|
775 |
+ dplyr::pull(.data$names) |
|
776 |
+ pool_col <- af_tags %>% |
|
777 |
+ dplyr::filter(.data$tag == "vispa_concatenate") %>% |
|
778 |
+ dplyr::pull(.data$names) |
|
730 | 779 |
### --- Interactive |
731 | 780 |
if (mode == "INTERACTIVE") { |
732 | 781 |
## User selects projects to keep |
733 | 782 |
association_file <- .interactive_select_projects_import( |
734 |
- association_file |
|
783 |
+ association_file, |
|
784 |
+ proj_col = proj_col |
|
735 | 785 |
) |
736 | 786 |
## User selects pools to keep |
737 |
- association_file <- .interactive_select_pools_import(association_file) |
|
787 |
+ association_file <- .interactive_select_pools_import(association_file, |
|
788 |
+ proj_col = proj_col, |
|
789 |
+ pool_col = pool_col |
|
790 |
+ ) |
|
738 | 791 |
## Scan the appropriate file system paths and look for files |
739 | 792 |
files_found <- .lookup_matrices( |
740 | 793 |
association_file, quantification_type, |
741 |
- matrix_type |
|
794 |
+ matrix_type, proj_col, pool_col |
|
742 | 795 |
) |
743 | 796 |
## Manage missing files and duplicates |
744 |
- files_to_import <- .manage_anomalies_interactive(files_found) |
|
797 |
+ files_to_import <- .manage_anomalies_interactive(files_found, |
|
798 |
+ proj_col, |
|
799 |
+ pool_col) |
|
745 | 800 |
} else { |
746 | 801 |
### --- Auto |
747 | 802 |
## In automatic workflow all projects and pools contained |
... | ... |
@@ -757,27 +812,30 @@ import_parallel_Vispa2Matrices <- function(association_file, |
757 | 812 |
stopifnot(is.character(patterns)) |
758 | 813 |
} |
759 | 814 |
### Evaluate matching_opt |
760 |
- matching_option <- match.arg(matching_opt) |
|
815 |
+ matching_option <- rlang::arg_match(matching_opt) |
|
761 | 816 |
stopifnot(is.character(matching_option)) |
762 | 817 |
## Scan the appropriate file system paths and look for files |
763 | 818 |
files_found <- .lookup_matrices_auto( |
764 | 819 |
association_file, quantification_type, |
765 |
- matrix_type, patterns, matching_option |
|
820 |
+ matrix_type, patterns, matching_option, |
|
821 |
+ proj_col, pool_col |
|
766 | 822 |
) |
767 | 823 |
## Manage missing files and duplicates |
768 |
- files_to_import <- .manage_anomalies_auto(files_found) |
|
824 |
+ files_to_import <- .manage_anomalies_auto(files_found, |
|
825 |
+ proj_col, pool_col) |
|
769 | 826 |
} |
770 | 827 |
## If files to import are 0 just terminate |
771 | 828 |
if (nrow(files_to_import) == 0) { |
772 | 829 |
rlang::abort("No files to import") |
773 | 830 |
} |
774 | 831 |
## Import |
775 |
- matrices <- .parallel_import_merge(files_to_import, workers) |
|
776 |
- fimported <- matrices[[2]] |
|
832 |
+ matrices <- .parallel_import_merge(files_to_import, workers, |
|
833 |
+ import_matrix_args) |
|
834 |
+ fimported <- matrices$summary |
|
777 | 835 |
if (nrow(fimported) == 0) { |
778 | 836 |
fimported <- NULL |
779 | 837 |
} |
780 |
- matrices <- matrices[[1]] |
|
838 |
+ matrices <- matrices$matrix |
|
781 | 839 |
if (multi_quant_matrix == TRUE) { |
782 | 840 |
matrices <- rlang::eval_tidy(rlang::call2(comparison_matrix, |
783 | 841 |
x = matrices, |
... | ... |
@@ -795,10 +853,17 @@ import_parallel_Vispa2Matrices <- function(association_file, |
795 | 853 |
} else { |
796 | 854 |
NULL |
797 | 855 |
} |
856 |
+ launch_params <- list() |
|
857 |
+ if (!is.null(patterns)) { |
|
858 |
+ launch_params[["patterns"]] <- patterns |
|
859 |
+ launch_params[["matching_opt"]] <- matching_option |
|
860 |
+ } |
|
798 | 861 |
withCallingHandlers( |
799 | 862 |
{ |
800 | 863 |
.produce_report("matrix_imp", |
801 | 864 |
params = list( |
865 |
+ launch_params = launch_params, |
|
866 |
+ set_vars = list(proj_col = proj_col, pool_col = pool_col), |
|
802 | 867 |
files_found = files_found, |
803 | 868 |
files_imp = fimported, |
804 | 869 |
annot_prob = annotation_problems |
... | ... |
@@ -1043,7 +1108,11 @@ matching_options <- function() { |
1043 | 1108 |
#' @examples |
1044 | 1109 |
#' date_formats() |
1045 | 1110 |
date_formats <- function() { |
1046 |
- c("ymd", "ydm", "mdy", "myd", "dmy", "dym", "yq") |
|
1111 |
+ c( |
|
1112 |
+ "ymd", "ydm", "mdy", "myd", "dmy", "dym", "yq", "ym", "my", |
|
1113 |
+ "ymd_hms", "ymd_hm", "ymd_h", "dmy_hms", "dmy_hm", |
|
1114 |
+ "dmy_h", "mdy_hms", "mdy_hm", "mdy_h", "ydm_hms", "ydm_hm", "ydm_h" |
|
1115 |
+ ) |
|
1047 | 1116 |
} |
1048 | 1117 |
|
1049 | 1118 |
|
... | ... |
@@ -1059,3 +1128,18 @@ date_formats <- function() { |
1059 | 1128 |
default_iss_file_prefixes <- function() { |
1060 | 1129 |
c("stats\\.sequence.", "stats\\.matrix.") |
1061 | 1130 |
} |
1131 |
+ |
|
1132 |
+default_af_transform <- function(convert_tp) { |
|
1133 |
+ if (convert_tp) { |
|
1134 |
+ return(list( |
|
1135 |
+ TimepointMonths = ~ stringr::str_pad( |
|
1136 |
+ as.character(.x), |
|
1137 |
+ pad = "0", side = "left", width = 2 |
|
1138 |
+ ), |
|
1139 |
+ TimepointYears = ~ stringr::str_pad(as.character(.x), |
|
1140 |
+ pad = "0", side = "left", width = 2 |
|
1141 |
+ ) |
|
1142 |
+ )) |
|
1143 |
+ } |
|
1144 |
+ return(NULL) |
|
1145 |
+} |
... | ... |
@@ -3,7 +3,507 @@ |
3 | 3 |
#------------------------------------------------------------------------------# |
4 | 4 |
## All functions in this file are NOT exported, to be used internally only. |
5 | 5 |
|
6 |
+#### ---- Internals for dynamic variables ----#### |
|
7 |
+# Internal for setting mandatory or annotation is vars |
|
8 |
+.new_IS_vars_checks <- function(specs, err) { |
|
9 |
+ new_vars <- if (is.data.frame(specs)) { |
|
10 |
+ # if data frame supplied |
|
11 |
+ colnames_ok <- all(c("names", "types", "transform", "flag", "tag") |
|
12 |
+ %in% colnames(specs)) |
|
13 |
+ col_types_ok <- if (colnames_ok) { |
|
14 |
+ all(purrr::map_lgl( |
|
15 |
+ specs[c("names", "types", "flag", "tag")], |
|
16 |
+ is.character |
|
17 |
+ )) |
|
18 |
+ } else { |
|
19 |
+ FALSE |
|
20 |
+ } |
|
21 |
+ types_ok <- if (colnames_ok & col_types_ok) { |
|
22 |
+ all(specs$types %in% .types_mapping()[["types"]]) |
|
23 |
+ } else { |
|
24 |
+ FALSE |
|
25 |
+ } |
|
26 |
+ flags_ok <- if (colnames_ok & col_types_ok) { |
|
27 |
+ all(specs$flag %in% c("required", "optional")) |
|
28 |
+ } else { |
|
29 |
+ FALSE |
|
30 |
+ } |
|
31 |
+ transform_col_ok <- if (colnames_ok & col_types_ok) { |
|
32 |
+ is.list(specs$transform) && |
|
33 |
+ all(purrr::map_lgl( |
|
34 |
+ specs$transform, |
|
35 |
+ ~ { |
|
36 |
+ is.null(.x) || rlang::is_formula(.x) || is.function(.x) |
|
37 |
+ } |
|
38 |
+ )) |
|
39 |
+ } else { |
|
40 |
+ FALSE |
|
41 |
+ } |
|
42 |
+ if (!(colnames_ok & col_types_ok & types_ok & transform_col_ok & |
|
43 |
+ flags_ok)) { |
|
44 |
+ rlang::abort(err) |
|
45 |
+ } |
|
46 |
+ specs |
|
47 |
+ } else { |
|
48 |
+ # if vector supplied |
|
49 |
+ if (is.null(names(specs)) || !all(specs %in% .types_mapping()[["types"]])) { |
|
50 |
+ rlang::abort(err) |
|
51 |
+ } |
|
52 |
+ purrr::map2_dfr( |
|
53 |
+ names(specs), specs, |
|
54 |
+ ~ tibble::tibble_row( |
|
55 |
+ names = .x, |
|
56 |
+ types = .y, |
|
57 |
+ transform = list(NULL), |
|
58 |
+ flag = "required", |
|
59 |
+ tag = NA_character_ |
|
60 |
+ ) |
|
61 |
+ ) |
|
62 |
+ } |
|
63 |
+ new_vars |
|
64 |
+} |
|
65 |
+ |
|
66 |
+# Applies transformations on columns as specified in variables specs |
|
67 |
+# expects specs to be in data frame format |
|
68 |
+.apply_col_transform <- function(df, specs) { |
|
69 |
+ # Extract and associate names and transf |
|
70 |
+ non_null <- purrr::pmap(specs, function(names, types, |
|
71 |
+ transform, flags, tag) { |
|
72 |
+ if (is.null(transform)) { |
|
73 |
+ return(NULL) |
|
74 |
+ } |
|
75 |
+ return(transform) |
|
76 |
+ }) %>% |
|
77 |
+ purrr::set_names(specs$names) |
|
78 |
+ # Retain only non-null transform |
|
79 |
+ non_null <- non_null[purrr::map_lgl(non_null, ~ !is.null(.x))] |
|
80 |
+ if (length(non_null) > 0) { |
|
81 |
+ # if there are transf to apply |
|
82 |
+ apply_transform <- function(col, col_name) { |
|
83 |
+ if (!col_name %in% names(non_null)) { |
|
84 |
+ return(col) |
|
85 |
+ } |
|
86 |
+ transformation <- non_null[[col_name]] |
|
87 |
+ t <- if (rlang::is_formula(transformation)) { |
|
88 |
+ unlist(purrr::map(col, transformation)) |
|
89 |
+ } else { |
|
90 |
+ # if it is a function |
|
91 |
+ do.call(what = transformation, args = list(col)) |
|
92 |
+ } |
|
93 |
+ } |
|
94 |
+ df <- purrr::map2_dfc(df, colnames(df), apply_transform) |
|
95 |
+ } |
|
96 |
+ df |
|
97 |
+} |
|
98 |
+ |
|
99 |
+# Internal to quickly convert columns marked as dates with lubridate |
|
100 |
+.convert_dates <- function(date_vec, format) { |
|
101 |
+ if (format %in% date_formats()) { |
|
102 |
+ parsed <- lubridate::parse_date_time(date_vec, format) |
|
103 |
+ if (format %in% c( |
|
104 |
+ "ymd_hms", "ymd_hm", "ymd_h", "dmy_hms", "dmy_hm", |
|
105 |
+ "dmy_h", "mdy_hms", "mdy_hm", "mdy_h", |
|
106 |
+ "ydm_hms", "ydm_hm", "ydm_h" |
|
107 |
+ )) { |
|
108 |
+ return(parsed) |
|
109 |
+ } else { |
|
110 |
+ return(lubridate::as_date(parsed)) |
|
111 |
+ } |
|
112 |
+ } |
|
113 |
+ if (format == "date") { |
|
114 |
+ # Guesses the format |
|
115 |
+ return(lubridate::as_date(as.character(date_vec))) |
|
116 |
+ } |
|
117 |
+} |