... | ... |
@@ -443,12 +443,18 @@ filter_and_extract <- function( |
443 | 443 |
g <- GenomicRanges::makeGRangesFromDataFrame( |
444 | 444 |
complete_df, |
445 | 445 |
keep.extra.columns = TRUE, |
446 |
+ seqnames.field = c("seqnames", "seqname", |
|
447 |
+ "chromosome", "chrom", |
|
448 |
+ "chr", "chromosome_name"), |
|
446 | 449 |
start.field = "left", |
447 | 450 |
end.field = "right") |
448 | 451 |
} else { |
449 | 452 |
g <- GenomicRanges::makeGRangesFromDataFrame( |
450 | 453 |
df, |
451 | 454 |
keep.extra.columns = TRUE, |
455 |
+ seqnames.field = c("seqnames", "seqname", |
|
456 |
+ "chromosome", "chrom", |
|
457 |
+ "chr", "chromosome_name"), |
|
452 | 458 |
start.field = "left", |
453 | 459 |
end.field = "right") |
454 | 460 |
} |
... | ... |
@@ -232,10 +232,6 @@ filter_and_extract <- function( |
232 | 232 |
else |
233 | 233 |
all_values[!all_values %in% except_values] |
234 | 234 |
names(regions) <- NULL |
235 |
- # since import convert this value from GMQL schema to GTF format |
|
236 |
- # we need to convert it back |
|
237 |
- regions <- replace(regions, regions == "feature", "type") |
|
238 |
- regions <- replace(regions, regions == "frame", "phase") |
|
239 | 235 |
} |
240 | 236 |
|
241 | 237 |
elementMetadata(g1) <- NULL |
... | ... |
@@ -318,7 +314,7 @@ filter_and_extract <- function( |
318 | 314 |
}) |
319 | 315 |
## we would like that manage more index from grep |
320 | 316 |
found <- as.logical(length(unlist(a))) |
321 |
- # if found retrieve samples that has at least one choosen metadata |
|
317 |
+ # if found retrieve samples that has at least one chosen metadata |
|
322 | 318 |
if (found) { |
323 | 319 |
x |
324 | 320 |
} |
... | ... |
@@ -331,24 +327,15 @@ filter_and_extract <- function( |
331 | 327 |
suffixes, |
332 | 328 |
vector_field |
333 | 329 |
) { |
334 |
- g1 <- tryCatch( |
|
335 |
- expr = { |
|
336 |
- rtracklayer::import(con = gtf_region_files[1], format = "gtf") |
|
337 |
- }, |
|
338 |
- error = function(e){ |
|
339 |
- rtracklayer::import( |
|
340 |
- gtf_region_files[1], |
|
341 |
- format = "gff", |
|
342 |
- version = "3") |
|
343 |
- }, |
|
344 |
- warning = function(w){ |
|
345 |
- rtracklayer::import( |
|
346 |
- gtf_region_files[1], |
|
347 |
- format = "gff", |
|
348 |
- version = "3") |
|
349 |
- } |
|
330 |
+ attr_col_names <- vector_field[ |
|
331 |
+ !vector_field %in% c("seqname", "seqid", "start", "end", "strand")] |
|
332 |
+ |
|
333 |
+ g1 <- rtracklayer::import( |
|
334 |
+ con = gtf_region_files[1], |
|
335 |
+ format = "gtf", |
|
336 |
+ colnames = attr_col_names |
|
350 | 337 |
) |
351 |
- |
|
338 |
+ |
|
352 | 339 |
elementMetadata(g1) <- NULL |
353 | 340 |
if (is.null(suffixes)) { |
354 | 341 |
suffixes <- "" |
... | ... |
@@ -377,16 +364,10 @@ filter_and_extract <- function( |
377 | 364 |
|
378 | 365 |
if (!is.null(regions)) { |
379 | 366 |
DF_list <- mapply(function(x, header) { |
380 |
- g_x <- tryCatch( |
|
381 |
- expr = { |
|
382 |
- rtracklayer::import(x, format = "gtf") |
|
383 |
- }, |
|
384 |
- error = function(e){ |
|
385 |
- rtracklayer::import(x, format = "gff", version = "3") |
|
386 |
- }, |
|
387 |
- warning = function(w){ |
|
388 |
- rtracklayer::import(x, format = "gff", version = "3") |
|
389 |
- } |
|
367 |
+ g_x <- rtracklayer::import( |
|
368 |
+ x, |
|
369 |
+ format = "gtf", |
|
370 |
+ colnames = attr_col_names |
|
390 | 371 |
) |
391 | 372 |
meta <- elementMetadata(g_x)[regions] |
392 | 373 |
if (header != "") { |
... | ... |
@@ -331,7 +331,24 @@ filter_and_extract <- function( |
331 | 331 |
suffixes, |
332 | 332 |
vector_field |
333 | 333 |
) { |
334 |
- g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf") |
|
334 |
+ g1 <- tryCatch( |
|
335 |
+ expr = { |
|
336 |
+ rtracklayer::import(con = gtf_region_files[1], format = "gtf") |
|
337 |
+ }, |
|
338 |
+ error = function(e){ |
|
339 |
+ rtracklayer::import( |
|
340 |
+ gtf_region_files[1], |
|
341 |
+ format = "gff", |
|
342 |
+ version = "3") |
|
343 |
+ }, |
|
344 |
+ warning = function(w){ |
|
345 |
+ rtracklayer::import( |
|
346 |
+ gtf_region_files[1], |
|
347 |
+ format = "gff", |
|
348 |
+ version = "3") |
|
349 |
+ } |
|
350 |
+ ) |
|
351 |
+ |
|
335 | 352 |
elementMetadata(g1) <- NULL |
336 | 353 |
if (is.null(suffixes)) { |
337 | 354 |
suffixes <- "" |
... | ... |
@@ -360,7 +377,17 @@ filter_and_extract <- function( |
360 | 377 |
|
361 | 378 |
if (!is.null(regions)) { |
362 | 379 |
DF_list <- mapply(function(x, header) { |
363 |
- g_x <- rtracklayer::import(con = x, format = "gtf") |
|
380 |
+ g_x <- tryCatch( |
|
381 |
+ expr = { |
|
382 |
+ rtracklayer::import(x, format = "gtf") |
|
383 |
+ }, |
|
384 |
+ error = function(e){ |
|
385 |
+ rtracklayer::import(x, format = "gff", version = "3") |
|
386 |
+ }, |
|
387 |
+ warning = function(w){ |
|
388 |
+ rtracklayer::import(x, format = "gff", version = "3") |
|
389 |
+ } |
|
390 |
+ ) |
|
364 | 391 |
meta <- elementMetadata(g_x)[regions] |
365 | 392 |
if (header != "") { |
366 | 393 |
names(meta) <- paste(regions, header, sep = ".") |
... | ... |
@@ -72,12 +72,12 @@ |
72 | 72 |
#' filter_and_extract(sorted_grl_full, region_attributes = FULL()) |
73 | 73 |
#' |
74 | 74 |
#' ## This statement imports a GMQL dataset as GRangesList and filters it |
75 |
-#' ## including all the region attributes except "jaccard" and "score" |
|
75 |
+#' ## including all the region attributes except "jaccard" |
|
76 | 76 |
#' |
77 | 77 |
#' sorted_grl_full_except <- sort(grl) |
78 | 78 |
#' filter_and_extract( |
79 | 79 |
#' sorted_grl_full_except, |
80 |
-#' region_attributes = FULL("jaccard", "score") |
|
80 |
+#' region_attributes = FULL("jaccard") |
|
81 | 81 |
#' ) |
82 | 82 |
#' |
83 | 83 |
#' @export |
... | ... |
@@ -83,376 +83,365 @@ |
83 | 83 |
#' @export |
84 | 84 |
#' |
85 | 85 |
filter_and_extract <- function( |
86 |
- data, |
|
87 |
- metadata = NULL, |
|
88 |
- metadata_prefix = NULL, |
|
89 |
- region_attributes = NULL, |
|
90 |
- suffix = "antibody_target" |
|
86 |
+ data, |
|
87 |
+ metadata = NULL, |
|
88 |
+ metadata_prefix = NULL, |
|
89 |
+ region_attributes = NULL, |
|
90 |
+ suffix = "antibody_target" |
|
91 | 91 |
) { |
92 |
- |
|
93 |
- if (is(data, "GRangesList")) { |
|
94 |
- .extract_from_GRangesList( |
|
95 |
- data, |
|
96 |
- metadata, |
|
97 |
- metadata_prefix, |
|
98 |
- region_attributes, |
|
99 |
- suffix |
|
100 |
- ) |
|
101 |
- } else { |
|
102 |
- .extract_from_dataset( |
|
103 |
- data, |
|
104 |
- metadata, |
|
105 |
- metadata_prefix, |
|
106 |
- region_attributes, suffix |
|
107 |
- ) |
|
108 |
- } |
|
92 |
+ if (is(data, "GRangesList")) { |
|
93 |
+ .extract_from_GRangesList( |
|
94 |
+ data, |
|
95 |
+ metadata, |
|
96 |
+ metadata_prefix, |
|
97 |
+ region_attributes, |
|
98 |
+ suffix) |
|
99 |
+ } else { |
|
100 |
+ .extract_from_dataset( |
|
101 |
+ data, |
|
102 |
+ metadata, |
|
103 |
+ metadata_prefix, |
|
104 |
+ region_attributes, suffix) |
|
105 |
+ } |
|
109 | 106 |
} |
110 | 107 |
|
111 | 108 |
.extract_from_dataset <- function( |
112 |
- datasetName, |
|
113 |
- metadata, |
|
114 |
- metadata_prefix, |
|
115 |
- regions, |
|
116 |
- suffix |
|
117 |
-) { |
|
118 |
- datasetName <- sub("/*[/]$", "", datasetName) |
|
119 |
- if (basename(datasetName) != "files") { |
|
120 |
- datasetName <- file.path(datasetName, "files") |
|
121 |
- } |
|
122 |
- |
|
123 |
- if (!dir.exists(datasetName)) { |
|
124 |
- stop("Directory does not exists") |
|
125 |
- } |
|
126 |
- |
|
127 |
- gdm_meta_files <- list.files( |
|
128 |
- datasetName, |
|
129 |
- pattern = "*.gdm.meta$", |
|
130 |
- full.names = TRUE |
|
131 |
- ) |
|
132 |
- |
|
133 |
- gtf_meta_files <- list.files( |
|
134 | 109 |
datasetName, |
135 |
- pattern = "*.gtf.meta$", |
|
136 |
- full.names = TRUE |
|
137 |
- ) |
|
138 |
- |
|
139 |
- if (!length(gdm_meta_files) && !length(gtf_meta_files)) { |
|
140 |
- stop("no samples present or no files format supported") |
|
141 |
- } |
|
142 |
- |
|
143 |
- if (length(gdm_meta_files) && length(gtf_meta_files)) { |
|
144 |
- stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together") |
|
145 |
- } |
|
146 |
- |
|
147 |
- vector_field <- .schema_header(datasetName) |
|
148 |
- |
|
149 |
- if (length(gdm_meta_files)) { |
|
150 |
- samples_file <- .check_metadata_files( |
|
151 |
- metadata, metadata_prefix, |
|
152 |
- gdm_meta_files |
|
153 |
- ) |
|
154 |
- |
|
155 |
- samples_meta_to_read <- unlist(samples_file) |
|
110 |
+ metadata, |
|
111 |
+ metadata_prefix, |
|
112 |
+ regions, |
|
113 |
+ suffix |
|
114 |
+) { |
|
115 |
+ datasetName <- sub("/*[/]$", "", datasetName) |
|
116 |
+ if (basename(datasetName) != "files") { |
|
117 |
+ datasetName <- file.path(datasetName, "files") |
|
118 |
+ } |
|
156 | 119 |
|
157 |
- if (length(samples_meta_to_read)) { |
|
158 |
- samples_to_read <- gsub(".meta$", "", samples_meta_to_read) |
|
159 |
- } else { |
|
160 |
- samples_to_read <- gsub(".meta$", "", gdm_meta_files) |
|
161 |
- samples_meta_to_read <- gtf_meta_files |
|
120 |
+ if (!dir.exists(datasetName)) { |
|
121 |
+ stop("Directory does not exists") |
|
162 | 122 |
} |
163 | 123 |
|
164 |
- suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read) |
|
165 |
- granges <- .parse_gdm_files( |
|
166 |
- vector_field, |
|
167 |
- samples_to_read, |
|
168 |
- regions, |
|
169 |
- suffix_vec |
|
124 |
+ gdm_meta_files <- list.files( |
|
125 |
+ datasetName, |
|
126 |
+ pattern = "*.gdm.meta$", |
|
127 |
+ full.names = TRUE |
|
170 | 128 |
) |
171 | 129 |
|
172 |
- } else { |
|
173 |
- samples_file <- .check_metadata_files( |
|
174 |
- metadata, |
|
175 |
- metadata_prefix, |
|
176 |
- gtf_meta_files |
|
130 |
+ gtf_meta_files <- list.files( |
|
131 |
+ datasetName, |
|
132 |
+ pattern = "*.gtf.meta$", |
|
133 |
+ full.names = TRUE |
|
177 | 134 |
) |
178 |
- samples_meta_to_read <- unlist(samples_file) |
|
179 | 135 |
|
180 |
- if (length(samples_meta_to_read)) { |
|
181 |
- samples_to_read <- gsub(".meta$", "", samples_meta_to_read) |
|
182 |
- } else { |
|
183 |
- samples_to_read <- gsub(".meta$", "", gtf_meta_files) |
|
184 |
- samples_meta_to_read <- gtf_meta_files |
|
136 |
+ if (!length(gdm_meta_files) && !length(gtf_meta_files)) { |
|
137 |
+ stop("no samples present or no files format supported") |
|
185 | 138 |
} |
186 | 139 |
|
187 |
- suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read) |
|
188 |
- granges <- .parse_gtf_files( |
|
189 |
- samples_to_read, |
|
190 |
- regions, |
|
191 |
- suffix_vec, |
|
192 |
- vector_field |
|
193 |
- ) |
|
194 |
- } |
|
140 |
+ if (length(gdm_meta_files) && length(gtf_meta_files)) { |
|
141 |
+ stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together") |
|
142 |
+ } |
|
143 |
+ |
|
144 |
+ vector_field <- .schema_header(datasetName) |
|
145 |
+ |
|
146 |
+ if (length(gdm_meta_files)) { |
|
147 |
+ samples_file <- .check_metadata_files( |
|
148 |
+ metadata, metadata_prefix, |
|
149 |
+ gdm_meta_files |
|
150 |
+ ) |
|
151 |
+ |
|
152 |
+ samples_meta_to_read <- unlist(samples_file) |
|
153 |
+ |
|
154 |
+ if (length(samples_meta_to_read)) { |
|
155 |
+ samples_to_read <- gsub(".meta$", "", samples_meta_to_read) |
|
156 |
+ } else { |
|
157 |
+ samples_to_read <- gsub(".meta$", "", gdm_meta_files) |
|
158 |
+ samples_meta_to_read <- gtf_meta_files |
|
159 |
+ } |
|
160 |
+ |
|
161 |
+ suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read) |
|
162 |
+ granges <- .parse_gdm_files( |
|
163 |
+ vector_field, |
|
164 |
+ samples_to_read, |
|
165 |
+ regions, |
|
166 |
+ suffix_vec) |
|
167 |
+ } else { |
|
168 |
+ samples_file <- .check_metadata_files( |
|
169 |
+ metadata, |
|
170 |
+ metadata_prefix, |
|
171 |
+ gtf_meta_files |
|
172 |
+ ) |
|
173 |
+ samples_meta_to_read <- unlist(samples_file) |
|
174 |
+ |
|
175 |
+ if (length(samples_meta_to_read)) { |
|
176 |
+ samples_to_read <- gsub(".meta$", "", samples_meta_to_read) |
|
177 |
+ } else { |
|
178 |
+ samples_to_read <- gsub(".meta$", "", gtf_meta_files) |
|
179 |
+ samples_meta_to_read <- gtf_meta_files |
|
180 |
+ } |
|
181 |
+ |
|
182 |
+ suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read) |
|
183 |
+ granges <- .parse_gtf_files( |
|
184 |
+ samples_to_read, |
|
185 |
+ regions, |
|
186 |
+ suffix_vec, |
|
187 |
+ vector_field |
|
188 |
+ ) |
|
189 |
+ } |
|
195 | 190 |
} |
196 | 191 |
|
197 | 192 |
.extract_from_GRangesList <- function( |
198 |
- rangesList, |
|
199 |
- metadata, |
|
200 |
- metadata_prefix, |
|
201 |
- regions, |
|
202 |
- suffix |
|
193 |
+ rangesList, |
|
194 |
+ metadata, |
|
195 |
+ metadata_prefix, |
|
196 |
+ regions, |
|
197 |
+ suffix |
|
203 | 198 |
) { |
204 |
- if (!is(rangesList, "GRangesList")) { |
|
205 |
- stop("only GrangesList admitted") |
|
206 |
- } |
|
207 |
- |
|
208 |
- if (!length(rangesList)) { |
|
209 |
- stop("rangesList empty") |
|
210 |
- } |
|
211 |
- |
|
212 |
- meta_list <- metadata(rangesList) |
|
213 |
- samples <- .check_metadata_list(metadata, metadata_prefix, meta_list) |
|
214 |
- if (!length(unlist(samples))) { |
|
215 |
- samples <- rangesList |
|
216 |
- } else { |
|
217 |
- index <- unlist(samples) |
|
218 |
- samples <- rangesList[c(index)] |
|
219 |
- } |
|
220 |
- new_meta_list <- metadata(samples) |
|
221 |
- suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list) |
|
222 |
- granges <- .parse_Granges(samples, regions, suffix_vec) |
|
199 |
+ if (!is(rangesList, "GRangesList")) { |
|
200 |
+ stop("only GrangesList admitted") |
|
201 |
+ } |
|
202 |
+ |
|
203 |
+ if (!length(rangesList)) { |
|
204 |
+ stop("rangesList empty") |
|
205 |
+ } |
|
206 |
+ |
|
207 |
+ meta_list <- metadata(rangesList) |
|
208 |
+ samples <- .check_metadata_list(metadata, metadata_prefix, meta_list) |
|
209 |
+ if (!length(unlist(samples))) { |
|
210 |
+ samples <- rangesList |
|
211 |
+ } else { |
|
212 |
+ index <- unlist(samples) |
|
213 |
+ samples <- rangesList[c(index)] |
|
214 |
+ } |
|
215 |
+ new_meta_list <- metadata(samples) |
|
216 |
+ suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list) |
|
217 |
+ granges <- .parse_Granges(samples, regions, suffix_vec) |
|
223 | 218 |
} |
224 | 219 |
|
225 | 220 |
.parse_Granges <- function(region_list, regions, suffixes) { |
226 |
- if (is.null(suffixes)) { |
|
227 |
- suffixes <- "" |
|
228 |
- } |
|
229 |
- |
|
230 |
- g1 <- region_list[[1]] |
|
231 |
- |
|
232 |
- if(is.object(regions) && ("FULL" %in% class(regions))) { |
|
233 |
- all_values <- names(elementMetadata(g1)) |
|
234 |
- except_values <- regions$values |
|
235 |
- regions <- if (is.null(except_values)) |
|
236 |
- all_values |
|
237 |
- else |
|
238 |
- all_values[!all_values %in% except_values] |
|
239 |
- names(regions) <- NULL |
|
240 |
- # since import convert this value from GMQL schema to GTF format |
|
241 |
- # we need to convert it back |
|
242 |
- regions <- replace(regions, regions == "feature", "type") |
|
243 |
- regions <- replace(regions, regions == "frame", "phase") |
|
244 |
- } |
|
245 |
- |
|
246 |
- elementMetadata(g1) <- NULL |
|
247 |
- if (!is.null(regions)) { |
|
248 |
- DF_list <- mapply(function(g_x, h) { |
|
249 |
- meta <- elementMetadata(g_x)[regions] |
|
250 |
- if (h != "") { |
|
251 |
- names(meta) <- paste(regions, h, sep = ".") |
|
252 |
- } |
|
253 |
- data.frame(meta) |
|
254 |
- }, region_list, suffixes, SIMPLIFY = FALSE) |
|
255 |
- DF_only_regions <- dplyr::bind_cols(DF_list) |
|
256 |
- elementMetadata(g1) <- DF_only_regions |
|
257 |
- } |
|
258 |
- g1 |
|
221 |
+ if (is.null(suffixes)) { |
|
222 |
+ suffixes <- "" |
|
223 |
+ } |
|
224 |
+ |
|
225 |
+ g1 <- region_list[[1]] |
|
226 |
+ |
|
227 |
+ if(is.object(regions) && ("FULL" %in% class(regions))) { |
|
228 |
+ all_values <- names(elementMetadata(g1)) |
|
229 |
+ except_values <- regions$values |
|
230 |
+ regions <- if (is.null(except_values)) |
|
231 |
+ all_values |
|
232 |
+ else |
|
233 |
+ all_values[!all_values %in% except_values] |
|
234 |
+ names(regions) <- NULL |
|
235 |
+ # since import convert this value from GMQL schema to GTF format |
|
236 |
+ # we need to convert it back |
|
237 |
+ regions <- replace(regions, regions == "feature", "type") |
|
238 |
+ regions <- replace(regions, regions == "frame", "phase") |
|
239 |
+ } |
|
240 |
+ |
|
241 |
+ elementMetadata(g1) <- NULL |
|
242 |
+ if (!is.null(regions)) { |
|
243 |
+ DF_list <- mapply(function(g_x, h) { |
|
244 |
+ meta <- elementMetadata(g_x)[regions] |
|
245 |
+ if (h != "") { |
|
246 |
+ names(meta) <- paste(regions, h, sep = ".") |
|
247 |
+ } |
|
248 |
+ data.frame(meta) |
|
249 |
+ }, region_list, suffixes, SIMPLIFY = FALSE) |
|
250 |
+ DF_only_regions <- dplyr::bind_cols(DF_list) |
|
251 |
+ elementMetadata(g1) <- DF_only_regions |
|
252 |
+ } |
|
253 |
+ g1 |
|
259 | 254 |
} |
260 | 255 |
|
261 | 256 |
.get_suffix <- function(col_name, from_list, meta_fl) { |
262 |
- suffix <- paste0(col_name, "$") |
|
263 |
- |
|
264 |
- if (from_list) { |
|
265 |
- meta_list <- mapply(function(x, index) { |
|
266 |
- vec_names <- names(x) |
|
267 |
- s_index <- grep(suffix, vec_names) |
|
268 |
- first_index <- s_index[1] |
|
269 |
- suffix <- unlist(x[first_index]) # ne prendo solo uno |
|
270 |
- names(suffix) <- NULL |
|
271 |
- |
|
272 |
- # if found retrieve samples that has at least one choosen metadata |
|
273 |
- if (first_index && !is.na(first_index)) { |
|
274 |
- suffix |
|
275 |
- } else { |
|
276 |
- "" |
|
277 |
- } |
|
278 |
- }, meta_fl, seq_along(meta_fl)) |
|
279 |
- } |
|
280 |
- else { |
|
281 |
- meta_list <- vapply(meta_fl, function(x) { |
|
282 |
- list <- .add_metadata(x) |
|
283 |
- vec_names <- names(list) |
|
284 |
- index <- grep(suffix, vec_names) |
|
285 |
- first_index <- index[1] |
|
286 |
- suffix <- unlist(list[first_index]) # ne prendo solo uno |
|
287 |
- names(suffix) <- NULL |
|
288 |
- # if found retrieve samples that has at least one choosen metadata |
|
289 |
- if (first_index && !is.na(first_index)) { |
|
290 |
- suffix |
|
291 |
- } else { |
|
292 |
- "" |
|
293 |
- } |
|
294 |
- }, character(1)) |
|
295 |
- } |
|
296 |
- names(meta_list) <- NULL |
|
297 |
- meta_list |
|
257 |
+ suffix <- paste0(col_name, "$") |
|
258 |
+ |
|
259 |
+ if (from_list) { |
|
260 |
+ meta_list <- mapply(function(x, index) { |
|
261 |
+ vec_names <- names(x) |
|
262 |
+ s_index <- grep(suffix, vec_names) |
|
263 |
+ first_index <- s_index[1] |
|
264 |
+ suffix <- unlist(x[first_index]) # ne prendo solo uno |
|
265 |
+ names(suffix) <- NULL |
|
266 |
+ |
|
267 |
+ # if found retrieve samples that has at least one choosen metadata |
|
268 |
+ if (first_index && !is.na(first_index)) { |
|
269 |
+ suffix |
|
270 |
+ } else { |
|
271 |
+ "" |
|
272 |
+ } |
|
273 |
+ }, meta_fl, seq_along(meta_fl)) |
|
274 |
+ } |
|
275 |
+ else { |
|
276 |
+ meta_list <- vapply(meta_fl, function(x) { |
|
277 |
+ list <- .add_metadata(x) |
|
278 |
+ vec_names <- names(list) |
|
279 |
+ index <- grep(suffix, vec_names) |
|
280 |
+ first_index <- index[1] |
|
281 |
+ suffix <- unlist(list[first_index]) # ne prendo solo uno |
|
282 |
+ names(suffix) <- NULL |
|
283 |
+ # if found retrieve samples that has at least one choosen metadata |
|
284 |
+ if (first_index && !is.na(first_index)) { |
|
285 |
+ suffix |
|
286 |
+ } else { |
|
287 |
+ "" |
|
288 |
+ } |
|
289 |
+ }, character(1)) |
|
290 |
+ } |
|
291 |
+ names(meta_list) <- NULL |
|
292 |
+ meta_list |
|
298 | 293 |
} |
299 | 294 |
|
300 | 295 |
.check_metadata_list <- function(metadata, metadata_prefix, meta_list) { |
301 |
- vec_meta <- paste0(metadata_prefix, metadata) |
|
302 |
- list <- mapply(function(x, index) { |
|
303 |
- vec_names <- names(x) |
|
304 |
- a <- lapply(vec_meta, function(y) { |
|
305 |
- which(y == vec_names) |
|
306 |
- }) |
|
307 |
- ## we would like that manage more index from grep |
|
308 |
- found <- as.logical(length(unlist(a))) |
|
309 |
- # if found retrieve samples that has at least one choosen metadata |
|
310 |
- if (found) { |
|
311 |
- index |
|
312 |
- } |
|
313 |
- }, meta_list, seq_along(meta_list)) |
|
296 |
+ vec_meta <- paste0(metadata_prefix, metadata) |
|
297 |
+ list <- mapply(function(x, index) { |
|
298 |
+ vec_names <- names(x) |
|
299 |
+ a <- lapply(vec_meta, function(y) { |
|
300 |
+ which(y == vec_names) |
|
301 |
+ }) |
|
302 |
+ ## we would like that manage more index from grep |
|
303 |
+ found <- as.logical(length(unlist(a))) |
|
304 |
+ # if found retrieve samples that has at least one choosen metadata |
|
305 |
+ if (found) { |
|
306 |
+ index |
|
307 |
+ } |
|
308 |
+ }, meta_list, seq_along(meta_list)) |
|
314 | 309 |
} |
315 | 310 |
|
316 | 311 |
.check_metadata_files <- function(metadata, metadata_prefix, meta_files) { |
317 |
- vec_meta <- paste0(metadata_prefix, metadata) |
|
318 |
- meta_list <- lapply(meta_files, function(x) { |
|
319 |
- list <- .add_metadata(x) |
|
320 |
- vec_names <- names(list) |
|
321 |
- a <- lapply(vec_meta, function(y) { |
|
322 |
- grep(y, vec_names) |
|
312 |
+ vec_meta <- paste0(metadata_prefix, metadata) |
|
313 |
+ meta_list <- lapply(meta_files, function(x) { |
|
314 |
+ list <- .add_metadata(x) |
|
315 |
+ vec_names <- names(list) |
|
316 |
+ a <- lapply(vec_meta, function(y) { |
|
317 |
+ grep(y, vec_names) |
|
318 |
+ }) |
|
319 |
+ ## we would like that manage more index from grep |
|
320 |
+ found <- as.logical(length(unlist(a))) |
|
321 |
+ # if found retrieve samples that has at least one choosen metadata |
|
322 |
+ if (found) { |
|
323 |
+ x |
|
324 |
+ } |
|
323 | 325 |
}) |
324 |
- ## we would like that manage more index from grep |
|
325 |
- found <- as.logical(length(unlist(a))) |
|
326 |
- # if found retrieve samples that has at least one choosen metadata |
|
327 |
- if (found) { |
|
328 |
- x |
|
329 |
- } |
|
330 |
- }) |
|
331 | 326 |
} |
332 | 327 |
|
333 | 328 |
.parse_gtf_files <- function( |
334 |
- gtf_region_files, |
|
335 |
- regions, |
|
336 |
- suffixes, |
|
337 |
- vector_field |
|
329 |
+ gtf_region_files, |
|
330 |
+ regions, |
|
331 |
+ suffixes, |
|
332 |
+ vector_field |
|
338 | 333 |
) { |
339 |
- g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf") |
|
340 |
- elementMetadata(g1) <- NULL |
|
341 |
- if (is.null(suffixes)) { |
|
342 |
- suffixes <- "" |
|
343 |
- } |
|
344 |
- |
|
345 |
- # check if we used a FULL parameter instead of char array containing |
|
346 |
- # the region parameters |
|
347 |
- if(is.object(regions) && ("FULL" %in% class(regions))) { |
|
348 |
- all_values <- vector_field[!vector_field %in% c( |
|
349 |
- "seqname", |
|
350 |
- "strand", |
|
351 |
- "start", |
|
352 |
- "end" |
|
353 |
- ) |
|
354 |
- ] |
|
355 |
- except_values <- regions$values |
|
356 |
- regions <- if (is.null(except_values)) |
|
357 |
- all_values |
|
358 |
- else |
|
359 |
- all_values[!all_values %in% except_values] |
|
360 |
- names(regions) <- NULL |
|
361 |
- # since import convert this value from GMQL schema to GTF format |
|
362 |
- # we need to convert it back |
|
363 |
- regions <- replace(regions, regions == "feature", "type") |
|
364 |
- regions <- replace(regions, regions == "frame", "phase") |
|
365 |
- } |
|
366 |
- |
|
367 |
- if (!is.null(regions)) { |
|
368 |
- DF_list <- mapply(function(x, header) { |
|
369 |
- g_x <- rtracklayer::import(con = x, format = "gtf") |
|
370 |
- meta <- elementMetadata(g_x)[regions] |
|
371 |
- if (header != "") { |
|
372 |
- names(meta) <- paste(regions, header, sep = ".") |
|
373 |
- } |
|
374 |
- data.frame(meta) |
|
375 |
- }, gtf_region_files, suffixes, SIMPLIFY = FALSE) |
|
376 |
- DF_only_regions <- dplyr::bind_cols(DF_list) |
|
377 |
- elementMetadata(g1) <- DF_only_regions |
|
378 |
- } |
|
379 |
- g1 |
|
334 |
+ g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf") |
|
335 |
+ elementMetadata(g1) <- NULL |
|
336 |
+ if (is.null(suffixes)) { |
|
337 |
+ suffixes <- "" |
|
338 |
+ } |
|
339 |
+ |
|
340 |
+ # check if we used a FULL parameter instead of char array containing |
|
341 |
+ # the region parameters |
|
342 |
+ if(is.object(regions) && ("FULL" %in% class(regions))) { |
|
343 |
+ all_values <- vector_field[!vector_field %in% c( |
|
344 |
+ "seqname", |
|
345 |
+ "strand", |
|
346 |
+ "start", |
|
347 |
+ "end") |
|
348 |
+ ] |
|
349 |
+ except_values <- regions$values |
|
350 |
+ regions <- if (is.null(except_values)) |
|
351 |
+ all_values |
|
352 |
+ else |
|
353 |
+ all_values[!all_values %in% except_values] |
|
354 |
+ names(regions) <- NULL |
|
355 |
+ # since import convert this value from GMQL schema to GTF format |
|
356 |
+ # we need to convert it back |
|
357 |
+ regions <- replace(regions, regions == "feature", "type") |
|
358 |
+ regions <- replace(regions, regions == "frame", "phase") |
|
359 |
+ } |
|
360 |
+ |
|
361 |
+ if (!is.null(regions)) { |
|
362 |
+ DF_list <- mapply(function(x, header) { |
|
363 |
+ g_x <- rtracklayer::import(con = x, format = "gtf") |
|
364 |
+ meta <- elementMetadata(g_x)[regions] |
|
365 |
+ if (header != "") { |
|
366 |
+ names(meta) <- paste(regions, header, sep = ".") |
|
367 |
+ } |
|
368 |
+ data.frame(meta) |
|
369 |
+ }, gtf_region_files, suffixes, SIMPLIFY = FALSE) |
|
370 |
+ DF_only_regions <- dplyr::bind_cols(DF_list) |
|
371 |
+ elementMetadata(g1) <- DF_only_regions |
|
372 |
+ } |
|
373 |
+ g1 |
|
380 | 374 |
} |
381 | 375 |
|
382 | 376 |
.parse_gdm_files <- function( |
383 |
- vector_field, |
|
384 |
- gdm_region_files, |
|
385 |
- regions, |
|
386 |
- suffixes |
|
377 |
+ vector_field, |
|
378 |
+ gdm_region_files, |
|
379 |
+ regions, |
|
380 |
+ suffixes |
|
387 | 381 |
) { |
388 |
- # read first sample cause chromosome regions are the same for all samples |
|
389 |
- df <- data.table::fread( |
|
390 |
- gdm_region_files[1], |
|
391 |
- col.names = vector_field, |
|
392 |
- header = FALSE, |
|
393 |
- sep = "\t" |
|
394 |
- ) |
|
395 |
- col_names <- names(df) |
|
396 |
- df <- subset(df, TRUE, c("chr", "left", "right", "strand")) |
|
397 |
- |
|
398 |
- # check if we used a FULL parameter instead of char array containing |
|
399 |
- # the region parameters |
|
400 |
- if(is.object(regions) && ("FULL" %in% class(regions))) { |
|
401 |
- all_values <- vector_field[!vector_field %in% c( |
|
402 |
- "chr", |
|
403 |
- "left", |
|
404 |
- "right", |
|
405 |
- "strand" |
|
406 |
- ) |
|
407 |
- ] |
|
408 |
- except_values <- regions$values |
|
409 |
- regions <- if (is.null(except_values)) |
|
410 |
- all_values |
|
411 |
- else |
|
412 |
- all_values[!all_values %in% except_values] |
|
413 |
- names(regions) <- NULL |
|
414 |
- } |
|
415 |
- |
|
416 |
- if (!is.null(regions)) { |
|
417 |
- df_list <- lapply(gdm_region_files, function(x, regions, vector_field) { |
|
418 |
- region_frame <- data.table::fread( |
|
419 |
- x, |
|
382 |
+ # read first sample cause chromosome regions are the same for all samples |
|
383 |
+ df <- data.table::fread( |
|
384 |
+ gdm_region_files[1], |
|
420 | 385 |
col.names = vector_field, |
421 | 386 |
header = FALSE, |
422 | 387 |
sep = "\t" |
423 |
- ) |
|
424 |
- col_names <- names(region_frame) |
|
425 |
- # delete column not choosen by input |
|
426 |
- if (!is.null(regions)) { |
|
427 |
- col_names <- col_names[col_names %in% regions] |
|
428 |
- } |
|
429 |
- |
|
430 |
- if (length(col_names)) { |
|
431 |
- r <- subset(region_frame, TRUE, col_names) |
|
432 |
- } |
|
433 |
- }, regions, vector_field) |
|
388 |
+ ) |
|
389 |
+ col_names <- names(df) |
|
390 |
+ df <- subset(df, TRUE, c("chr", "left", "right", "strand")) |
|
434 | 391 |
|
435 |
- df_only_regions <- dplyr::bind_cols(df_list) |
|
436 |
- complete_df <- dplyr::bind_cols(df, df_only_regions) |
|
392 |
+ # check if we used a FULL parameter instead of char array containing |
|
393 |
+ # the region parameters |
|
394 |
+ if(is.object(regions) && ("FULL" %in% class(regions))) { |
|
395 |
+ all_values <- vector_field[!vector_field %in% c( |
|
396 |
+ "chr", |
|
397 |
+ "left", |
|
398 |
+ "right", |
|
399 |
+ "strand") |
|
400 |
+ ] |
|
401 |
+ except_values <- regions$values |
|
402 |
+ regions <- if (is.null(except_values)) |
|
403 |
+ all_values |
|
404 |
+ else |
|
405 |
+ all_values[!all_values %in% except_values] |
|
406 |
+ names(regions) <- NULL |
|
407 |
+ } |
|
437 | 408 |
|
438 |
- region_names <- names(complete_df)[-(seq_len(4))] |
|
439 |
- region_names <- gsub("[0-9]+", "", region_names) |
|
440 |
- region_names <- paste(region_names, suffixes, sep = ".") |
|
441 |
- region_names <- c(names(complete_df)[(seq_len(4))], region_names) |
|
442 |
- names(complete_df) <- region_names |
|
443 |
- g <- GenomicRanges::makeGRangesFromDataFrame( |
|
444 |
- complete_df, |
|
445 |
- keep.extra.columns = TRUE, |
|
446 |
- start.field = "left", |
|
447 |
- end.field = "right" |
|
448 |
- ) |
|
449 |
- } |
|
450 |
- else { |
|
451 |
- g <- GenomicRanges::makeGRangesFromDataFrame( |
|
452 |
- df, |
|
453 |
- keep.extra.columns = TRUE, |
|
454 |
- start.field = "left", |
|
455 |
- end.field = "right" |
|
456 |
- ) |
|
457 |
- } |
|
409 |
+ if (!is.null(regions)) { |
|
410 |
+ df_list <- lapply(gdm_region_files, function(x, regions, vector_field) { |
|
411 |
+ region_frame <- data.table::fread( |
|
412 |
+ x, |
|
413 |
+ col.names = vector_field, |
|
414 |
+ header = FALSE, |
|
415 |
+ sep = "\t") |
|
416 |
+ col_names <- names(region_frame) |
|
417 |
+ # delete column not choosen by input |
|
418 |
+ if (!is.null(regions)) { |
|
419 |
+ col_names <- col_names[col_names %in% regions] |
|
420 |
+ } |
|
421 |
+ |
|
422 |
+ if (length(col_names)) { |
|
423 |
+ r <- subset(region_frame, TRUE, col_names) |
|
424 |
+ } |
|
425 |
+ }, regions, vector_field) |
|
426 |
+ |
|
427 |
+ df_only_regions <- dplyr::bind_cols(df_list) |
|
428 |
+ complete_df <- dplyr::bind_cols(df, df_only_regions) |
|
429 |
+ |
|
430 |
+ region_names <- names(complete_df)[-(seq_len(4))] |
|
431 |
+ region_names <- gsub("[0-9]+", "", region_names) |
|
432 |
+ region_names <- paste(region_names, suffixes, sep = ".") |
|
433 |
+ region_names <- c(names(complete_df)[(seq_len(4))], region_names) |
|
434 |
+ names(complete_df) <- region_names |
|
435 |
+ g <- GenomicRanges::makeGRangesFromDataFrame( |
|
436 |
+ complete_df, |
|
437 |
+ keep.extra.columns = TRUE, |
|
438 |
+ start.field = "left", |
|
439 |
+ end.field = "right") |
|
440 |
+ } else { |
|
441 |
+ g <- GenomicRanges::makeGRangesFromDataFrame( |
|
442 |
+ df, |
|
443 |
+ keep.extra.columns = TRUE, |
|
444 |
+ start.field = "left", |
|
445 |
+ end.field = "right") |
|
446 |
+ } |
|
458 | 447 |
} |
... | ... |
@@ -26,7 +26,11 @@ |
26 | 26 |
#' @param region_attributes vector of strings that extracts only region |
27 | 27 |
#' attributes specified; if NULL no regions attribute is taken and the output |
28 | 28 |
#' is only GRanges made up by the region coordinate attributes |
29 |
-#' (seqnames, start, end, strand) |
|
29 |
+#' (seqnames, start, end, strand); |
|
30 |
+#' It is also possible to assign the \code{\link{FULL}} with or without |
|
31 |
+#' its input parameter; in case was without the `except` parameter, |
|
32 |
+#' all the region attributes are taken, otherwise all the region attributes |
|
33 |
+#' are taken except the input attribute defined by except. |
|
30 | 34 |
#' @param suffix name for each metadata column of GRanges. By default it is the |
31 | 35 |
#' value of the metadata attribute named "antibody_target". This string is |
32 | 36 |
#' taken from sample metadata file or from metadata() associated. |
... | ... |
@@ -61,19 +65,14 @@ |
61 | 65 |
#' sorted_grl <- sort(grl) |
62 | 66 |
#' filter_and_extract(sorted_grl, region_attributes = c("pvalue", "peak")) |
63 | 67 |
#' |
64 |
-#' ## It is also possible to define the region attributes, using the FULL() |
|
65 |
-#' ## function parameter, in order to includes every region |
|
66 |
-#' ## attributes present into the schema file |
|
68 |
+#' ## This statement imports a GMQL dataset as GRangesList and filters it |
|
69 |
+#' ## including all the region attributes |
|
67 | 70 |
#' |
68 | 71 |
#' sorted_grl_full <- sort(grl) |
69 |
-#' filter_and_extract(sorted_grl, region_attributes = FULL()) |
|
72 |
+#' filter_and_extract(sorted_grl_full, region_attributes = FULL()) |
|
70 | 73 |
#' |
71 |
-#' grl <- import_gmql(test_path, TRUE) |
|
72 |
-#' sorted_grl <- sort(grl) |
|
73 |
-#' filter_and_extract(sorted_grl, region_attributes = FULL()) |
|
74 |
-#' |
|
75 |
-#' ## Also, we can inlcude a list of region attribute inside the FULL() |
|
76 |
-#' ## function to exlucde that regions |
|
74 |
+#' ## This statement imports a GMQL dataset as GRangesList and filters it |
|
75 |
+#' ## including all the region attributes except "jaccard" and "score" |
|
77 | 76 |
#' |
78 | 77 |
#' sorted_grl_full_except <- sort(grl) |
79 | 78 |
#' filter_and_extract( |
... | ... |
@@ -68,6 +68,10 @@ |
68 | 68 |
#' sorted_grl_full <- sort(grl) |
69 | 69 |
#' filter_and_extract(sorted_grl, region_attributes = FULL()) |
70 | 70 |
#' |
71 |
+#' grl <- import_gmql(test_path, TRUE) |
|
72 |
+#' sorted_grl <- sort(grl) |
|
73 |
+#' filter_and_extract(sorted_grl, region_attributes = FULL()) |
|
74 |
+#' |
|
71 | 75 |
#' ## Also, we can inlcude a list of region attribute inside the FULL() |
72 | 76 |
#' ## function to exlucde that regions |
73 | 77 |