... | ... |
@@ -46,29 +46,22 @@ import_gmql <- function(dataset_path, is_gtf) { |
46 | 46 |
if(!length(list.files(datasetName))) |
47 | 47 |
stop("no samples present in this dataset") |
48 | 48 |
|
49 |
+ attr_col_names <- .schema_header(datasetName) |
|
50 |
+ attr_col_names <- attr_col_names[ |
|
51 |
+ !attr_col_names %in% c("seqname", "seqid", "start", "end", "strand")] |
|
49 | 52 |
regions <- list.files(datasetName, pattern = "*.gtf$",full.names = TRUE) |
50 | 53 |
if(length(regions)) { |
51 | 54 |
name_samples <- lapply(regions, function(x) { |
52 | 55 |
gsub("*.gtf", "", basename(x))}) |
53 | 56 |
|
54 |
- sampleList <- tryCatch( |
|
55 |
- expr = { |
|
56 |
- lapply(regions, function(x) { |
|
57 |
- rtracklayer::import(con = x, format = "gtf") |
|
58 |
- }) |
|
59 |
- }, |
|
60 |
- error = function(e) { |
|
61 |
- lapply(regions, function(x) { |
|
62 |
- rtracklayer::import(con = x, format = "gff", version = "3") |
|
63 |
- }) |
|
64 |
- }, |
|
65 |
- warning = function(w) { |
|
66 |
- lapply(regions, function(x) { |
|
67 |
- rtracklayer::import(con = x, format = "gff", version = "3") |
|
68 |
- }) |
|
69 |
- } |
|
70 |
- ) |
|
71 |
- |
|
57 |
+ sampleList <- lapply(regions, function(x) { |
|
58 |
+ rtracklayer::import( |
|
59 |
+ con = x, |
|
60 |
+ format = "gtf", |
|
61 |
+ colnames = attr_col_names |
|
62 |
+ ) |
|
63 |
+ }) |
|
64 |
+ |
|
72 | 65 |
names(sampleList) <- name_samples |
73 | 66 |
gRange_list <- GenomicRanges::GRangesList(sampleList) |
74 | 67 |
|
... | ... |
@@ -232,10 +232,6 @@ filter_and_extract <- function( |
232 | 232 |
else |
233 | 233 |
all_values[!all_values %in% except_values] |
234 | 234 |
names(regions) <- NULL |
235 |
- # since import convert this value from GMQL schema to GTF format |
|
236 |
- # we need to convert it back |
|
237 |
- regions <- replace(regions, regions == "feature", "type") |
|
238 |
- regions <- replace(regions, regions == "frame", "phase") |
|
239 | 235 |
} |
240 | 236 |
|
241 | 237 |
elementMetadata(g1) <- NULL |
... | ... |
@@ -318,7 +314,7 @@ filter_and_extract <- function( |
318 | 314 |
}) |
319 | 315 |
## we would like that manage more index from grep |
320 | 316 |
found <- as.logical(length(unlist(a))) |
321 |
- # if found retrieve samples that has at least one choosen metadata |
|
317 |
+ # if found retrieve samples that has at least one chosen metadata |
|
322 | 318 |
if (found) { |
323 | 319 |
x |
324 | 320 |
} |
... | ... |
@@ -331,24 +327,15 @@ filter_and_extract <- function( |
331 | 327 |
suffixes, |
332 | 328 |
vector_field |
333 | 329 |
) { |
334 |
- g1 <- tryCatch( |
|
335 |
- expr = { |
|
336 |
- rtracklayer::import(con = gtf_region_files[1], format = "gtf") |
|
337 |
- }, |
|
338 |
- error = function(e){ |
|
339 |
- rtracklayer::import( |
|
340 |
- gtf_region_files[1], |
|
341 |
- format = "gff", |
|
342 |
- version = "3") |
|
343 |
- }, |
|
344 |
- warning = function(w){ |
|
345 |
- rtracklayer::import( |
|
346 |
- gtf_region_files[1], |
|
347 |
- format = "gff", |
|
348 |
- version = "3") |
|
349 |
- } |
|
330 |
+ attr_col_names <- vector_field[ |
|
331 |
+ !vector_field %in% c("seqname", "seqid", "start", "end", "strand")] |
|
332 |
+ |
|
333 |
+ g1 <- rtracklayer::import( |
|
334 |
+ con = gtf_region_files[1], |
|
335 |
+ format = "gtf", |
|
336 |
+ colnames = attr_col_names |
|
350 | 337 |
) |
351 |
- |
|
338 |
+ |
|
352 | 339 |
elementMetadata(g1) <- NULL |
353 | 340 |
if (is.null(suffixes)) { |
354 | 341 |
suffixes <- "" |
... | ... |
@@ -377,16 +364,10 @@ filter_and_extract <- function( |
377 | 364 |
|
378 | 365 |
if (!is.null(regions)) { |
379 | 366 |
DF_list <- mapply(function(x, header) { |
380 |
- g_x <- tryCatch( |
|
381 |
- expr = { |
|
382 |
- rtracklayer::import(x, format = "gtf") |
|
383 |
- }, |
|
384 |
- error = function(e){ |
|
385 |
- rtracklayer::import(x, format = "gff", version = "3") |
|
386 |
- }, |
|
387 |
- warning = function(w){ |
|
388 |
- rtracklayer::import(x, format = "gff", version = "3") |
|
389 |
- } |
|
367 |
+ g_x <- rtracklayer::import( |
|
368 |
+ x, |
|
369 |
+ format = "gtf", |
|
370 |
+ colnames = attr_col_names |
|
390 | 371 |
) |
391 | 372 |
meta <- elementMetadata(g_x)[regions] |
392 | 373 |
if (header != "") { |
... | ... |
@@ -278,6 +278,9 @@ gmql_take <- function(input_data, rows) { |
278 | 278 |
#levels(x$start)[x$start] = start_numeric |
279 | 279 |
g <- GenomicRanges::makeGRangesFromDataFrame( |
280 | 280 |
x, |
281 |
+ seqnames.field = c("seqnames", "seqname", |
|
282 |
+ "chromosome", "chrom", |
|
283 |
+ "chr", "chromosome_name"), |
|
281 | 284 |
keep.extra.columns = TRUE, |
282 | 285 |
start.field = "start", |
283 | 286 |
end.field = "end") |
... | ... |
@@ -1272,22 +1272,20 @@ sample_region <- function(url, datasetName,sampleName) { |
1272 | 1272 |
col.names = FALSE, |
1273 | 1273 |
row.names = FALSE |
1274 | 1274 |
) |
1275 |
+ vector_field <- vapply( |
|
1276 |
+ list$fields, function(x) x$name, character(1) |
|
1277 |
+ ) |
|
1275 | 1278 |
if (identical(schema_type, "gtf")) { |
1276 |
- samples <- tryCatch( |
|
1277 |
- expr = { |
|
1278 |
- rtracklayer::import(temp, format = "gtf") |
|
1279 |
- }, |
|
1280 |
- error = function(e){ |
|
1281 |
- rtracklayer::import(temp, format = "gff", version = "3") |
|
1282 |
- }, |
|
1283 |
- warning = function(w){ |
|
1284 |
- rtracklayer::import(temp, format = "gff", version = "3") |
|
1285 |
- } |
|
1279 |
+ attr_col_names <- vector_field[ |
|
1280 |
+ !vector_field %in% c( |
|
1281 |
+ "seqname", "seqid", "start", "end", "strand" |
|
1282 |
+ )] |
|
1283 |
+ samples <- rtracklayer::import( |
|
1284 |
+ temp, |
|
1285 |
+ format = "gtf", |
|
1286 |
+ colnames = attr_col_names |
|
1286 | 1287 |
) |
1287 | 1288 |
} else { |
1288 |
- vector_field <- vapply( |
|
1289 |
- list$fields, function(x) x$name, character(1) |
|
1290 |
- ) |
|
1291 | 1289 |
df <- data.table::fread(temp, header = FALSE, sep = "\t") |
1292 | 1290 |
a <- df[1, 2] |
1293 | 1291 |
if(is.na(as.numeric(a))) |