Browse code

fixed error on gtf importing dataset files with seqid as other group columns

Simone authored on 17/07/2021 14:32:55
Showing 5 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: RGMQL
2 2
 Type: Package
3 3
 Title: GenoMetric Query Language for R/Bioconductor
4
-Version: 1.13.2
4
+Version: 1.13.3
5 5
 Authors@R: c(person(given = "Simone",
6 6
            family = "Pallotta",
7 7
            role = c("aut", "cre"),
... ...
@@ -46,29 +46,22 @@ import_gmql <- function(dataset_path, is_gtf) {
46 46
     if(!length(list.files(datasetName)))
47 47
         stop("no samples present in this dataset")
48 48
     
49
+    attr_col_names <- .schema_header(datasetName)
50
+    attr_col_names <- attr_col_names[
51
+        !attr_col_names %in% c("seqname", "seqid", "start", "end", "strand")]
49 52
     regions <- list.files(datasetName, pattern = "*.gtf$",full.names = TRUE)
50 53
     if(length(regions)) {
51 54
         name_samples <- lapply(regions, function(x) {
52 55
             gsub("*.gtf", "", basename(x))})
53 56
         
54
-        sampleList <- tryCatch(
55
-            expr = {
56
-                lapply(regions, function(x) {
57
-                    rtracklayer::import(con = x, format = "gtf")
58
-                })
59
-            },
60
-            error = function(e) { 
61
-                lapply(regions, function(x) {
62
-                    rtracklayer::import(con = x, format = "gff", version = "3")
63
-                })
64
-            },
65
-            warning = function(w) {
66
-                lapply(regions, function(x) {
67
-                    rtracklayer::import(con = x, format = "gff", version = "3")
68
-                })
69
-            }
70
-        )
71
-
57
+        sampleList <- lapply(regions, function(x) {
58
+            rtracklayer::import(
59
+                con = x, 
60
+                format = "gtf", 
61
+                colnames = attr_col_names
62
+            )
63
+        })
64
+        
72 65
         names(sampleList) <- name_samples
73 66
         gRange_list <- GenomicRanges::GRangesList(sampleList)
74 67
         
... ...
@@ -232,10 +232,6 @@ filter_and_extract <- function(
232 232
         else
233 233
             all_values[!all_values %in% except_values]
234 234
         names(regions) <- NULL
235
-        # since import convert this value from GMQL schema to GTF format
236
-        # we need to convert it back
237
-        regions <- replace(regions, regions == "feature", "type")
238
-        regions <- replace(regions, regions == "frame", "phase")
239 235
     }
240 236
     
241 237
     elementMetadata(g1) <- NULL
... ...
@@ -318,7 +314,7 @@ filter_and_extract <- function(
318 314
         })
319 315
         ## we would like that manage more index from grep
320 316
         found <- as.logical(length(unlist(a)))
321
-        # if found retrieve samples that has at least one choosen metadata
317
+        # if found retrieve samples that has at least one chosen metadata
322 318
         if (found) {
323 319
             x
324 320
         }
... ...
@@ -331,24 +327,15 @@ filter_and_extract <- function(
331 327
     suffixes,
332 328
     vector_field
333 329
 ) {
334
-    g1 <- tryCatch(
335
-        expr = {
336
-            rtracklayer::import(con = gtf_region_files[1], format = "gtf")
337
-        },
338
-        error = function(e){ 
339
-            rtracklayer::import(
340
-                gtf_region_files[1], 
341
-                format = "gff",
342
-                version = "3")
343
-        },
344
-        warning = function(w){
345
-            rtracklayer::import(
346
-                gtf_region_files[1], 
347
-                format = "gff", 
348
-                version = "3")
349
-        }
330
+    attr_col_names <- vector_field[
331
+      !vector_field %in% c("seqname", "seqid", "start", "end", "strand")]
332
+    
333
+    g1 <- rtracklayer::import(
334
+        con = gtf_region_files[1], 
335
+        format = "gtf",
336
+        colnames = attr_col_names
350 337
     )
351
-  
338
+    
352 339
     elementMetadata(g1) <- NULL
353 340
     if (is.null(suffixes)) {
354 341
         suffixes <- ""
... ...
@@ -377,16 +364,10 @@ filter_and_extract <- function(
377 364
     
378 365
     if (!is.null(regions)) {
379 366
         DF_list <- mapply(function(x, header) {
380
-            g_x <- tryCatch(
381
-                expr = {
382
-                    rtracklayer::import(x, format = "gtf")
383
-                },
384
-                error = function(e){ 
385
-                    rtracklayer::import(x, format = "gff", version = "3")
386
-                },
387
-                warning = function(w){
388
-                    rtracklayer::import(x, format = "gff", version = "3")
389
-                }
367
+            g_x <- rtracklayer::import(
368
+                x, 
369
+                format = "gtf", 
370
+                colnames = attr_col_names
390 371
             )
391 372
             meta <- elementMetadata(g_x)[regions]
392 373
             if (header != "") {
... ...
@@ -278,6 +278,9 @@ gmql_take <- function(input_data, rows) {
278 278
         #levels(x$start)[x$start] = start_numeric
279 279
         g <- GenomicRanges::makeGRangesFromDataFrame(
280 280
             x,
281
+            seqnames.field = c("seqnames", "seqname",
282
+                               "chromosome", "chrom",
283
+                               "chr", "chromosome_name"),
281 284
             keep.extra.columns = TRUE,
282 285
             start.field = "start",
283 286
             end.field = "end")
... ...
@@ -1272,22 +1272,20 @@ sample_region <- function(url, datasetName,sampleName) {
1272 1272
             col.names = FALSE,
1273 1273
             row.names = FALSE
1274 1274
         )
1275
+        vector_field <- vapply(
1276
+            list$fields, function(x) x$name, character(1)
1277
+        )
1275 1278
         if (identical(schema_type, "gtf")) {
1276
-            samples <- tryCatch(
1277
-                expr = {
1278
-                    rtracklayer::import(temp, format = "gtf")
1279
-                },
1280
-                error = function(e){ 
1281
-                    rtracklayer::import(temp, format = "gff", version = "3")
1282
-                },
1283
-                warning = function(w){
1284
-                    rtracklayer::import(temp, format = "gff", version = "3")
1285
-                }
1279
+            attr_col_names <- vector_field[
1280
+              !vector_field %in% c(
1281
+                "seqname", "seqid", "start", "end", "strand"
1282
+              )]
1283
+            samples <- rtracklayer::import(
1284
+                temp, 
1285
+                format = "gtf",
1286
+                colnames = attr_col_names
1286 1287
             )
1287 1288
         } else {
1288
-            vector_field <- vapply(
1289
-                list$fields, function(x) x$name, character(1)
1290
-            )
1291 1289
             df <- data.table::fread(temp, header = FALSE, sep = "\t")
1292 1290
             a <- df[1, 2]
1293 1291
             if(is.na(as.numeric(a)))