Browse code

fixed gdm also

Simone authored on 19/07/2021 07:02:30
Showing 1 changed files
... ...
@@ -443,12 +443,18 @@ filter_and_extract <- function(
443 443
         g <- GenomicRanges::makeGRangesFromDataFrame(
444 444
             complete_df,
445 445
             keep.extra.columns = TRUE,
446
+            seqnames.field = c("seqnames", "seqname",
447
+                               "chromosome", "chrom",
448
+                               "chr", "chromosome_name"),
446 449
             start.field = "left",
447 450
             end.field = "right")
448 451
     } else {
449 452
         g <- GenomicRanges::makeGRangesFromDataFrame(
450 453
             df,
451 454
             keep.extra.columns = TRUE,
455
+            seqnames.field = c("seqnames", "seqname",
456
+                               "chromosome", "chrom",
457
+                               "chr", "chromosome_name"),
452 458
             start.field = "left",
453 459
             end.field = "right")
454 460
     }
Browse code

fixed error on gtf importing dataset files with seqid as other group columns

Simone authored on 17/07/2021 14:32:55
Showing 1 changed files
... ...
@@ -232,10 +232,6 @@ filter_and_extract <- function(
232 232
         else
233 233
             all_values[!all_values %in% except_values]
234 234
         names(regions) <- NULL
235
-        # since import convert this value from GMQL schema to GTF format
236
-        # we need to convert it back
237
-        regions <- replace(regions, regions == "feature", "type")
238
-        regions <- replace(regions, regions == "frame", "phase")
239 235
     }
240 236
     
241 237
     elementMetadata(g1) <- NULL
... ...
@@ -318,7 +314,7 @@ filter_and_extract <- function(
318 314
         })
319 315
         ## we would like that manage more index from grep
320 316
         found <- as.logical(length(unlist(a)))
321
-        # if found retrieve samples that has at least one choosen metadata
317
+        # if found retrieve samples that has at least one chosen metadata
322 318
         if (found) {
323 319
             x
324 320
         }
... ...
@@ -331,24 +327,15 @@ filter_and_extract <- function(
331 327
     suffixes,
332 328
     vector_field
333 329
 ) {
334
-    g1 <- tryCatch(
335
-        expr = {
336
-            rtracklayer::import(con = gtf_region_files[1], format = "gtf")
337
-        },
338
-        error = function(e){ 
339
-            rtracklayer::import(
340
-                gtf_region_files[1], 
341
-                format = "gff",
342
-                version = "3")
343
-        },
344
-        warning = function(w){
345
-            rtracklayer::import(
346
-                gtf_region_files[1], 
347
-                format = "gff", 
348
-                version = "3")
349
-        }
330
+    attr_col_names <- vector_field[
331
+      !vector_field %in% c("seqname", "seqid", "start", "end", "strand")]
332
+    
333
+    g1 <- rtracklayer::import(
334
+        con = gtf_region_files[1], 
335
+        format = "gtf",
336
+        colnames = attr_col_names
350 337
     )
351
-  
338
+    
352 339
     elementMetadata(g1) <- NULL
353 340
     if (is.null(suffixes)) {
354 341
         suffixes <- ""
... ...
@@ -377,16 +364,10 @@ filter_and_extract <- function(
377 364
     
378 365
     if (!is.null(regions)) {
379 366
         DF_list <- mapply(function(x, header) {
380
-            g_x <- tryCatch(
381
-                expr = {
382
-                    rtracklayer::import(x, format = "gtf")
383
-                },
384
-                error = function(e){ 
385
-                    rtracklayer::import(x, format = "gff", version = "3")
386
-                },
387
-                warning = function(w){
388
-                    rtracklayer::import(x, format = "gff", version = "3")
389
-                }
367
+            g_x <- rtracklayer::import(
368
+                x, 
369
+                format = "gtf", 
370
+                colnames = attr_col_names
390 371
             )
391 372
             meta <- elementMetadata(g_x)[regions]
392 373
             if (header != "") {
Browse code

fixed import gtf, granges schema

Simone authored on 08/06/2021 09:04:48
Showing 1 changed files
... ...
@@ -331,7 +331,24 @@ filter_and_extract <- function(
331 331
     suffixes,
332 332
     vector_field
333 333
 ) {
334
-    g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf")
334
+    g1 <- tryCatch(
335
+        expr = {
336
+            rtracklayer::import(con = gtf_region_files[1], format = "gtf")
337
+        },
338
+        error = function(e){ 
339
+            rtracklayer::import(
340
+                gtf_region_files[1], 
341
+                format = "gff",
342
+                version = "3")
343
+        },
344
+        warning = function(w){
345
+            rtracklayer::import(
346
+                gtf_region_files[1], 
347
+                format = "gff", 
348
+                version = "3")
349
+        }
350
+    )
351
+  
335 352
     elementMetadata(g1) <- NULL
336 353
     if (is.null(suffixes)) {
337 354
         suffixes <- ""
... ...
@@ -360,7 +377,17 @@ filter_and_extract <- function(
360 377
     
361 378
     if (!is.null(regions)) {
362 379
         DF_list <- mapply(function(x, header) {
363
-            g_x <- rtracklayer::import(con = x, format = "gtf")
380
+            g_x <- tryCatch(
381
+                expr = {
382
+                    rtracklayer::import(x, format = "gtf")
383
+                },
384
+                error = function(e){ 
385
+                    rtracklayer::import(x, format = "gff", version = "3")
386
+                },
387
+                warning = function(w){
388
+                    rtracklayer::import(x, format = "gff", version = "3")
389
+                }
390
+            )
364 391
             meta <- elementMetadata(g_x)[regions]
365 392
             if (header != "") {
366 393
                 names(meta) <- paste(regions, header, sep = ".")
Browse code

update vignette, new datasets

Simone authored on 18/05/2021 20:04:54
Showing 1 changed files
... ...
@@ -72,12 +72,12 @@
72 72
 #' filter_and_extract(sorted_grl_full, region_attributes = FULL())
73 73
 #' 
74 74
 #' ## This statement imports a GMQL dataset as GRangesList and filters it
75
-#' ## including all the region attributes except "jaccard" and "score"
75
+#' ## including all the region attributes except "jaccard"
76 76
 #' 
77 77
 #' sorted_grl_full_except <- sort(grl)
78 78
 #' filter_and_extract(
79 79
 #'  sorted_grl_full_except, 
80
-#'  region_attributes = FULL("jaccard", "score")
80
+#'  region_attributes = FULL("jaccard")
81 81
 #' )
82 82
 #' 
83 83
 #' @export
Browse code

biocheck

Simone authored on 17/05/2021 09:41:45
Showing 1 changed files
... ...
@@ -83,376 +83,365 @@
83 83
 #' @export
84 84
 #'
85 85
 filter_and_extract <- function(
86
-  data,
87
-  metadata = NULL,
88
-  metadata_prefix = NULL,
89
-  region_attributes = NULL,
90
-  suffix = "antibody_target"
86
+    data,
87
+    metadata = NULL,
88
+    metadata_prefix = NULL,
89
+    region_attributes = NULL,
90
+    suffix = "antibody_target"
91 91
 ) {
92
-  
93
-  if (is(data, "GRangesList")) {
94
-    .extract_from_GRangesList(
95
-      data,
96
-      metadata,
97
-      metadata_prefix,
98
-      region_attributes,
99
-      suffix
100
-    )
101
-  } else {
102
-    .extract_from_dataset(
103
-      data,
104
-      metadata,
105
-      metadata_prefix,
106
-      region_attributes, suffix
107
-    )
108
-  }
92
+    if (is(data, "GRangesList")) {
93
+        .extract_from_GRangesList(
94
+            data,
95
+            metadata,
96
+            metadata_prefix,
97
+            region_attributes,
98
+            suffix)
99
+    } else {
100
+        .extract_from_dataset(
101
+            data,
102
+            metadata,
103
+            metadata_prefix,
104
+            region_attributes, suffix)
105
+    }
109 106
 }
110 107
 
111 108
 .extract_from_dataset <- function(
112
-  datasetName,
113
-  metadata,
114
-  metadata_prefix,
115
-  regions,
116
-  suffix
117
-) {
118
-  datasetName <- sub("/*[/]$", "", datasetName)
119
-  if (basename(datasetName) != "files") {
120
-    datasetName <- file.path(datasetName, "files")
121
-  }
122
-  
123
-  if (!dir.exists(datasetName)) {
124
-    stop("Directory does not exists")
125
-  }
126
-  
127
-  gdm_meta_files <- list.files(
128
-    datasetName,
129
-    pattern = "*.gdm.meta$",
130
-    full.names = TRUE
131
-  )
132
-  
133
-  gtf_meta_files <- list.files(
134 109
     datasetName,
135
-    pattern = "*.gtf.meta$",
136
-    full.names = TRUE
137
-  )
138
-  
139
-  if (!length(gdm_meta_files) && !length(gtf_meta_files)) {
140
-    stop("no samples present or no files format supported")
141
-  }
142
-  
143
-  if (length(gdm_meta_files) && length(gtf_meta_files)) {
144
-    stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together")
145
-  }
146
-  
147
-  vector_field <- .schema_header(datasetName)
148
-  
149
-  if (length(gdm_meta_files)) {
150
-    samples_file <- .check_metadata_files(
151
-      metadata, metadata_prefix,
152
-      gdm_meta_files
153
-    )
154
-    
155
-    samples_meta_to_read <- unlist(samples_file)
110
+    metadata,
111
+    metadata_prefix,
112
+    regions,
113
+    suffix
114
+) {
115
+    datasetName <- sub("/*[/]$", "", datasetName)
116
+    if (basename(datasetName) != "files") {
117
+        datasetName <- file.path(datasetName, "files")
118
+    }
156 119
     
157
-    if (length(samples_meta_to_read)) {
158
-      samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
159
-    } else {
160
-      samples_to_read <- gsub(".meta$", "", gdm_meta_files)
161
-      samples_meta_to_read <- gtf_meta_files
120
+    if (!dir.exists(datasetName)) {
121
+        stop("Directory does not exists")
162 122
     }
163 123
     
164
-    suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
165
-    granges <- .parse_gdm_files(
166
-      vector_field, 
167
-      samples_to_read, 
168
-      regions,
169
-      suffix_vec
124
+    gdm_meta_files <- list.files(
125
+        datasetName,
126
+        pattern = "*.gdm.meta$",
127
+        full.names = TRUE
170 128
     )
171 129
     
172
-  } else {
173
-    samples_file <- .check_metadata_files(
174
-      metadata, 
175
-      metadata_prefix,
176
-      gtf_meta_files
130
+    gtf_meta_files <- list.files(
131
+        datasetName,
132
+        pattern = "*.gtf.meta$",
133
+        full.names = TRUE
177 134
     )
178
-    samples_meta_to_read <- unlist(samples_file)
179 135
     
180
-    if (length(samples_meta_to_read)) {
181
-      samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
182
-    } else {
183
-      samples_to_read <- gsub(".meta$", "", gtf_meta_files)
184
-      samples_meta_to_read <- gtf_meta_files
136
+    if (!length(gdm_meta_files) && !length(gtf_meta_files)) {
137
+        stop("no samples present or no files format supported")
185 138
     }
186 139
     
187
-    suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
188
-    granges <- .parse_gtf_files(
189
-      samples_to_read, 
190
-      regions, 
191
-      suffix_vec, 
192
-      vector_field
193
-    )
194
-  }
140
+    if (length(gdm_meta_files) && length(gtf_meta_files)) {
141
+        stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together")
142
+    }
143
+    
144
+    vector_field <- .schema_header(datasetName)
145
+    
146
+    if (length(gdm_meta_files)) {
147
+        samples_file <- .check_metadata_files(
148
+            metadata, metadata_prefix,
149
+            gdm_meta_files
150
+        )
151
+        
152
+        samples_meta_to_read <- unlist(samples_file)
153
+        
154
+        if (length(samples_meta_to_read)) {
155
+            samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
156
+        } else {
157
+            samples_to_read <- gsub(".meta$", "", gdm_meta_files)
158
+            samples_meta_to_read <- gtf_meta_files
159
+        }
160
+        
161
+        suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
162
+        granges <- .parse_gdm_files(
163
+            vector_field, 
164
+            samples_to_read, 
165
+            regions,
166
+            suffix_vec)
167
+    } else {
168
+        samples_file <- .check_metadata_files(
169
+            metadata, 
170
+            metadata_prefix,
171
+            gtf_meta_files
172
+        )
173
+        samples_meta_to_read <- unlist(samples_file)
174
+        
175
+        if (length(samples_meta_to_read)) {
176
+            samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
177
+        } else {
178
+            samples_to_read <- gsub(".meta$", "", gtf_meta_files)
179
+            samples_meta_to_read <- gtf_meta_files
180
+        }
181
+        
182
+        suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
183
+        granges <- .parse_gtf_files(
184
+            samples_to_read, 
185
+            regions, 
186
+            suffix_vec, 
187
+            vector_field
188
+        )
189
+    }
195 190
 }
196 191
 
197 192
 .extract_from_GRangesList <- function(
198
-  rangesList,
199
-  metadata,
200
-  metadata_prefix,
201
-  regions,
202
-  suffix
193
+    rangesList,
194
+    metadata,
195
+    metadata_prefix,
196
+    regions,
197
+    suffix
203 198
 ) {
204
-  if (!is(rangesList, "GRangesList")) {
205
-    stop("only GrangesList admitted")
206
-  }
207
-  
208
-  if (!length(rangesList)) {
209
-    stop("rangesList empty")
210
-  }
211
-  
212
-  meta_list <- metadata(rangesList)
213
-  samples <- .check_metadata_list(metadata, metadata_prefix, meta_list)
214
-  if (!length(unlist(samples))) {
215
-    samples <- rangesList
216
-  } else {
217
-    index <- unlist(samples)
218
-    samples <- rangesList[c(index)]
219
-  }
220
-  new_meta_list <- metadata(samples)
221
-  suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list)
222
-  granges <- .parse_Granges(samples, regions, suffix_vec)
199
+    if (!is(rangesList, "GRangesList")) {
200
+        stop("only GrangesList admitted")
201
+    }
202
+    
203
+    if (!length(rangesList)) {
204
+        stop("rangesList empty")
205
+    }
206
+    
207
+    meta_list <- metadata(rangesList)
208
+    samples <- .check_metadata_list(metadata, metadata_prefix, meta_list)
209
+    if (!length(unlist(samples))) {
210
+        samples <- rangesList
211
+    } else {
212
+        index <- unlist(samples)
213
+        samples <- rangesList[c(index)]
214
+    }
215
+    new_meta_list <- metadata(samples)
216
+    suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list)
217
+    granges <- .parse_Granges(samples, regions, suffix_vec)
223 218
 }
224 219
 
225 220
 .parse_Granges <- function(region_list, regions, suffixes) {
226
-  if (is.null(suffixes)) {
227
-    suffixes <- ""
228
-  }
229
-  
230
-  g1 <- region_list[[1]]
231
-
232
-  if(is.object(regions) && ("FULL" %in% class(regions))) {
233
-    all_values <- names(elementMetadata(g1))
234
-    except_values <- regions$values
235
-    regions <- if (is.null(except_values))
236
-      all_values
237
-    else
238
-      all_values[!all_values %in% except_values]
239
-    names(regions) <- NULL
240
-    # since import convert this value from GMQL schema to GTF format
241
-    # we need to convert it back
242
-    regions <- replace(regions, regions == "feature", "type")
243
-    regions <- replace(regions, regions == "frame", "phase")
244
-  }
245
-  
246
-  elementMetadata(g1) <- NULL
247
-  if (!is.null(regions)) {
248
-    DF_list <- mapply(function(g_x, h) {
249
-      meta <- elementMetadata(g_x)[regions]
250
-      if (h != "") {
251
-        names(meta) <- paste(regions, h, sep = ".")
252
-      }
253
-      data.frame(meta)
254
-    }, region_list, suffixes, SIMPLIFY = FALSE)
255
-    DF_only_regions <- dplyr::bind_cols(DF_list)
256
-    elementMetadata(g1) <- DF_only_regions
257
-  }
258
-  g1
221
+    if (is.null(suffixes)) {
222
+        suffixes <- ""
223
+    }
224
+    
225
+    g1 <- region_list[[1]]
226
+    
227
+    if(is.object(regions) && ("FULL" %in% class(regions))) {
228
+        all_values <- names(elementMetadata(g1))
229
+        except_values <- regions$values
230
+        regions <- if (is.null(except_values))
231
+            all_values
232
+        else
233
+            all_values[!all_values %in% except_values]
234
+        names(regions) <- NULL
235
+        # since import convert this value from GMQL schema to GTF format
236
+        # we need to convert it back
237
+        regions <- replace(regions, regions == "feature", "type")
238
+        regions <- replace(regions, regions == "frame", "phase")
239
+    }
240
+    
241
+    elementMetadata(g1) <- NULL
242
+    if (!is.null(regions)) {
243
+        DF_list <- mapply(function(g_x, h) {
244
+            meta <- elementMetadata(g_x)[regions]
245
+            if (h != "") {
246
+                names(meta) <- paste(regions, h, sep = ".")
247
+            }
248
+            data.frame(meta)
249
+        }, region_list, suffixes, SIMPLIFY = FALSE)
250
+        DF_only_regions <- dplyr::bind_cols(DF_list)
251
+        elementMetadata(g1) <- DF_only_regions
252
+    }
253
+    g1
259 254
 }
260 255
 
261 256
 .get_suffix <- function(col_name, from_list, meta_fl) {
262
-  suffix <- paste0(col_name, "$")
263
-  
264
-  if (from_list) {
265
-    meta_list <- mapply(function(x, index) {
266
-      vec_names <- names(x)
267
-      s_index <- grep(suffix, vec_names)
268
-      first_index <- s_index[1]
269
-      suffix <- unlist(x[first_index]) # ne prendo solo uno
270
-      names(suffix) <- NULL
271
-      
272
-      # if found retrieve samples that has at least one choosen metadata
273
-      if (first_index && !is.na(first_index)) {
274
-        suffix
275
-      } else {
276
-        ""
277
-      }
278
-    }, meta_fl, seq_along(meta_fl))
279
-  }
280
-  else {
281
-    meta_list <- vapply(meta_fl, function(x) {
282
-      list <- .add_metadata(x)
283
-      vec_names <- names(list)
284
-      index <- grep(suffix, vec_names)
285
-      first_index <- index[1]
286
-      suffix <- unlist(list[first_index]) # ne prendo solo uno
287
-      names(suffix) <- NULL
288
-      # if found retrieve samples that has at least one choosen metadata
289
-      if (first_index && !is.na(first_index)) {
290
-        suffix
291
-      } else {
292
-        ""
293
-      }
294
-    }, character(1))
295
-  }
296
-  names(meta_list) <- NULL
297
-  meta_list
257
+    suffix <- paste0(col_name, "$")
258
+    
259
+    if (from_list) {
260
+        meta_list <- mapply(function(x, index) {
261
+            vec_names <- names(x)
262
+            s_index <- grep(suffix, vec_names)
263
+            first_index <- s_index[1]
264
+            suffix <- unlist(x[first_index]) # ne prendo solo uno
265
+            names(suffix) <- NULL
266
+            
267
+            # if found retrieve samples that has at least one choosen metadata
268
+            if (first_index && !is.na(first_index)) {
269
+                suffix
270
+            } else {
271
+                ""
272
+            }
273
+        }, meta_fl, seq_along(meta_fl))
274
+    }
275
+    else {
276
+        meta_list <- vapply(meta_fl, function(x) {
277
+            list <- .add_metadata(x)
278
+            vec_names <- names(list)
279
+            index <- grep(suffix, vec_names)
280
+            first_index <- index[1]
281
+            suffix <- unlist(list[first_index]) # ne prendo solo uno
282
+            names(suffix) <- NULL
283
+            # if found retrieve samples that has at least one choosen metadata
284
+            if (first_index && !is.na(first_index)) {
285
+                suffix
286
+            } else {
287
+                ""
288
+            }
289
+        }, character(1))
290
+    }
291
+    names(meta_list) <- NULL
292
+    meta_list
298 293
 }
299 294
 
300 295
 .check_metadata_list <- function(metadata, metadata_prefix, meta_list) {
301
-  vec_meta <- paste0(metadata_prefix, metadata)
302
-  list <- mapply(function(x, index) {
303
-    vec_names <- names(x)
304
-    a <- lapply(vec_meta, function(y) {
305
-      which(y == vec_names)
306
-    })
307
-    ## we would like that manage more index from grep
308
-    found <- as.logical(length(unlist(a)))
309
-    # if found retrieve samples that has at least one choosen metadata
310
-    if (found) {
311
-      index
312
-    }
313
-  }, meta_list, seq_along(meta_list))
296
+    vec_meta <- paste0(metadata_prefix, metadata)
297
+    list <- mapply(function(x, index) {
298
+        vec_names <- names(x)
299
+        a <- lapply(vec_meta, function(y) {
300
+            which(y == vec_names)
301
+        })
302
+        ## we would like that manage more index from grep
303
+        found <- as.logical(length(unlist(a)))
304
+        # if found retrieve samples that has at least one choosen metadata
305
+        if (found) {
306
+            index
307
+        }
308
+    }, meta_list, seq_along(meta_list))
314 309
 }
315 310
 
316 311
 .check_metadata_files <- function(metadata, metadata_prefix, meta_files) {
317
-  vec_meta <- paste0(metadata_prefix, metadata)
318
-  meta_list <- lapply(meta_files, function(x) {
319
-    list <- .add_metadata(x)
320
-    vec_names <- names(list)
321
-    a <- lapply(vec_meta, function(y) {
322
-      grep(y, vec_names)
312
+    vec_meta <- paste0(metadata_prefix, metadata)
313
+    meta_list <- lapply(meta_files, function(x) {
314
+        list <- .add_metadata(x)
315
+        vec_names <- names(list)
316
+        a <- lapply(vec_meta, function(y) {
317
+            grep(y, vec_names)
318
+        })
319
+        ## we would like that manage more index from grep
320
+        found <- as.logical(length(unlist(a)))
321
+        # if found retrieve samples that has at least one choosen metadata
322
+        if (found) {
323
+            x
324
+        }
323 325
     })
324
-    ## we would like that manage more index from grep
325
-    found <- as.logical(length(unlist(a)))
326
-    # if found retrieve samples that has at least one choosen metadata
327
-    if (found) {
328
-      x
329
-    }
330
-  })
331 326
 }
332 327
 
333 328
 .parse_gtf_files <- function(
334
-  gtf_region_files, 
335
-  regions, 
336
-  suffixes,
337
-  vector_field
329
+    gtf_region_files, 
330
+    regions, 
331
+    suffixes,
332
+    vector_field
338 333
 ) {
339
-  g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf")
340
-  elementMetadata(g1) <- NULL
341
-  if (is.null(suffixes)) {
342
-    suffixes <- ""
343
-  }
344
-  
345
-  # check if we used a FULL parameter instead of char array containing
346
-  # the region parameters
347
-  if(is.object(regions) && ("FULL" %in% class(regions))) {
348
-    all_values <- vector_field[!vector_field %in% c(
349
-      "seqname", 
350
-      "strand", 
351
-      "start",
352
-      "end"
353
-      )
354
-    ]
355
-    except_values <- regions$values
356
-    regions <- if (is.null(except_values))
357
-      all_values
358
-    else
359
-      all_values[!all_values %in% except_values]
360
-    names(regions) <- NULL
361
-    # since import convert this value from GMQL schema to GTF format
362
-    # we need to convert it back
363
-    regions <- replace(regions, regions == "feature", "type")
364
-    regions <- replace(regions, regions == "frame", "phase")
365
-  }
366
-  
367
-  if (!is.null(regions)) {
368
-    DF_list <- mapply(function(x, header) {
369
-      g_x <- rtracklayer::import(con = x, format = "gtf")
370
-      meta <- elementMetadata(g_x)[regions]
371
-      if (header != "") {
372
-        names(meta) <- paste(regions, header, sep = ".")
373
-      }
374
-      data.frame(meta)
375
-    }, gtf_region_files, suffixes, SIMPLIFY = FALSE)
376
-    DF_only_regions <- dplyr::bind_cols(DF_list)
377
-    elementMetadata(g1) <- DF_only_regions
378
-  }
379
-  g1
334
+    g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf")
335
+    elementMetadata(g1) <- NULL
336
+    if (is.null(suffixes)) {
337
+        suffixes <- ""
338
+    }
339
+    
340
+    # check if we used a FULL parameter instead of char array containing
341
+    # the region parameters
342
+    if(is.object(regions) && ("FULL" %in% class(regions))) {
343
+        all_values <- vector_field[!vector_field %in% c(
344
+            "seqname", 
345
+            "strand", 
346
+            "start",
347
+            "end")
348
+        ]
349
+        except_values <- regions$values
350
+        regions <- if (is.null(except_values))
351
+            all_values
352
+        else
353
+            all_values[!all_values %in% except_values]
354
+        names(regions) <- NULL
355
+        # since import convert this value from GMQL schema to GTF format
356
+        # we need to convert it back
357
+        regions <- replace(regions, regions == "feature", "type")
358
+        regions <- replace(regions, regions == "frame", "phase")
359
+    }
360
+    
361
+    if (!is.null(regions)) {
362
+        DF_list <- mapply(function(x, header) {
363
+            g_x <- rtracklayer::import(con = x, format = "gtf")
364
+            meta <- elementMetadata(g_x)[regions]
365
+            if (header != "") {
366
+                names(meta) <- paste(regions, header, sep = ".")
367
+            }
368
+            data.frame(meta)
369
+        }, gtf_region_files, suffixes, SIMPLIFY = FALSE)
370
+        DF_only_regions <- dplyr::bind_cols(DF_list)
371
+        elementMetadata(g1) <- DF_only_regions
372
+    }
373
+    g1
380 374
 }
381 375
 
382 376
 .parse_gdm_files <- function(
383
-  vector_field,
384
-  gdm_region_files,
385
-  regions,
386
-  suffixes
377
+    vector_field,
378
+    gdm_region_files,
379
+    regions,
380
+    suffixes
387 381
 ) {
388
-  # read first sample cause chromosome regions are the same for all samples
389
-  df <- data.table::fread(
390
-    gdm_region_files[1],
391
-    col.names = vector_field,
392
-    header = FALSE,
393
-    sep = "\t"
394
-  )
395
-  col_names <- names(df)
396
-  df <- subset(df, TRUE, c("chr", "left", "right", "strand"))
397
-  
398
-  # check if we used a FULL parameter instead of char array containing
399
-  # the region parameters
400
-  if(is.object(regions) && ("FULL" %in% class(regions))) {
401
-    all_values <- vector_field[!vector_field %in% c(
402
-      "chr", 
403
-      "left", 
404
-      "right",
405
-      "strand"
406
-      )
407
-    ]
408
-    except_values <- regions$values
409
-    regions <- if (is.null(except_values))
410
-      all_values
411
-    else
412
-      all_values[!all_values %in% except_values]
413
-    names(regions) <- NULL
414
-  }
415
-  
416
-  if (!is.null(regions)) {
417
-    df_list <- lapply(gdm_region_files, function(x, regions, vector_field) {
418
-      region_frame <- data.table::fread(
419
-        x,
382
+    # read first sample cause chromosome regions are the same for all samples
383
+    df <- data.table::fread(
384
+        gdm_region_files[1],
420 385
         col.names = vector_field,
421 386
         header = FALSE,
422 387
         sep = "\t"
423
-      )
424
-      col_names <- names(region_frame)
425
-      # delete column not choosen by input
426
-      if (!is.null(regions)) {
427
-        col_names <- col_names[col_names %in% regions]
428
-      }
429
-      
430
-      if (length(col_names)) {
431
-        r <- subset(region_frame, TRUE, col_names)
432
-      }
433
-    }, regions, vector_field)
388
+    )
389
+    col_names <- names(df)
390
+    df <- subset(df, TRUE, c("chr", "left", "right", "strand"))
434 391
     
435
-    df_only_regions <- dplyr::bind_cols(df_list)
436
-    complete_df <- dplyr::bind_cols(df, df_only_regions)
392
+    # check if we used a FULL parameter instead of char array containing
393
+    # the region parameters
394
+    if(is.object(regions) && ("FULL" %in% class(regions))) {
395
+        all_values <- vector_field[!vector_field %in% c(
396
+            "chr", 
397
+            "left", 
398
+            "right",
399
+            "strand")
400
+        ]
401
+        except_values <- regions$values
402
+        regions <- if (is.null(except_values))
403
+            all_values
404
+        else
405
+            all_values[!all_values %in% except_values]
406
+        names(regions) <- NULL
407
+    }
437 408
     
438
-    region_names <- names(complete_df)[-(seq_len(4))]
439
-    region_names <- gsub("[0-9]+", "", region_names)
440
-    region_names <- paste(region_names, suffixes, sep = ".")
441
-    region_names <- c(names(complete_df)[(seq_len(4))], region_names)
442
-    names(complete_df) <- region_names
443
-    g <- GenomicRanges::makeGRangesFromDataFrame(
444
-      complete_df,
445
-      keep.extra.columns = TRUE,
446
-      start.field = "left",
447
-      end.field = "right"
448
-    )
449
-  }
450
-  else {
451
-    g <- GenomicRanges::makeGRangesFromDataFrame(
452
-      df,
453
-      keep.extra.columns = TRUE,
454
-      start.field = "left",
455
-      end.field = "right"
456
-    )
457
-  }
409
+    if (!is.null(regions)) {
410
+        df_list <- lapply(gdm_region_files, function(x, regions, vector_field) {
411
+            region_frame <- data.table::fread(
412
+                x,
413
+                col.names = vector_field,
414
+                header = FALSE,
415
+                sep = "\t")
416
+            col_names <- names(region_frame)
417
+            # delete column not choosen by input
418
+            if (!is.null(regions)) {
419
+                col_names <- col_names[col_names %in% regions]
420
+            }
421
+            
422
+            if (length(col_names)) {
423
+                r <- subset(region_frame, TRUE, col_names)
424
+            }
425
+        }, regions, vector_field)
426
+        
427
+        df_only_regions <- dplyr::bind_cols(df_list)
428
+        complete_df <- dplyr::bind_cols(df, df_only_regions)
429
+        
430
+        region_names <- names(complete_df)[-(seq_len(4))]
431
+        region_names <- gsub("[0-9]+", "", region_names)
432
+        region_names <- paste(region_names, suffixes, sep = ".")
433
+        region_names <- c(names(complete_df)[(seq_len(4))], region_names)
434
+        names(complete_df) <- region_names
435
+        g <- GenomicRanges::makeGRangesFromDataFrame(
436
+            complete_df,
437
+            keep.extra.columns = TRUE,
438
+            start.field = "left",
439
+            end.field = "right")
440
+    } else {
441
+        g <- GenomicRanges::makeGRangesFromDataFrame(
442
+            df,
443
+            keep.extra.columns = TRUE,
444
+            start.field = "left",
445
+            end.field = "right")
446
+    }
458 447
 }
Browse code

added FULL() parameters, fixed multiple collect

Simone authored on 11/04/2021 18:14:20
Showing 1 changed files
... ...
@@ -26,7 +26,11 @@
26 26
 #' @param region_attributes vector of strings that extracts only region
27 27
 #' attributes  specified; if NULL no regions attribute is taken and the output
28 28
 #' is only GRanges made up by the region coordinate attributes
29
-#' (seqnames, start, end, strand)
29
+#' (seqnames, start, end, strand);
30
+#' It is also possible to assign the \code{\link{FULL}} with or without 
31
+#' its input parameter; in case was without the `except` parameter, 
32
+#' all the region attributes are taken, otherwise all the region attributes 
33
+#' are taken except the input attribute defined by except.
30 34
 #' @param suffix name for each metadata column of GRanges. By default it is the
31 35
 #' value of the metadata attribute named "antibody_target". This string is
32 36
 #' taken from sample metadata file or from metadata() associated.
... ...
@@ -61,19 +65,14 @@
61 65
 #' sorted_grl <- sort(grl)
62 66
 #' filter_and_extract(sorted_grl, region_attributes = c("pvalue", "peak"))
63 67
 #' 
64
-#' ## It is also possible to define the region attributes, using the FULL() 
65
-#' ## function parameter, in order to includes every region 
66
-#' ## attributes present into the schema file
68
+#' ## This statement imports a GMQL dataset as GRangesList and filters it
69
+#' ## including all the region attributes
67 70
 #' 
68 71
 #' sorted_grl_full <- sort(grl)
69
-#' filter_and_extract(sorted_grl, region_attributes = FULL())
72
+#' filter_and_extract(sorted_grl_full, region_attributes = FULL())
70 73
 #' 
71
-#' grl <- import_gmql(test_path, TRUE)
72
-#' sorted_grl <- sort(grl)
73
-#' filter_and_extract(sorted_grl, region_attributes = FULL())
74
-#' 
75
-#' ## Also, we can inlcude a list of region attribute inside the FULL() 
76
-#' ## function to exlucde that regions
74
+#' ## This statement imports a GMQL dataset as GRangesList and filters it
75
+#' ## including all the region attributes except "jaccard" and "score"
77 76
 #' 
78 77
 #' sorted_grl_full_except <- sort(grl)
79 78
 #' filter_and_extract(
Browse code

add FULL() param to extract from GRangeslist

Simone authored on 27/03/2021 14:45:15
Showing 1 changed files
... ...
@@ -68,6 +68,10 @@
68 68
 #' sorted_grl_full <- sort(grl)
69 69
 #' filter_and_extract(sorted_grl, region_attributes = FULL())
70 70
 #' 
71
+#' grl <- import_gmql(test_path, TRUE)
72
+#' sorted_grl <- sort(grl)
73
+#' filter_and_extract(sorted_grl, region_attributes = FULL())
74
+#' 
71 75
 #' ## Also, we can inlcude a list of region attribute inside the FULL() 
72 76
 #' ## function to exlucde that regions
73 77