R/filter-extract-function.R
e5131ba8
 #' Filter and extract function
193ae520
 #'
e5131ba8
 #' This function lets user to create a new GRangesList with fixed information:
 #' seqnames, ranges and strand, and a variable part made up by the regions
73652f4d
 #' defined as input. The metadata and metadata_prefix are used to filter
 #' the data and choose only the samples that match at least one metdatata
 #' with its prefix. The input regions are shown for each sample obtained
e5131ba8
 #' from filtering.
13f9f8d0
 #'
 #' @import xml2
94a33e57
 #' @importFrom dplyr bind_cols
193ae520
 #' @importFrom data.table fread
 #' @importFrom rtracklayer import
13f9f8d0
 #'
e5131ba8
 #' @param data string GMQL dataset folder path or GRangesList
94a33e57
 #' object
e5131ba8
 #' @param metadata vector of strings containing names of metadata attributes
94a33e57
 #' to be searched for in metadata files.
e5131ba8
 #' Data will be extracted if at least one condition is satisfied:
 #' this condition is logically "ANDed" with prefix filtering (see below)
 #' if NULL no filtering action occures
 #' (i.e every sample is taken for region filtering)
83eb0624
 #' @param metadata_prefix vector of strings that will support the metadata
73652f4d
 #' filtering. If defined, each 'metadata' is concatenated with the
83eb0624
 #' corresponding prefix.
73652f4d
 #' @param region_attributes vector of strings that extracts only region
 #' attributes  specified; if NULL no regions attribute is taken and the output
 #' is only GRanges made up by the region coordinate attributes
08b4a456
 #' (seqnames, start, end, strand);
 #' It is also possible to assign the \code{\link{FULL}} with or without 
 #' its input parameter; in case was without the `except` parameter, 
 #' all the region attributes are taken, otherwise all the region attributes 
 #' are taken except the input attribute defined by except.
73652f4d
 #' @param suffix name for each metadata column of GRanges. By default it is the
 #' value of the metadata attribute named "antibody_target". This string is
 #' taken from sample metadata file or from metadata() associated.
18697071
 #' If not present, the column name is the name of selected regions specified
dda0cfa0
 #' by 'region_attributes' input parameter
13f9f8d0
 #'
c7602281
 #' @details
73652f4d
 #' This function works only with dataset or GRangesList all whose samples or
 #' Granges have the same region coordinates (chr, ranges, strand) ordered in
 #' the same way for each sample
 #'
18697071
 #' In case of GRangesList data input, the function searches for metadata
 #' into metadata() function associated to GRangesList.
c7602281
 #'
83eb0624
 #' @return GRanges with selected regions
13f9f8d0
 #'
 #' @examples
73652f4d
 #'
 #' ## This statement defines the path to the folder "DATASET" in the
 #' ## subdirectory "example" of the package "RGMQL" and filters such folder
18697071
 #' ## dataset including at output only "pvalue" and "peak" region attributes
73652f4d
 #'
94a33e57
 #' test_path <- system.file("example", "DATASET", package = "RGMQL")
18697071
 #' filter_and_extract(test_path, region_attributes = c("pvalue", "peak"))
73652f4d
 #'
 #' ## This statement imports a GMQL dataset as GRangesList and filters it
44735c40
 #' ## including at output only "pvalue" and "peak" region attributes, the sort
73652f4d
 #' ## function makes sure that the region coordinates (chr, ranges, strand)
44735c40
 #' ## of all samples are ordered correctly
94a33e57
 #'
73652f4d
 #' grl <- import_gmql(test_path, TRUE)
 #' sorted_grl <- sort(grl)
 #' filter_and_extract(sorted_grl, region_attributes = c("pvalue", "peak"))
7f0f9dac
 #' 
08b4a456
 #' ## This statement imports a GMQL dataset as GRangesList and filters it
 #' ## including all the region attributes
7f0f9dac
 #' 
 #' sorted_grl_full <- sort(grl)
08b4a456
 #' filter_and_extract(sorted_grl_full, region_attributes = FULL())
7f0f9dac
 #' 
08b4a456
 #' ## This statement imports a GMQL dataset as GRangesList and filters it
 #' ## including all the region attributes except "jaccard" and "score"
7f0f9dac
 #' 
 #' sorted_grl_full_except <- sort(grl)
 #' filter_and_extract(
 #'  sorted_grl_full_except, 
 #'  region_attributes = FULL("jaccard", "score")
 #' )
 #' 
193ae520
 #' @export
13f9f8d0
 #'
73652f4d
 filter_and_extract <- function(
9212b326
     data,
     metadata = NULL,
     metadata_prefix = NULL,
     region_attributes = NULL,
     suffix = "antibody_target"
73652f4d
 ) {
9212b326
     if (is(data, "GRangesList")) {
         .extract_from_GRangesList(
             data,
             metadata,
             metadata_prefix,
             region_attributes,
             suffix)
     } else {
         .extract_from_dataset(
             data,
             metadata,
             metadata_prefix,
             region_attributes, suffix)
     }
94a33e57
 }
b8b733f8
 
73652f4d
 .extract_from_dataset <- function(
     datasetName,
9212b326
     metadata,
     metadata_prefix,
     regions,
     suffix
 ) {
     datasetName <- sub("/*[/]$", "", datasetName)
     if (basename(datasetName) != "files") {
         datasetName <- file.path(datasetName, "files")
     }
c7602281
     
9212b326
     if (!dir.exists(datasetName)) {
         stop("Directory does not exists")
73652f4d
     }
c7602281
     
9212b326
     gdm_meta_files <- list.files(
         datasetName,
         pattern = "*.gdm.meta$",
         full.names = TRUE
73652f4d
     )
c7602281
     
9212b326
     gtf_meta_files <- list.files(
         datasetName,
         pattern = "*.gtf.meta$",
         full.names = TRUE
73652f4d
     )
18697071
     
9212b326
     if (!length(gdm_meta_files) && !length(gtf_meta_files)) {
         stop("no samples present or no files format supported")
94a33e57
     }
c7602281
     
9212b326
     if (length(gdm_meta_files) && length(gtf_meta_files)) {
         stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together")
     }
     
     vector_field <- .schema_header(datasetName)
     
     if (length(gdm_meta_files)) {
         samples_file <- .check_metadata_files(
             metadata, metadata_prefix,
             gdm_meta_files
         )
         
         samples_meta_to_read <- unlist(samples_file)
         
         if (length(samples_meta_to_read)) {
             samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
         } else {
             samples_to_read <- gsub(".meta$", "", gdm_meta_files)
             samples_meta_to_read <- gtf_meta_files
         }
         
         suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
         granges <- .parse_gdm_files(
             vector_field, 
             samples_to_read, 
             regions,
             suffix_vec)
     } else {
         samples_file <- .check_metadata_files(
             metadata, 
             metadata_prefix,
             gtf_meta_files
         )
         samples_meta_to_read <- unlist(samples_file)
         
         if (length(samples_meta_to_read)) {
             samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
         } else {
             samples_to_read <- gsub(".meta$", "", gtf_meta_files)
             samples_meta_to_read <- gtf_meta_files
         }
         
         suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
         granges <- .parse_gtf_files(
             samples_to_read, 
             regions, 
             suffix_vec, 
             vector_field
         )
     }
315f4cb5
 }
 
73652f4d
 .extract_from_GRangesList <- function(
9212b326
     rangesList,
     metadata,
     metadata_prefix,
     regions,
     suffix
73652f4d
 ) {
9212b326
     if (!is(rangesList, "GRangesList")) {
         stop("only GrangesList admitted")
     }
     
     if (!length(rangesList)) {
         stop("rangesList empty")
     }
     
     meta_list <- metadata(rangesList)
     samples <- .check_metadata_list(metadata, metadata_prefix, meta_list)
     if (!length(unlist(samples))) {
         samples <- rangesList
     } else {
         index <- unlist(samples)
         samples <- rangesList[c(index)]
     }
     new_meta_list <- metadata(samples)
     suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list)
     granges <- .parse_Granges(samples, regions, suffix_vec)
315f4cb5
 }
 
73652f4d
 .parse_Granges <- function(region_list, regions, suffixes) {
9212b326
     if (is.null(suffixes)) {
         suffixes <- ""
     }
     
     g1 <- region_list[[1]]
     
     if(is.object(regions) && ("FULL" %in% class(regions))) {
         all_values <- names(elementMetadata(g1))
         except_values <- regions$values
         regions <- if (is.null(except_values))
             all_values
         else
             all_values[!all_values %in% except_values]
         names(regions) <- NULL
         # since import convert this value from GMQL schema to GTF format
         # we need to convert it back
         regions <- replace(regions, regions == "feature", "type")
         regions <- replace(regions, regions == "frame", "phase")
     }
     
     elementMetadata(g1) <- NULL
     if (!is.null(regions)) {
         DF_list <- mapply(function(g_x, h) {
             meta <- elementMetadata(g_x)[regions]
             if (h != "") {
                 names(meta) <- paste(regions, h, sep = ".")
             }
             data.frame(meta)
         }, region_list, suffixes, SIMPLIFY = FALSE)
         DF_only_regions <- dplyr::bind_cols(DF_list)
         elementMetadata(g1) <- DF_only_regions
     }
     g1
18697071
 }
 
73652f4d
 .get_suffix <- function(col_name, from_list, meta_fl) {
9212b326
     suffix <- paste0(col_name, "$")
     
     if (from_list) {
         meta_list <- mapply(function(x, index) {
             vec_names <- names(x)
             s_index <- grep(suffix, vec_names)
             first_index <- s_index[1]
             suffix <- unlist(x[first_index]) # ne prendo solo uno
             names(suffix) <- NULL
             
             # if found retrieve samples that has at least one choosen metadata
             if (first_index && !is.na(first_index)) {
                 suffix
             } else {
                 ""
             }
         }, meta_fl, seq_along(meta_fl))
     }
     else {
         meta_list <- vapply(meta_fl, function(x) {
             list <- .add_metadata(x)
             vec_names <- names(list)
             index <- grep(suffix, vec_names)
             first_index <- index[1]
             suffix <- unlist(list[first_index]) # ne prendo solo uno
             names(suffix) <- NULL
             # if found retrieve samples that has at least one choosen metadata
             if (first_index && !is.na(first_index)) {
                 suffix
             } else {
                 ""
             }
         }, character(1))
     }
     names(meta_list) <- NULL
     meta_list
13f9f8d0
 }
 
73652f4d
 .check_metadata_list <- function(metadata, metadata_prefix, meta_list) {
9212b326
     vec_meta <- paste0(metadata_prefix, metadata)
     list <- mapply(function(x, index) {
         vec_names <- names(x)
         a <- lapply(vec_meta, function(y) {
             which(y == vec_names)
         })
         ## we would like that manage more index from grep
         found <- as.logical(length(unlist(a)))
         # if found retrieve samples that has at least one choosen metadata
         if (found) {
             index
         }
     }, meta_list, seq_along(meta_list))
13f9f8d0
 }
 
73652f4d
 .check_metadata_files <- function(metadata, metadata_prefix, meta_files) {
9212b326
     vec_meta <- paste0(metadata_prefix, metadata)
     meta_list <- lapply(meta_files, function(x) {
         list <- .add_metadata(x)
         vec_names <- names(list)
         a <- lapply(vec_meta, function(y) {
             grep(y, vec_names)
         })
         ## we would like that manage more index from grep
         found <- as.logical(length(unlist(a)))
         # if found retrieve samples that has at least one choosen metadata
         if (found) {
             x
         }
73652f4d
     })
13f9f8d0
 }
 
73652f4d
 .parse_gtf_files <- function(
9212b326
     gtf_region_files, 
     regions, 
     suffixes,
     vector_field
73652f4d
 ) {
9212b326
     g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf")
     elementMetadata(g1) <- NULL
     if (is.null(suffixes)) {
         suffixes <- ""
     }
     
     # check if we used a FULL parameter instead of char array containing
     # the region parameters
     if(is.object(regions) && ("FULL" %in% class(regions))) {
         all_values <- vector_field[!vector_field %in% c(
             "seqname", 
             "strand", 
             "start",
             "end")
         ]
         except_values <- regions$values
         regions <- if (is.null(except_values))
             all_values
         else
             all_values[!all_values %in% except_values]
         names(regions) <- NULL
         # since import convert this value from GMQL schema to GTF format
         # we need to convert it back
         regions <- replace(regions, regions == "feature", "type")
         regions <- replace(regions, regions == "frame", "phase")
     }
     
     if (!is.null(regions)) {
         DF_list <- mapply(function(x, header) {
             g_x <- rtracklayer::import(con = x, format = "gtf")
             meta <- elementMetadata(g_x)[regions]
             if (header != "") {
                 names(meta) <- paste(regions, header, sep = ".")
             }
             data.frame(meta)
         }, gtf_region_files, suffixes, SIMPLIFY = FALSE)
         DF_only_regions <- dplyr::bind_cols(DF_list)
         elementMetadata(g1) <- DF_only_regions
     }
     g1
73652f4d
 }
 
 .parse_gdm_files <- function(
9212b326
     vector_field,
     gdm_region_files,
     regions,
     suffixes
73652f4d
 ) {
9212b326
     # read first sample cause chromosome regions are the same for all samples
     df <- data.table::fread(
         gdm_region_files[1],
73652f4d
         col.names = vector_field,
         header = FALSE,
         sep = "\t"
9212b326
     )
     col_names <- names(df)
     df <- subset(df, TRUE, c("chr", "left", "right", "strand"))
73652f4d
     
9212b326
     # check if we used a FULL parameter instead of char array containing
     # the region parameters
     if(is.object(regions) && ("FULL" %in% class(regions))) {
         all_values <- vector_field[!vector_field %in% c(
             "chr", 
             "left", 
             "right",
             "strand")
         ]
         except_values <- regions$values
         regions <- if (is.null(except_values))
             all_values
         else
             all_values[!all_values %in% except_values]
         names(regions) <- NULL
     }
c7602281
     
9212b326
     if (!is.null(regions)) {
         df_list <- lapply(gdm_region_files, function(x, regions, vector_field) {
             region_frame <- data.table::fread(
                 x,
                 col.names = vector_field,
                 header = FALSE,
                 sep = "\t")
             col_names <- names(region_frame)
             # delete column not choosen by input
             if (!is.null(regions)) {
                 col_names <- col_names[col_names %in% regions]
             }
             
             if (length(col_names)) {
                 r <- subset(region_frame, TRUE, col_names)
             }
         }, regions, vector_field)
         
         df_only_regions <- dplyr::bind_cols(df_list)
         complete_df <- dplyr::bind_cols(df, df_only_regions)
         
         region_names <- names(complete_df)[-(seq_len(4))]
         region_names <- gsub("[0-9]+", "", region_names)
         region_names <- paste(region_names, suffixes, sep = ".")
         region_names <- c(names(complete_df)[(seq_len(4))], region_names)
         names(complete_df) <- region_names
         g <- GenomicRanges::makeGRangesFromDataFrame(
             complete_df,
             keep.extra.columns = TRUE,
             start.field = "left",
             end.field = "right")
     } else {
         g <- GenomicRanges::makeGRangesFromDataFrame(
             df,
             keep.extra.columns = TRUE,
             start.field = "left",
             end.field = "right")
     }
13f9f8d0
 }