#' Filter and extract function #' #' This function lets user to create a new GRangesList with fixed information: #' seqnames, ranges and strand, and a variable part made up by the regions #' defined as input. The metadata and metadata_prefix are used to filter #' the data and choose only the samples that match at least one metdatata #' with its prefix. The input regions are shown for each sample obtained #' from filtering. #' #' @import xml2 #' @importFrom dplyr bind_cols #' @importFrom data.table fread #' @importFrom rtracklayer import #' #' @param data string GMQL dataset folder path or GRangesList #' object #' @param metadata vector of strings containing names of metadata attributes #' to be searched for in metadata files. #' Data will be extracted if at least one condition is satisfied: #' this condition is logically "ANDed" with prefix filtering (see below) #' if NULL no filtering action occures #' (i.e every sample is taken for region filtering) #' @param metadata_prefix vector of strings that will support the metadata #' filtering. If defined, each 'metadata' is concatenated with the #' corresponding prefix. #' @param region_attributes vector of strings that extracts only region #' attributes specified; if NULL no regions attribute is taken and the output #' is only GRanges made up by the region coordinate attributes #' (seqnames, start, end, strand); #' It is also possible to assign the \code{\link{FULL}} with or without #' its input parameter; in case was without the `except` parameter, #' all the region attributes are taken, otherwise all the region attributes #' are taken except the input attribute defined by except. #' @param suffix name for each metadata column of GRanges. By default it is the #' value of the metadata attribute named "antibody_target". This string is #' taken from sample metadata file or from metadata() associated. #' If not present, the column name is the name of selected regions specified #' by 'region_attributes' input parameter #' #' @details #' This function works only with dataset or GRangesList all whose samples or #' Granges have the same region coordinates (chr, ranges, strand) ordered in #' the same way for each sample #' #' In case of GRangesList data input, the function searches for metadata #' into metadata() function associated to GRangesList. #' #' @return GRanges with selected regions #' #' @examples #' #' ## This statement defines the path to the folder "DATASET" in the #' ## subdirectory "example" of the package "RGMQL" and filters such folder #' ## dataset including at output only "pvalue" and "peak" region attributes #' #' test_path <- system.file("example", "DATASET", package = "RGMQL") #' filter_and_extract(test_path, region_attributes = c("pvalue", "peak")) #' #' ## This statement imports a GMQL dataset as GRangesList and filters it #' ## including at output only "pvalue" and "peak" region attributes, the sort #' ## function makes sure that the region coordinates (chr, ranges, strand) #' ## of all samples are ordered correctly #' #' grl <- import_gmql(test_path, TRUE) #' sorted_grl <- sort(grl) #' filter_and_extract(sorted_grl, region_attributes = c("pvalue", "peak")) #' #' ## This statement imports a GMQL dataset as GRangesList and filters it #' ## including all the region attributes #' #' sorted_grl_full <- sort(grl) #' filter_and_extract(sorted_grl_full, region_attributes = FULL()) #' #' ## This statement imports a GMQL dataset as GRangesList and filters it #' ## including all the region attributes except "jaccard" #' #' sorted_grl_full_except <- sort(grl) #' filter_and_extract( #' sorted_grl_full_except, #' region_attributes = FULL("jaccard") #' ) #' #' @export #' filter_and_extract <- function( data, metadata = NULL, metadata_prefix = NULL, region_attributes = NULL, suffix = "antibody_target" ) { if (is(data, "GRangesList")) { .extract_from_GRangesList( data, metadata, metadata_prefix, region_attributes, suffix) } else { .extract_from_dataset( data, metadata, metadata_prefix, region_attributes, suffix) } } .extract_from_dataset <- function( datasetName, metadata, metadata_prefix, regions, suffix ) { datasetName <- sub("/*[/]$", "", datasetName) if (basename(datasetName) != "files") { datasetName <- file.path(datasetName, "files") } if (!dir.exists(datasetName)) { stop("Directory does not exists") } gdm_meta_files <- list.files( datasetName, pattern = "*.gdm.meta$", full.names = TRUE ) gtf_meta_files <- list.files( datasetName, pattern = "*.gtf.meta$", full.names = TRUE ) if (!length(gdm_meta_files) && !length(gtf_meta_files)) { stop("no samples present or no files format supported") } if (length(gdm_meta_files) && length(gtf_meta_files)) { stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together") } vector_field <- .schema_header(datasetName) if (length(gdm_meta_files)) { samples_file <- .check_metadata_files( metadata, metadata_prefix, gdm_meta_files ) samples_meta_to_read <- unlist(samples_file) if (length(samples_meta_to_read)) { samples_to_read <- gsub(".meta$", "", samples_meta_to_read) } else { samples_to_read <- gsub(".meta$", "", gdm_meta_files) samples_meta_to_read <- gtf_meta_files } suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read) granges <- .parse_gdm_files( vector_field, samples_to_read, regions, suffix_vec) } else { samples_file <- .check_metadata_files( metadata, metadata_prefix, gtf_meta_files ) samples_meta_to_read <- unlist(samples_file) if (length(samples_meta_to_read)) { samples_to_read <- gsub(".meta$", "", samples_meta_to_read) } else { samples_to_read <- gsub(".meta$", "", gtf_meta_files) samples_meta_to_read <- gtf_meta_files } suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read) granges <- .parse_gtf_files( samples_to_read, regions, suffix_vec, vector_field ) } } .extract_from_GRangesList <- function( rangesList, metadata, metadata_prefix, regions, suffix ) { if (!is(rangesList, "GRangesList")) { stop("only GrangesList admitted") } if (!length(rangesList)) { stop("rangesList empty") } meta_list <- metadata(rangesList) samples <- .check_metadata_list(metadata, metadata_prefix, meta_list) if (!length(unlist(samples))) { samples <- rangesList } else { index <- unlist(samples) samples <- rangesList[c(index)] } new_meta_list <- metadata(samples) suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list) granges <- .parse_Granges(samples, regions, suffix_vec) } .parse_Granges <- function(region_list, regions, suffixes) { if (is.null(suffixes)) { suffixes <- "" } g1 <- region_list[[1]] if(is.object(regions) && ("FULL" %in% class(regions))) { all_values <- names(elementMetadata(g1)) except_values <- regions$values regions <- if (is.null(except_values)) all_values else all_values[!all_values %in% except_values] names(regions) <- NULL } elementMetadata(g1) <- NULL if (!is.null(regions)) { DF_list <- mapply(function(g_x, h) { meta <- elementMetadata(g_x)[regions] if (h != "") { names(meta) <- paste(regions, h, sep = ".") } data.frame(meta) }, region_list, suffixes, SIMPLIFY = FALSE) DF_only_regions <- dplyr::bind_cols(DF_list) elementMetadata(g1) <- DF_only_regions } g1 } .get_suffix <- function(col_name, from_list, meta_fl) { suffix <- paste0(col_name, "$") if (from_list) { meta_list <- mapply(function(x, index) { vec_names <- names(x) s_index <- grep(suffix, vec_names) first_index <- s_index[1] suffix <- unlist(x[first_index]) # ne prendo solo uno names(suffix) <- NULL # if found retrieve samples that has at least one choosen metadata if (first_index && !is.na(first_index)) { suffix } else { "" } }, meta_fl, seq_along(meta_fl)) } else { meta_list <- vapply(meta_fl, function(x) { list <- .add_metadata(x) vec_names <- names(list) index <- grep(suffix, vec_names) first_index <- index[1] suffix <- unlist(list[first_index]) # ne prendo solo uno names(suffix) <- NULL # if found retrieve samples that has at least one choosen metadata if (first_index && !is.na(first_index)) { suffix } else { "" } }, character(1)) } names(meta_list) <- NULL meta_list } .check_metadata_list <- function(metadata, metadata_prefix, meta_list) { vec_meta <- paste0(metadata_prefix, metadata) list <- mapply(function(x, index) { vec_names <- names(x) a <- lapply(vec_meta, function(y) { which(y == vec_names) }) ## we would like that manage more index from grep found <- as.logical(length(unlist(a))) # if found retrieve samples that has at least one choosen metadata if (found) { index } }, meta_list, seq_along(meta_list)) } .check_metadata_files <- function(metadata, metadata_prefix, meta_files) { vec_meta <- paste0(metadata_prefix, metadata) meta_list <- lapply(meta_files, function(x) { list <- .add_metadata(x) vec_names <- names(list) a <- lapply(vec_meta, function(y) { grep(y, vec_names) }) ## we would like that manage more index from grep found <- as.logical(length(unlist(a))) # if found retrieve samples that has at least one chosen metadata if (found) { x } }) } .parse_gtf_files <- function( gtf_region_files, regions, suffixes, vector_field ) { attr_col_names <- vector_field[ !vector_field %in% c("seqname", "seqid", "start", "end", "strand")] g1 <- rtracklayer::import( con = gtf_region_files[1], format = "gtf", colnames = attr_col_names ) elementMetadata(g1) <- NULL if (is.null(suffixes)) { suffixes <- "" } # check if we used a FULL parameter instead of char array containing # the region parameters if(is.object(regions) && ("FULL" %in% class(regions))) { all_values <- vector_field[!vector_field %in% c( "seqname", "strand", "start", "end") ] except_values <- regions$values regions <- if (is.null(except_values)) all_values else all_values[!all_values %in% except_values] names(regions) <- NULL # since import convert this value from GMQL schema to GTF format # we need to convert it back regions <- replace(regions, regions == "feature", "type") regions <- replace(regions, regions == "frame", "phase") } if (!is.null(regions)) { DF_list <- mapply(function(x, header) { g_x <- rtracklayer::import( x, format = "gtf", colnames = attr_col_names ) meta <- elementMetadata(g_x)[regions] if (header != "") { names(meta) <- paste(regions, header, sep = ".") } data.frame(meta) }, gtf_region_files, suffixes, SIMPLIFY = FALSE) DF_only_regions <- dplyr::bind_cols(DF_list) elementMetadata(g1) <- DF_only_regions } g1 } .parse_gdm_files <- function( vector_field, gdm_region_files, regions, suffixes ) { # read first sample cause chromosome regions are the same for all samples df <- data.table::fread( gdm_region_files[1], col.names = vector_field, header = FALSE, sep = "\t" ) col_names <- names(df) df <- subset(df, TRUE, c("chr", "left", "right", "strand")) # check if we used a FULL parameter instead of char array containing # the region parameters if(is.object(regions) && ("FULL" %in% class(regions))) { all_values <- vector_field[!vector_field %in% c( "chr", "left", "right", "strand") ] except_values <- regions$values regions <- if (is.null(except_values)) all_values else all_values[!all_values %in% except_values] names(regions) <- NULL } if (!is.null(regions)) { df_list <- lapply(gdm_region_files, function(x, regions, vector_field) { region_frame <- data.table::fread( x, col.names = vector_field, header = FALSE, sep = "\t") col_names <- names(region_frame) # delete column not choosen by input if (!is.null(regions)) { col_names <- col_names[col_names %in% regions] } if (length(col_names)) { r <- subset(region_frame, TRUE, col_names) } }, regions, vector_field) df_only_regions <- dplyr::bind_cols(df_list) complete_df <- dplyr::bind_cols(df, df_only_regions) region_names <- names(complete_df)[-(seq_len(4))] region_names <- gsub("[0-9]+", "", region_names) region_names <- paste(region_names, suffixes, sep = ".") region_names <- c(names(complete_df)[(seq_len(4))], region_names) names(complete_df) <- region_names g <- GenomicRanges::makeGRangesFromDataFrame( complete_df, keep.extra.columns = TRUE, seqnames.field = c("seqnames", "seqname", "chromosome", "chrom", "chr", "chromosome_name"), start.field = "left", end.field = "right") } else { g <- GenomicRanges::makeGRangesFromDataFrame( df, keep.extra.columns = TRUE, seqnames.field = c("seqnames", "seqname", "chromosome", "chrom", "chr", "chromosome_name"), start.field = "left", end.field = "right") } }