#' Filter and extract function
#'
#' This function lets user to create a new GRangesList with fixed information:
#' seqnames, ranges and strand, and a variable part made up by the regions
#' defined as input. The metadata and metadata_prefix are used to filter
#' the data and choose only the samples that match at least one metdatata
#' with its prefix. The input regions are shown for each sample obtained
#' from filtering.
#'
#' @import xml2
#' @importFrom dplyr bind_cols
#' @importFrom data.table fread
#' @importFrom rtracklayer import
#'
#' @param data string GMQL dataset folder path or GRangesList
#' object
#' @param metadata vector of strings containing names of metadata attributes
#' to be searched for in metadata files.
#' Data will be extracted if at least one condition is satisfied:
#' this condition is logically "ANDed" with prefix filtering (see below)
#' if NULL no filtering action occures
#' (i.e every sample is taken for region filtering)
#' @param metadata_prefix vector of strings that will support the metadata
#' filtering. If defined, each 'metadata' is concatenated with the
#' corresponding prefix.
#' @param region_attributes vector of strings that extracts only region
#' attributes  specified; if NULL no regions attribute is taken and the output
#' is only GRanges made up by the region coordinate attributes
#' (seqnames, start, end, strand)
#' @param suffix name for each metadata column of GRanges. By default it is the
#' value of the metadata attribute named "antibody_target". This string is
#' taken from sample metadata file or from metadata() associated.
#' If not present, the column name is the name of selected regions specified
#' by 'region_attributes' input parameter
#'
#' @details
#' This function works only with dataset or GRangesList all whose samples or
#' Granges have the same region coordinates (chr, ranges, strand) ordered in
#' the same way for each sample
#'
#' In case of GRangesList data input, the function searches for metadata
#' into metadata() function associated to GRangesList.
#'
#' @return GRanges with selected regions
#'
#' @examples
#'
#' ## This statement defines the path to the folder "DATASET" in the
#' ## subdirectory "example" of the package "RGMQL" and filters such folder
#' ## dataset including at output only "pvalue" and "peak" region attributes
#'
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' filter_and_extract(test_path, region_attributes = c("pvalue", "peak"))
#'
#' ## This statement imports a GMQL dataset as GRangesList and filters it
#' ## including at output only "pvalue" and "peak" region attributes, the sort
#' ## function makes sure that the region coordinates (chr, ranges, strand)
#' ## of all samples are ordered correctly
#'
#'
#' grl <- import_gmql(test_path, TRUE)
#' sorted_grl <- sort(grl)
#' filter_and_extract(sorted_grl, region_attributes = c("pvalue", "peak"))
#' @export
#'
filter_and_extract <- function(
  data,
  metadata = NULL,
  metadata_prefix = NULL,
  region_attributes = NULL,
  suffix = "antibody_target"
) {
  
  if (is(data, "GRangesList")) {
    .extract_from_GRangesList(
      data,
      metadata,
      metadata_prefix,
      region_attributes,
      suffix
    )
  } else {
    .extract_from_dataset(
      data,
      metadata,
      metadata_prefix,
      region_attributes, suffix
    )
  }
}

.extract_from_dataset <- function(
  datasetName,
  metadata,
  metadata_prefix,
  regions,
  suffix
) {
  datasetName <- sub("/*[/]$", "", datasetName)
  if (basename(datasetName) != "files") {
    datasetName <- file.path(datasetName, "files")
  }
  
  if (!dir.exists(datasetName)) {
    stop("Directory does not exists")
  }
  
  gdm_meta_files <- list.files(
    datasetName,
    pattern = "*.gdm.meta$",
    full.names = TRUE
  )
  
  gtf_meta_files <- list.files(
    datasetName,
    pattern = "*.gtf.meta$",
    full.names = TRUE
  )
  
  if (!length(gdm_meta_files) && !length(gtf_meta_files)) {
    stop("no samples present or no files format supported")
  }
  
  if (length(gdm_meta_files) && length(gtf_meta_files)) {
    stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together")
  }
  
  vector_field <- .schema_header(datasetName)
  
  if (length(gdm_meta_files)) {
    samples_file <- .check_metadata_files(
      metadata, metadata_prefix,
      gdm_meta_files
    )
    
    samples_meta_to_read <- unlist(samples_file)
    
    if (length(samples_meta_to_read)) {
      samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
    } else {
      samples_to_read <- gsub(".meta$", "", gdm_meta_files)
      samples_meta_to_read <- gtf_meta_files
    }
    
    suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
    granges <- .parse_gdm_files(
      vector_field, 
      samples_to_read, 
      regions,
      suffix_vec,
      vector_field
    )
    
  } else {
    samples_file <- .check_metadata_files(
      metadata, 
      metadata_prefix,
      gtf_meta_files
    )
    samples_meta_to_read <- unlist(samples_file)
    
    if (length(samples_meta_to_read)) {
      samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
    } else {
      samples_to_read <- gsub(".meta$", "", gtf_meta_files)
      samples_meta_to_read <- gtf_meta_files
    }
    
    suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
    granges <- .parse_gtf_files(
      samples_to_read, 
      regions, 
      suffix_vec, 
      vector_field
    )
  }
}

.extract_from_GRangesList <- function(
  rangesList,
  metadata,
  metadata_prefix,
  regions,
  suffix
) {
  if (!is(rangesList, "GRangesList")) {
    stop("only GrangesList admitted")
  }
  
  if (!length(rangesList)) {
    stop("rangesList empty")
  }
  
  meta_list <- metadata(rangesList)
  samples <- .check_metadata_list(metadata, metadata_prefix, meta_list)
  if (!length(unlist(samples))) {
    samples <- rangesList
  } else {
    index <- unlist(samples)
    samples <- rangesList[c(index)]
  }
  new_meta_list <- metadata(samples)
  suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list)
  granges <- .parse_Granges(samples, regions, suffix_vec)
}

.parse_Granges <- function(region_list, regions, suffixes) {
  if (is.null(suffixes)) {
    suffixes <- ""
  }
  
  g1 <- region_list[[1]]
  elementMetadata(g1) <- NULL
  if (!is.null(regions)) {
    DF_list <- mapply(function(g_x, h) {
      meta <- elementMetadata(g_x)[regions]
      if (h != "") {
        names(meta) <- paste(regions, h, sep = ".")
      }
      data.frame(meta)
    }, region_list, suffixes, SIMPLIFY = FALSE)
    DF_only_regions <- dplyr::bind_cols(DF_list)
    elementMetadata(g1) <- DF_only_regions
  }
  g1
}

.get_suffix <- function(col_name, from_list, meta_fl) {
  suffix <- paste0(col_name, "$")
  
  if (from_list) {
    meta_list <- mapply(function(x, index) {
      vec_names <- names(x)
      s_index <- grep(suffix, vec_names)
      first_index <- s_index[1]
      suffix <- unlist(x[first_index]) # ne prendo solo uno
      names(suffix) <- NULL
      
      # if found retrieve samples that has at least one choosen metadata
      if (first_index && !is.na(first_index)) {
        suffix
      } else {
        ""
      }
    }, meta_fl, seq_along(meta_fl))
  }
  else {
    meta_list <- vapply(meta_fl, function(x) {
      list <- .add_metadata(x)
      vec_names <- names(list)
      index <- grep(suffix, vec_names)
      first_index <- index[1]
      suffix <- unlist(list[first_index]) # ne prendo solo uno
      names(suffix) <- NULL
      # if found retrieve samples that has at least one choosen metadata
      if (first_index && !is.na(first_index)) {
        suffix
      } else {
        ""
      }
    }, character(1))
  }
  names(meta_list) <- NULL
  meta_list
}

.check_metadata_list <- function(metadata, metadata_prefix, meta_list) {
  vec_meta <- paste0(metadata_prefix, metadata)
  list <- mapply(function(x, index) {
    vec_names <- names(x)
    a <- lapply(vec_meta, function(y) {
      which(y == vec_names)
    })
    ## we would like that manage more index from grep
    found <- as.logical(length(unlist(a)))
    # if found retrieve samples that has at least one choosen metadata
    if (found) {
      index
    }
  }, meta_list, seq_along(meta_list))
}

.check_metadata_files <- function(metadata, metadata_prefix, meta_files) {
  vec_meta <- paste0(metadata_prefix, metadata)
  meta_list <- lapply(meta_files, function(x) {
    list <- .add_metadata(x)
    vec_names <- names(list)
    a <- lapply(vec_meta, function(y) {
      grep(y, vec_names)
    })
    ## we would like that manage more index from grep
    found <- as.logical(length(unlist(a)))
    # if found retrieve samples that has at least one choosen metadata
    if (found) {
      x
    }
  })
}

.parse_gtf_files <- function(
  gtf_region_files, 
  regions, 
  suffixes,
  vector_field
) {
  g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf")
  elementMetadata(g1) <- NULL
  if (is.null(suffixes)) {
    suffixes <- ""
  }
  
  # check if we used a FULL parameter instead of char array containing
  # the region parameters
  if(is.object(regions) && ("FULL" %in% class(regions))) {
    all_values <- vector_field[!vector_field %in% c(
      "seqname", 
      "strand", 
      "start",
      "end"
      )
    ]
    except_values <- regions$values
    regions <- if (is.null(except_values))
      all_values
    else
      all_values[!all_values %in% except_values]
    names(regions) <- NULL
    # since import convert this value from GMQL schema to GTF format
    # we need to convert it back
    regions <- replace(regions, regions == "feature", "type")
    regions <- replace(regions, regions == "frame", "phase")
  }
  
  if (!is.null(regions)) {
    DF_list <- mapply(function(x, header) {
      g_x <- rtracklayer::import(con = x, format = "gtf")
      meta <- elementMetadata(g_x)[regions]
      if (header != "") {
        names(meta) <- paste(regions, header, sep = ".")
      }
      data.frame(meta)
    }, gtf_region_files, suffixes, SIMPLIFY = FALSE)
    DF_only_regions <- dplyr::bind_cols(DF_list)
    elementMetadata(g1) <- DF_only_regions
  }
  g1
}

.parse_gdm_files <- function(
  vector_field,
  gdm_region_files,
  regions,
  suffixes
) {
  # read first sample cause chromosome regions are the same for all samples
  df <- data.table::fread(
    gdm_region_files[1],
    col.names = vector_field,
    header = FALSE,
    sep = "\t"
  )
  col_names <- names(df)
  df <- subset(df, TRUE, c("chr", "left", "right", "strand"))
  
  if (!is.null(regions)) {
    df_list <- lapply(gdm_region_files, function(x, regions, vector_field) {
      region_frame <- data.table::fread(
        x,
        col.names = vector_field,
        header = FALSE,
        sep = "\t"
      )
      col_names <- names(region_frame)
      # delete column not choosen by input
      if (!is.null(regions)) {
        col_names <- col_names[col_names %in% regions]
      }
      
      if (length(col_names)) {
        r <- subset(region_frame, TRUE, col_names)
      }
    }, regions, vector_field)
    
    df_only_regions <- dplyr::bind_cols(df_list)
    complete_df <- dplyr::bind_cols(df, df_only_regions)
    
    region_names <- names(complete_df)[-(seq_len(4))]
    region_names <- gsub("[0-9]+", "", region_names)
    region_names <- paste(region_names, suffixes, sep = ".")
    region_names <- c(names(complete_df)[(seq_len(4))], region_names)
    names(complete_df) <- region_names
    g <- GenomicRanges::makeGRangesFromDataFrame(
      complete_df,
      keep.extra.columns = TRUE,
      start.field = "left",
      end.field = "right"
    )
  }
  else {
    g <- GenomicRanges::makeGRangesFromDataFrame(
      df,
      keep.extra.columns = TRUE,
      start.field = "left",
      end.field = "right"
    )
  }
}