#' Function read #' #' It reads a GMQL dataset, as a folder containing some homogenus samples on #' disk or as a GRangesList, saving it in Scala memory in a way that can be #' referenced in R. It is also used to read a repository dataset in case of #' remote processing. #' #' @importFrom rJava J .jnull .jarray #' @importFrom methods is #' #' @param dataset folder path for GMQL dataset or dataset name on repository #' @param parser string used to parsing dataset files. #' The Parsers available are: #' \itemize{ #' \item{BedParser} #' \item{BroadPeakParser} #' \item{NarrowPeakParser} #' \item{CustomParser} #' } #' Default is CustomParser. #' @param is_local logical value indicating local or remote dataset #' #' @return GMQLDataset object. It contains the value to use as input #' for the subsequent GMQLDataset method #' #' @details #' Normally, a GMQL dataset contains an XML schema file that contains #' name of region attributes. (e.g chr, start, stop, strand) #' The CustomParser reads this XML schema; #' if you already know what kind of schema your files have, use one of the #' parsers defined, without reading any XML schema. #' #' If GRangesList has no metadata: i.e. metadata() is empty, two metadata are #' generated: #' \itemize{ #' \item{"provider" = "PoliMi"} #' \item{"application" = "RGMQL"} #' } #' #' NOTE: #' The folder layout must obey the following rules and adopt #' the following layout: #' The dataset folder can have any name, but must contains the #' sub-folders named: "files". #' The sub-folder "files" contains the dataset files and #' the schema xml file. #' The schema files adopt the following the naming conventions: #' #' - "schema.xml" #' - "test.schema" #' #' The names must be in LOWERCASE. Any other schema file #' will not be conisdered, if both are present, "test.schema" will be used. #' #' @examples #' #' ## This statement initializes and runs the GMQL server for local execution #' ## and creation of results on disk. Then, with system.file() it defines #' ## the path to the folder "DATASET" in the subdirectory "example" #' ## of the package "RGMQL" and opens such folder as a GMQL dataset #' ## named "data" using CustomParser #' #' init_gmql() #' test_path <- system.file("example", "DATASET", package = "RGMQL") #' data = read_gmql(test_path) #' #' ## This statement opens such folder as a GMQL dataset named "data" using #' ## "NarrowPeakParser" #' dataPeak = read_gmql(test_path,"NarrowPeakParser") #' #' ## This statement reads a remote public dataset stored into GMQL system #' ## repository. For a public dataset in a (remote) GMQL repository the #' ## prefix "public." is needed before dataset name #' #' remote_url = "http://www.gmql.eu/gmql-rest/" #' login_gmql(remote_url) #' data1 = read_gmql("public.Example_Dataset_1", is_local = FALSE) #' #' #' @name read_gmql #' @rdname read-function #' @export #' read_gmql <- function( dataset, parser = "CustomParser", is_local = TRUE ) { .check_input(dataset) .check_logical(is_local) WrappeR <- J("it/polimi/genomics/r/Wrapper") parser_name <- .check_parser(parser) if(is_local) { if(!dir.exists(dataset)) stop("folder does not exist") dataset <- sub("/*[/]$","",dataset) if(basename(dataset) !="files") dataset <- file.path(dataset,"files") schema_XML <- .retrieve_schema(dataset, TRUE) schema_matrix <- .jnull("java/lang/String") url <- .jnull("java/lang/String") coords_sys <- .jnull("java/lang/String") type <- .jnull("java/lang/String") } else { if(!exists("GMQL_credentials", envir = .GlobalEnv)) stop("You have to log on using login function") credential <- get("GMQL_credentials", envir = .GlobalEnv) url <- credential$remote_url if(is.null(url)) stop("You have to log on using login function") if(identical(parser_name,"CUSTOMPARSER")) { list <- show_schema(url, dataset) coords_sys <- list$coordinate_system type <- list$type schema_names <- vapply( list$fields, function(x){ x$name },character(1) ) schema_type <- vapply( list$fields, function(x){ x$type },character(1) ) schema_matrix <- cbind(schema_names,schema_type) if(is.null(schema_matrix) || !length(schema_matrix)) schema_matrix <- .jnull("java/lang/String") else schema_matrix <- .jarray(schema_matrix, dispatch = TRUE) } else schema_matrix <- .jnull("java/lang/String") schema_XML <- .jnull("java/lang/String") } response <- WrappeR$readDataset( dataset, parser_name, is_local, schema_matrix, schema_XML, coords_sys, type ) error <- strtoi(response[1]) data <- response[2] if(error) stop(data) else GMQLDataset(data) } #' @importFrom S4Vectors metadata #' @importFrom rJava J .jarray #' #' @param samples GRangesList #' #' @name read_gmql #' @rdname read-function #' @export #' read_GRangesList <- function(samples) { if(!is(samples,"GRangesList")) stop("only GrangesList") meta <- S4Vectors::metadata(samples) if(is.null(meta) || !length(meta)) { #repeat meta for each sample in samples list len <- length(samples) warning( "No metadata.\nWe provide two metadata for you: \n1.provider = PoliMi\n2.application = RGMQL\n" ) index_meta <- rep(seq_len(len),each = len) rep_meta <- rep( c("provider","PoliMi", "application", "RGMQL"), times = len ) meta_matrix <- matrix(rep_meta,ncol = 2,byrow = TRUE) meta_matrix <- cbind(index_meta,meta_matrix) } else { unlist_meta <- unlist(meta) names_meta <- names(unlist_meta) group_names <- gsub(".*_([0-9]*)\\..*","\\1", names_meta) names(unlist_meta) <- NULL meta_matrix <- cbind(group_names,names_meta,unlist_meta) } df <- data.frame(samples) df <- df[-2] #delete group_name len_df <- dim(df)[1] # number of rows col_types <- vapply(df,class,character(1)) col_names <- names(col_types) #re order the schema? # if GTF, change if("phase" %in% col_names) { col_names <- plyr::revalue( col_names,c(type = "feature", phase = "frame", seqnames = "seqname") ) schema_matrix <- cbind(toupper(col_types),col_names) schema_matrix<- schema_matrix[ setdiff(rownames(schema_matrix), c("group","width")),] } else { col_names <- plyr::revalue( col_names, c(start = "left", end = "right", seqnames = "chr") ) schema_matrix <- cbind(col_names,toupper(col_types)) df$start = df$start - 1 schema_matrix<- schema_matrix[ setdiff(rownames(schema_matrix),c("group","width")),] } region_matrix <- as.matrix(vapply(df, as.character,character(len_df))) region_matrix[is.na(region_matrix)] <- "NA" region_matrix <- region_matrix[,setdiff(colnames(region_matrix),"width")] rownames(schema_matrix) <- NULL colnames(schema_matrix) <- NULL schema_matrix <- .jarray(schema_matrix,dispatch = TRUE) meta_matrix <- .jarray(meta_matrix,dispatch = TRUE) region_matrix <- .jarray(region_matrix,dispatch = TRUE) WrappeR <- J("it/polimi/genomics/r/Wrapper") response <- WrappeR$read( meta_matrix, region_matrix, schema_matrix, "default", "TAB" ) GMQLDataset(response) } .check_parser <- function(parser) { parser <- toupper(parser) parsers <- c( "BEDPARSER", "BROADPEAKPARSER", "NARROWPEAKPARSER", "CUSTOMPARSER" ) if(!parser %in% parsers) stop("parser not defined") parser }