R/gmql_materialize.R
43d9e4a1
 #' GMQL Function: EXECUTE
 #'
83eb0624
 #' It executes GMQL query.
5b358a31
 #' The function works only after invoking at least one collect
43d9e4a1
 #' 
 #' @importFrom rJava J
 #' 
 #' @return None
 #'
 #' @examples
18697071
 #' ## This statement initializes and runs the GMQL server for local execution 
83eb0624
 #' ## and creation of results on disk. Then, with system.file() it defines 
18697071
 #' ## the path to the folder "DATASET" in the subdirectory "example" 
83eb0624
 #' ## of the package "RGMQL" and opens such folder as a GMQL dataset 
 #' ## named "data"
 #' 
94a33e57
 #' init_gmql()
18697071
 #' test_path <- system.file("example", "DATASET", package = "RGMQL")
 #' data = read_gmql(test_path)
83eb0624
 #' 
18697071
 #' ## The following statement materializes the dataset "data", previoulsy read, 
 #' ## at the specific destination test_path into local folder "ds1" opportunely 
 #' ## created
83eb0624
 #' 
 #' collect(data, dir_out = test_path)
43d9e4a1
 #' 
83eb0624
 #' ## This statement executes GMQL query.
43d9e4a1
 #' \dontrun{
83eb0624
 #' 
43d9e4a1
 #' execute()
 #' }
 #' @export
 #'
73652f4d
 execute <- function() {
   WrappeR <- J("it/polimi/genomics/r/Wrapper")
   remote_proc <- WrappeR$is_remote_processing()
   datasets <- .jevalArray(WrappeR$get_dataset_list(), simplify = TRUE)
   
   if(!remote_proc)
     .download_or_upload(datasets)
   
   response <- WrappeR$execute()
   error <- strtoi(response[1])
   val <- response[2]
   if(error)
     stop(val)
   else {
     if(remote_proc) {
       isGTF <- FALSE
       outformat <- WrappeR$outputMaterialize()
       if(identical(outformat, "gtf"))
         isGTF <- TRUE
       
       url <- WrappeR$get_url()
       .download_or_upload(datasets)
       res <- serialize_query(url,isGTF,val)
43d9e4a1
     }
73652f4d
   }
43d9e4a1
 }
 
c9b073a5
 .download_or_upload <- function(datasets) {
73652f4d
   WrappeR <- J("it/polimi/genomics/r/Wrapper")
   data_list <- apply(datasets, 1, as.list)
   url <- WrappeR$get_url()
   remote <- WrappeR$is_remote_processing()
   if(remote) {
     lapply(data_list,function(x){
       if(!is.null(x[[1]]) && !is.na(x[[1]]))
709330ce
         upload_dataset(url,x[[2]],x[[1]],x[[3]])})
73652f4d
   } else {
     lapply(data_list,function(x){
       if(!is.null(x[[2]]) && !is.na(x[[2]]))
         download_dataset(url,x[[2]],x[[1]])})
   }
43d9e4a1
 }
 
4ad24353
 collect.GMQLDataset <- function(x, dir_out = getwd(), name = "ds1") {
73652f4d
   ptr_data <- value(x)
   gmql_materialize(ptr_data, dir_out, name)
e5131ba8
 }
07f76a54
 
 
5b358a31
 #' Method collect
43d9e4a1
 #'
e5131ba8
 #' @description Wrapper to GMQL MATERIALIZE operator
 #' 
 #' @description It saves the content of a dataset that contains samples 
 #' metadata and regions. It is normally used to persist the content of any 
 #' dataset generated during a GMQL query.
94a33e57
 #' Any dataset can be materialized, but the operation can be time-consuming.
43d9e4a1
 #' For best performance, materialize the relevant data only.
 #'
 #' @importFrom rJava J
e5131ba8
 #' @importFrom dplyr collect
43d9e4a1
 #' 
fce655ee
 #' @param x GMQLDataset class object
73652f4d
 #' @param name name of the result dataset. By default it is the string "ds1"
18697071
 #' @param dir_out destination folder path. By default it is the current 
 #' working directory of the R process
65260a4d
 #' 
e5131ba8
 #' @details 
 #' 
 #' An error occures if the directory already exist at the destination
 #' folder path
65260a4d
 #' 
43d9e4a1
 #' @return None
 #'
 #' @examples
e5131ba8
 #' 
 #' ## This statement initializes and runs the GMQL server for local execution 
 #' ## and creation of results on disk. Then, with system.file() it defines 
 #' ## the path to the folder "DATASET" in the subdirectory "example"
 #' ## of the package "RGMQL" and opens such file as a GMQL dataset named 
dda0cfa0
 #' ## "data" using CustomParser
43d9e4a1
 #'
94a33e57
 #' init_gmql()
e5131ba8
 #' test_path <- system.file("example", "DATASET", package = "RGMQL")
18697071
 #' data = read_gmql(test_path)
e5131ba8
 #' 
18697071
 #' ## The following statement materializes the dataset 'data', previoulsy read, 
 #' ## at the specific destination test_path into local folder "ds1" opportunely 
 #' ## created
e5131ba8
 #' 
 #' collect(data, dir_out = test_path)
43d9e4a1
 #' 
e5131ba8
 #' @name collect
 #' @rdname collect
 #' @aliases collect,GMQLDataset-method
fce655ee
 #' @aliases collect-method
43d9e4a1
 #' @export
e5131ba8
 setMethod("collect", "GMQLDataset",collect.GMQLDataset)
07f76a54
 
73652f4d
 gmql_materialize <- function(input_data, name, dir_out) {
   WrappeR <- J("it/polimi/genomics/r/Wrapper")
   remote_proc <- WrappeR$is_remote_processing()
   
   if(grepl("\\.",name))
     stop("dataset name cannot contains dot")
   
   if(!remote_proc) {
     dir_out <- sub("/*[/]$","",dir_out)
     res_dir_out <- file.path(dir_out,name)
     if(!dir.exists(res_dir_out))
       dir.create(res_dir_out)
   }
   else
     res_dir_out <- name
   
   response <- WrappeR$materialize(input_data, res_dir_out)
   error <- strtoi(response[1])
   val <- response[2]
   if(error)
     stop(val)
   else
     invisible(NULL)
43d9e4a1
 }
 
 
5b358a31
 #' Method take
43d9e4a1
 #'
83eb0624
 #' It saves the content of a dataset that contains samples metadata 
dda0cfa0
 #' and regions as GRangesList.
83eb0624
 #' It is normally used to store in memory the content of any dataset 
5b358a31
 #' generated during a GMQL query. The operation can be very time-consuming.
dda0cfa0
 #' If you invoked any materialization before take function, 
83eb0624
 #' all those datasets are materialized as folders.
43d9e4a1
 #'
83eb0624
 #' @importFrom GenomicRanges makeGRangesFromDataFrame
 #' @importFrom S4Vectors metadata
43d9e4a1
 #' @importFrom stats setNames
83eb0624
 #' @importFrom rJava J .jevalArray
 #' @importFrom GenomicRanges GRangesList
43d9e4a1
 #' 
f345b6e2
 #' @param .data returned object from any GMQL function
18697071
 #' @param rows number of regions rows for each sample that you want to 
83eb0624
 #' retrieve and store in memory.
18697071
 #' By default it is 0, that means take all rows for each sample
65260a4d
 #' 
83eb0624
 #' @param ... Additional arguments for use in other specific methods of the 
 #' generic take function
65260a4d
 #' 
83eb0624
 #' @return GRangesList with associated metadata
43d9e4a1
 #'
 #' @examples
83eb0624
 #' ## This statement initializes and runs the GMQL server for local execution 
 #' ## and creation of results on disk. Then, with system.file() it defines 
 #' ## the path to the folder "DATASET" in the subdirectory "example"
 #' ## of the package "RGMQL" and opens such folder as a GMQL dataset 
dda0cfa0
 #' ## named "rd" using CustomParser
83eb0624
 #' 
94a33e57
 #' init_gmql()
 #' test_path <- system.file("example", "DATASET", package = "RGMQL")
18697071
 #' rd = read_gmql(test_path)
 #' 
dda0cfa0
 #' ## This statement creates a dataset called 'aggr' which contains one 
18697071
 #' ## sample for each antibody_target and cell value found within the metadata 
dda0cfa0
 #' ## of the 'rd' dataset sample; each created sample contains all regions 
 #' ## from all 'rd' samples with a specific value for their 
18697071
 #' ## antibody_target and cell metadata attributes.
 #'  
 #' aggr = aggregate(rd, conds(c("antibody_target", "cell")))
 #' 
dda0cfa0
 #' ## This statement performs the query and returns the resulted dataset as 
18697071
 #' ## GRangesList named 'taken'. It returns only the first 45 regions of 
dda0cfa0
 #' ## each sample present into GRangesList and all the medatata associated 
 #' ## with each sample
83eb0624
 #' 
65260a4d
 #' taken <- take(aggr, rows = 45)
43d9e4a1
 #' 
e5131ba8
 #' @name take
 #' @rdname take
65260a4d
 #' @aliases take-method
43d9e4a1
 #' @export
65260a4d
 setMethod("take", "GMQLDataset",
73652f4d
           function(.data, rows = 0L)
           {
             ptr_data <- value(.data)
             gmql_take(ptr_data, rows)
           })
65260a4d
 
4ad24353
 gmql_take <- function(input_data, rows) {
73652f4d
   rows <- as.integer(rows[1])
   if(rows<0)
     stop("rows cannot be negative")
   
   WrappeR <- J("it/polimi/genomics/r/Wrapper")
   response <- WrappeR$take(input_data, rows)
   error <- strtoi(response[1])
   data <- response[2]
   if(error)
     stop(data)
   
   reg <- .jevalArray(WrappeR$get_reg(),simplify = TRUE)
   if(is.null(reg))
     stop("no regions defined")
   meta <- .jevalArray(WrappeR$get_meta(),simplify = TRUE)
   if(is.null(meta))
     stop("no metadata defined")
   schema <- .jevalArray(WrappeR$get_schema(),simplify = TRUE)
   if(is.null(schema))
     stop("no schema defined")
   
   reg_data_frame <- as.data.frame(reg)
   if (!length(reg_data_frame)){
     return(GRangesList())
   }
   list <- split(reg_data_frame, reg_data_frame[1])
   seq_name <- c("seqname","start","end","strand",schema)
   
   sampleList <- lapply(list, function(x){
     x <- x[-1]
     names(x) <- seq_name
4ad24353
     #    start_numeric = as.numeric(levels(x$start))[x$start]
73652f4d
     start_numeric = as.numeric(x$start)
     start_numeric = start_numeric + 1
     x$start =  start_numeric
     #levels(x$start)[x$start] = start_numeric
     g <- GenomicRanges::makeGRangesFromDataFrame(
       x,
       keep.extra.columns = TRUE,
       start.field = "start",
       end.field = "end")
   })
   
   gRange_list <- GRangesList(sampleList)
   len = length(gRange_list)
   names(gRange_list) <- paste0("S_",seq_len(len))
   meta_list <- .metadata_from_frame_to_list(meta)
   names(meta_list) <- paste0("S_",seq_len(len))
   S4Vectors::metadata(gRange_list) <- meta_list
   return(gRange_list)
43d9e4a1
 }
 
4ad24353
 .metadata_from_frame_to_list <- function(metadata_frame) {
73652f4d
   meta_frame <- as.data.frame(metadata_frame)
   list <- split(meta_frame, meta_frame[1])
   name_value_list <- lapply(list, function(x){x <- x[-1]})
   meta_list <- lapply(name_value_list, function(x){
     stats::setNames(as.list(as.character(x[[2]])), x[[1]])
   })
43d9e4a1
 }