... | ... |
@@ -26,6 +26,16 @@ |
26 | 26 |
#' |
27 | 27 |
execute <- function() |
28 | 28 |
{ |
29 |
+ remote_proc <- WrappeR$is_remote_processing() |
|
30 |
+ array_dataset <- WrappeR$list_dataset_upload_download() |
|
31 |
+ if(remote_proc) |
|
32 |
+ { |
|
33 |
+ |
|
34 |
+ } |
|
35 |
+ else |
|
36 |
+ { |
|
37 |
+ |
|
38 |
+ } |
|
29 | 39 |
out <- WrappeR$execute() |
30 | 40 |
if(grepl("OK",out,ignore.case = TRUE)) |
31 | 41 |
print("Executed") |
... | ... |
@@ -66,7 +76,9 @@ materialize <- function(input_data, dir_out = getwd()) |
66 | 76 |
if(grepl("No",out,ignore.case = TRUE)) |
67 | 77 |
stop(out) |
68 | 78 |
else |
79 |
+ { |
|
69 | 80 |
invisible(NULL) |
81 |
+ } |
|
70 | 82 |
} |
71 | 83 |
|
72 | 84 |
#' GMQL Operation: TAKE |
... | ... |
@@ -59,14 +59,10 @@ initGMQL <- function(output_format="gtf", remote_processing = FALSE) |
59 | 59 |
#' } |
60 | 60 |
#' Default is CustomParser. |
61 | 61 |
#' @param is_local single logical value indicating local or remote dataset |
62 |
-#' if the remote processing is off you cannot set is_local=FALSE (an error occures) |
|
62 |
+#' @param is_GMQL single logical value indicating if dataset is GMQL dataset or not |
|
63 | 63 |
#' @param url single string url of server: it must contain the server address and base url; |
64 | 64 |
#' service name will be added automatically |
65 | 65 |
#' useful only in remote processing |
66 |
-#' @param override single logical value used in order to determine the overriding of reading |
|
67 |
-#' dataset into repository, if an other dataset with the same name already exist into repostiory |
|
68 |
-#' and override value is FALSE an error occures. |
|
69 |
-#' useful only in remote processing |
|
70 | 66 |
#' |
71 | 67 |
#' @importFrom methods is |
72 | 68 |
#' |
... | ... |
@@ -87,6 +83,7 @@ initGMQL <- function(output_format="gtf", remote_processing = FALSE) |
87 | 83 |
#' r = readDataset(test_path) |
88 | 84 |
#' |
89 | 85 |
#' \dontrun{ |
86 |
+#' |
|
90 | 87 |
#' ### local with other Parser |
91 | 88 |
#' initGMQL("gtf") |
92 | 89 |
#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
... | ... |
@@ -95,52 +92,25 @@ initGMQL <- function(output_format="gtf", remote_processing = FALSE) |
95 | 92 |
#' |
96 | 93 |
#' @export |
97 | 94 |
#' |
98 |
-readDataset <- function(dataset, parser = "CustomParser",is_local=TRUE,url=NULL, override= FALSE) |
|
95 |
+readDataset <- function(dataset, parser = "CustomParser",is_local=TRUE, |
|
96 |
+ is_GMQL=TRUE, url=NULL) |
|
99 | 97 |
{ |
100 |
- remote_proc <- WrappeR$is_remote_processing() |
|
101 |
- if(!remote_proc && !is_local) |
|
102 |
- stop("you cannot use local processing with remote repository") |
|
103 |
- |
|
104 |
- if(!is.character(dataset) || length(dataset) >1) |
|
105 |
- stop("dataset: invalid input or length > 1") |
|
106 |
- |
|
107 |
- if(!is.logical(override) || length(override) >1) |
|
108 |
- stop("override: invalid input or length > 1") |
|
109 |
- |
|
110 |
- if(!is.logical(is_local) || length(is_local) >1) |
|
111 |
- stop("is_local: invalid input or length > 1") |
|
98 |
+ .check_input(dataset) |
|
99 |
+ .check_logical(is_local) |
|
100 |
+ .check_logical(is_GMQL) |
|
112 | 101 |
|
113 | 102 |
if(is_local) |
114 | 103 |
{ |
115 | 104 |
if(!dir.exists(dataset)) |
116 | 105 |
stop("folder does not exist") |
117 |
- |
|
118 |
- remote_proc <- WrappeR$is_remote_processing() |
|
119 |
- if(remote_proc) |
|
120 |
- { |
|
121 |
- if(override) |
|
122 |
- { |
|
123 |
- list <- showDatasets(url) |
|
124 |
- name_dataset <- basename(dataset) |
|
125 |
- if(name_dataset %in% unlist(list$datasets)) |
|
126 |
- deleteDataset(url,name_dataset) |
|
127 |
- } |
|
128 |
- else |
|
129 |
- { |
|
130 |
- list <- showDatasets(url) |
|
131 |
- name_dataset <- basename(dataset) |
|
132 |
- if(name_dataset %in% unlist(list$datasets)) |
|
133 |
- stop("dataset already exist in repository") |
|
134 |
- } |
|
135 |
- uploadSamples(url,name_dataset,dataset,isGMQL = TRUE) |
|
136 |
- } |
|
137 | 106 |
|
138 | 107 |
schema_matrix <- scalaNull("Array[Array[String]]") |
139 | 108 |
schema_type <- scalaNull("String") |
140 | 109 |
} |
141 | 110 |
else |
142 | 111 |
{ |
143 |
- list <- showSchemaFromDataset(url,name_dataset) |
|
112 |
+ #name_dataset <- basename(dataset) |
|
113 |
+ list <- showSchemaFromDataset(url,dataset) |
|
144 | 114 |
schema_names <- sapply(list$fields, function(x){x$name}) |
145 | 115 |
schema_type <- sapply(list$fields, function(x){x$fieldType}) |
146 | 116 |
schema_matrix <- cbind(schema_type,schema_names) |
... | ... |
@@ -149,7 +119,7 @@ readDataset <- function(dataset, parser = "CustomParser",is_local=TRUE,url=NULL, |
149 | 119 |
|
150 | 120 |
parser_name <- .check_parser(parser) |
151 | 121 |
|
152 |
- out <- WrappeR$readDataset(dataset,parser_name,is_local,schema_matrix) |
|
122 |
+ out <- WrappeR$readDataset(dataset,parser_name,is_local,is_GMQL,schema_matrix) |
|
153 | 123 |
if(grepl("File",out,ignore.case = TRUE) || grepl("No",out,ignore.case = TRUE)) |
154 | 124 |
stop(out) |
155 | 125 |
else |
... | ... |
@@ -262,4 +232,23 @@ remote_processing<-function(is_remote) |
262 | 232 |
} |
263 | 233 |
|
264 | 234 |
|
235 |
+# remote_proc <- WrappeR$is_remote_processing() |
|
236 |
+# if(remote_proc) |
|
237 |
+# { |
|
238 |
+# if(override) |
|
239 |
+# { |
|
240 |
+# list <- showDatasets(url) |
|
241 |
+# name_dataset <- basename(dataset) |
|
242 |
+# if(name_dataset %in% unlist(list$datasets)) |
|
243 |
+# deleteDataset(url,name_dataset) |
|
244 |
+# } |
|
245 |
+# else |
|
246 |
+# { |
|
247 |
+# list <- showDatasets(url) |
|
248 |
+# name_dataset <- basename(dataset) |
|
249 |
+# if(name_dataset %in% unlist(list$datasets)) |
|
250 |
+# stop("dataset already exist in repository") |
|
251 |
+# } |
|
252 |
+# uploadSamples(url,name_dataset,dataset,isGMQL = TRUE) |
|
253 |
+# } |
|
265 | 254 |
|
... | ... |
@@ -112,7 +112,23 @@ |
112 | 112 |
envirs[xin] |
113 | 113 |
} |
114 | 114 |
|
115 |
# Validate that `value` is a single character string.
#
# value: object to validate.
#
# Returns invisible NULL when `value` is a character vector of length one;
# otherwise signals an error through stop().
.check_input <- function(value)
{
    if(!is.character(value))
        stop("no valid data")

    # a zero-length vector is not a single string either: reject it
    # explicitly instead of letting it pass validation
    if(length(value) == 0)
        stop("no valid data")

    if(length(value) > 1)
        stop("no multiple string")

    invisible(NULL)
}
|
115 | 123 |
|
124 |
# Validate that `value` is a single logical value.
#
# value: object to validate.
#
# Returns invisible NULL when `value` is a logical vector of length one;
# otherwise signals an error through stop().
.check_logical <- function(value)
{
    if(!is.logical(value))
        stop("no valid data")

    # a zero-length vector is not a single flag either: reject it
    # explicitly instead of letting it pass validation
    if(length(value) == 0)
        stop("no valid data")

    # the message talks about values, not strings: the input is logical
    if(length(value) > 1)
        stop("no multiple values")

    invisible(NULL)
}
|
116 | 132 |
|
117 | 133 |
# if(!is.null(groupBy)) |
118 | 134 |
#{ |
... | ... |
@@ -3,6 +3,9 @@ |
3 | 3 |
# biocLite("GMQL") |
4 | 4 |
|
5 | 5 |
## ---- initialization, eval=FALSE----------------------------------------- |
6 |
+# library('GMQL') |
|
7 |
+ |
|
8 |
+## ---- init, eval=FALSE--------------------------------------------------- |
|
6 | 9 |
# initGMQL() |
7 | 10 |
|
8 | 11 |
## ----read GMQL dataset, eval=FALSE--------------------------------------- |
... | ... |
@@ -14,6 +17,9 @@ |
14 | 17 |
# login.GMQL(test_url) |
15 | 18 |
# downloadDataset(test_url,"dataset_test",path = getwd()) |
16 | 19 |
|
20 |
+## ----read remote dataset, eval=FALSE------------------------------------- |
|
21 |
+# data_out = readDataset("dataset_name_on_repo") |
|
22 |
+ |
|
17 | 23 |
## ---- read GRangesList, eval=FALSE--------------------------------------- |
18 | 24 |
# gr1 <- GRanges(seqnames = "chr2", |
19 | 25 |
# ranges = IRanges(103, 106), |
... | ... |
@@ -28,9 +34,37 @@ |
28 | 34 |
# grl <- GRangesList("txA" = gr1, "txB" = gr2) |
29 | 35 |
# data_out <- read(grl) |
30 | 36 |
|
37 |
+## ----query, eval=FALSE--------------------------------------------------- |
|
38 |
+# initGMQL("gtf") |
|
39 |
+# test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
40 |
+# input = readDataset(test_path) |
|
41 |
+# |
|
42 |
+# ## it selects from input data samples of patients younger than 70 years old, |
|
43 |
+# ## based on filtering on sample metadata attribute Patient_age |
|
44 |
+# s=select(input,"Patient_age < 70") |
|
45 |
+# |
|
46 |
+# ## it counts the regions in each sample and stores their number as value of the new metadata |
|
47 |
+# ## RegionCount attribute of the sample. |
|
48 |
+# e = extend(input_data = s, list(RegionCount = COUNT())) |
|
49 |
+# |
|
50 |
+# ## materialize the result dataset on disk |
|
51 |
+# m = materialize(e) |
|
52 |
+ |
|
53 |
+## ----execute, eval=FALSE------------------------------------------------- |
|
54 |
+# execute() |
|
55 |
+ |
|
56 |
+## ----take, eval=FALSE---------------------------------------------------- |
|
57 |
+# g <- take(input_data = m, rows = 45) |
|
58 |
+ |
|
31 | 59 |
## ---- eval=TRUE---------------------------------------------------------- |
32 | 60 |
library("GMQL") |
33 | 61 |
|
34 | 62 |
test_url = "http://130.186.13.219/gmql-rest" |
35 | 63 |
login.GMQL(test_url) |
36 | 64 |
|
65 |
+## ---- eval=FALSE--------------------------------------------------------- |
|
66 |
+# test_url = "http://130.186.13.219/gmql-rest" |
|
67 |
+# login.GMQL(test_url) |
|
68 |
+# runQuery(test_url, "query_1", "DATA_SET_VAR = SELECT() HG19_TCGA_dnaseq; |
|
69 |
+# MATERIALIZE DATA_SET_VAR INTO RESULT_DS;", output_gtf = FALSE) |
|
70 |
+ |
... | ... |
@@ -2,22 +2,26 @@ |
2 | 2 |
title: "GMQL: GenoMetrics Query Language" |
3 | 3 |
author: "Simone Pallotta" |
4 | 4 |
date: "`r Sys.Date()`" |
5 |
+bibliography: bibliography.bib |
|
5 | 6 |
output: BiocStyle::pdf_document |
6 | 7 |
vignette: > |
7 | 8 |
%\VignetteIndexEntry{GMQL: GenoMetrics Query Language} |
8 | 9 |
%\VignetteEngine{knitr::rmarkdown} |
9 | 10 |
%\VignetteEncoding{UTF-8} |
11 |
+link-citations: true |
|
10 | 12 |
--- |
11 | 13 |
|
12 | 14 |
# Introduction |
13 | 15 |
|
14 |
-Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions.\newline |
|
16 |
+Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions. |
|
15 | 17 |
For this purpose, a high-level, declarative GenoMetric Query Language (GMQL) and a toolkit for its use have been proposed. |
16 | 18 |
|
17 | 19 |
## Purpose |
18 | 20 |
|
19 |
-This package provides a set of functions to create, manipulate and extract genomic data from different datasources from local and remote datasets.\newline |
|
20 |
-Also, these functios allow performing complex queries without the knowledge of GMQL syntax. |
|
21 |
+GMQL operations focus on genomic domain-specific operations written as simple queries with implicit iterations over thousands of heterogeneous samples, computed in few minutes over servers [@IEEEACM7484654]. |
|
22 |
+This package provides a set of functions to create, manipulate and extract genomic data from different datasources both from local and remote datasets. |
|
23 |
+Also, these functions allow performing complex queries without knowledge of GMQL syntax. |
|
24 |
+ |
|
21 | 25 |
|
22 | 26 |
# Dataset |
23 | 27 |
|
... | ... |
@@ -25,19 +29,16 @@ We usually distinguish two kinds of dataset layout:\newline |
25 | 29 |
These contains large number of information describing regions of genome.\newline |
26 | 30 |
Data are encoded in human readable format using plain text file. |
27 | 31 |
|
28 |
-- GMQL standard layout :\newline\newline |
|
29 |
- Dataset is composed basically of three type of file: |
|
30 |
- |
|
31 |
- 1) region files usually terminating in .gtf or .gdm |
|
32 |
- 2) metadata files terminating in .meta |
|
33 |
- 3) schema XML file containing regions attributes |
|
34 |
-\newline\newline |
|
35 |
- Each region sample file owns its metadata file. |
|
36 |
- All these files must reside in unique folder called files. |
|
32 |
+* GMQL standard layout:\newline\newline |
|
33 |
+ A GMQL dataset is a collection of samples with the same region schema; it is basically composed of three types of files: |
|
34 |
+ 1. region files usually terminating in .gtf or .gdm |
|
35 |
+ 2. metadata files terminating in .meta |
|
36 |
+ 3. schema XML file containing regions attributes |
|
37 |
+ Each region sample file owns its metadata file. All these files must reside in unique folder called files. |
|
37 | 38 |
|
38 |
- |
|
39 |
+ |
|
39 | 40 |
|
40 |
-- Generic text based dataset:\newline\newline |
|
41 |
+* Generic text based dataset:\newline\newline |
|
41 | 42 |
Dataset composed by heterogeneous sample organised in simple text files probably |
42 | 43 |
stem from different medical, biological systems. |
43 | 44 |
Sample files are simply contained on a folder whose name must be |
... | ... |
@@ -45,17 +46,22 @@ Data are encoded in human readable format using plain text file. |
45 | 46 |
\newline |
46 | 47 |
|
47 | 48 |
In our package dataset files are considered read-only. |
48 |
-Once read genomic information is represented in abstract structure inside |
|
49 |
+Once read, genomic information is represented in abstract structure inside |
|
49 | 50 |
package. |
50 | 51 |
|
52 |
+# Genomic Data Model |
|
53 |
+ |
|
54 |
+The proposed Genomic Data Model (GDM) is based on the notions of datasets and samples; datasets are collections of samples, and each sample consists of two parts, the region data, which describe portions of the DNA, and the metadata, which describe sample general properties [@IEEEACM7484654]. |
|
51 | 55 |
|
52 | 56 |
# Basic Requirements |
57 |
+ |
|
58 |
+The GMQL package requires: |
|
53 | 59 |
|
54 |
-- javaSE version 8 |
|
55 |
-- java environment correctly set (i.e JAVA_HOME) |
|
56 |
-- scala version 2.11.8 |
|
57 |
-- scala environment correctly set (i.e SCALA_HOME) |
|
58 |
-- network connectivity to web services (if required) |
|
60 |
+* javaSE version 8 |
|
61 |
+* java environment correctly set (i.e JAVA_HOME) |
|
62 |
+* scala version 2.11.8 |
|
63 |
+* scala environment correctly set (i.e SCALA_HOME) |
|
64 |
+* network connectivity to web services (if required) |
|
59 | 65 |
|
60 | 66 |
# How to Install |
61 | 67 |
|
... | ... |
@@ -67,36 +73,42 @@ biocLite("GMQL") |
67 | 73 |
|
68 | 74 |
# Processing Environments |
69 | 75 |
|
70 |
-This package allow to create, manipulate and extract genomic data from |
|
71 |
-different datasets using different processing modes. |
|
76 |
+This package allows to create, manipulate and extract genomic data from |
|
77 |
+different datasets using different processing modes both local and remote. |
|
72 | 78 |
|
73 | 79 |
## Local Processing |
74 | 80 |
|
75 | 81 |
Query processing consumes computational power directly from local CPUs/system while |
76 | 82 |
managing datasets (both GMQL or generic text plain dataset). |
77 | 83 |
|
78 |
-### Initialisation |
|
84 |
+### Initialization |
|
79 | 85 |
|
86 |
+Load and attach the GMQL package in an R session using library function: |
|
87 |
+```{r, initialization, eval=FALSE} |
|
88 |
+library('GMQL') |
|
89 |
+``` |
|
80 | 90 |
Before starting using any GMQL operation we need to initialise the GMQL context |
81 | 91 |
with the following code: |
82 |
-```{r, initialization, eval=FALSE} |
|
92 |
+```{r, init, eval=FALSE} |
|
83 | 93 |
initGMQL() |
84 | 94 |
``` |
85 |
-No parameter means that we are initialising the context with GTF as output format for |
|
86 |
-our regions sample files and metadata files. |
|
87 |
-Of Course, other parameter are available. |
|
95 |
+Calling initGMQL() with no parameters means we are initialising the context with GTF as output format for sample and metadata files. |
|
96 |
+Details on this and all other functions are provided in the R documentation for this package (e.g., help(GMQL)). |
|
88 | 97 |
|
89 | 98 |
### Datasource |
90 | 99 |
|
91 |
-After initialisation we need to read the dataset. |
|
92 |
-We present different source we can get the data from: \newline |
|
93 |
-we can read local GMQL dataset: |
|
100 |
+After initialization we need to read the dataset. |
|
101 |
+In the following section we show how to get data from different sources.\newline |
|
102 |
+We have four different cases: |
|
103 |
+ |
|
104 |
+1. Local GMQL dataset:\newline |
|
105 |
+As data are already in user computer, we simply execute: |
|
94 | 106 |
```{r,read GMQL dataset, eval=FALSE} |
95 | 107 |
gmql_dataset_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
96 | 108 |
data_out = readDataset(gmql_dataset_path) |
97 | 109 |
``` |
98 |
-In case of remote datasets, user have to download it locally using |
|
99 |
-specifying function: |
|
110 |
+2. Remote dataset / explicit download:\newline |
|
111 |
+User can download it locally using: |
|
100 | 112 |
```{r, download dataset, eval=FALSE} |
101 | 113 |
test_url <- "http://130.186.13.219/gmql-rest" |
102 | 114 |
login.GMQL(test_url) |
... | ... |
@@ -105,9 +117,16 @@ downloadDataset(test_url,"dataset_test",path = getwd()) |
105 | 117 |
where *test_url* is an R variable that defines the URL of the remote server where |
106 | 118 |
web services are located.\newline |
107 | 119 |
Once local, these datasets behave like local datasets, as described above. |
108 |
-\newline |
|
120 |
+ |
|
121 |
+3. Remote dataset (/ implicit download): |
|
122 |
+```{r,read remote dataset, eval=FALSE} |
|
123 |
+data_out = readDataset("dataset_name_on_repo") |
|
124 |
+``` |
|
125 |
+There is no need to explicitly download data since execution will trigger download automatically. |
|
126 |
+ |
|
127 |
+4. GrangesList:\newline |
|
109 | 128 |
Also, for better integration in R environment and with other packages, we provide a function |
110 |
-to read a GrangesList: |
|
129 |
+to read from GrangesList, for example: |
|
111 | 130 |
```{r, read GRangesList, eval=FALSE} |
112 | 131 |
gr1 <- GRanges(seqnames = "chr2", |
113 | 132 |
ranges = IRanges(103, 106), |
... | ... |
@@ -122,18 +141,52 @@ score = 3:4, GC = c(0.3, 0.5)) |
122 | 141 |
grl <- GRangesList("txA" = gr1, "txB" = gr2) |
123 | 142 |
data_out <- read(grl) |
124 | 143 |
``` |
125 |
-Every read function return a value, this value is used as first step |
|
126 |
-for execution the subsequent GMQL operation. |
|
144 |
+Every read function returns a result object containing internal details used for executing the subsequent GMQL operations. |
|
127 | 145 |
|
128 | 146 |
### Queries |
129 | 147 |
|
130 |
-Thwe core concept of GMQL package is build a query as the name *GMQL* suggest. |
|
131 |
-Unfortunatley is not the same as any query language. |
|
132 |
-the building of query is more like a batch workflow |
|
148 |
+GMQL is not a traditional DDL/DML query language: |
|
149 |
+By "query" we mean a group of operations that together produce a result; in that sense a GMQL query is more similar to an SQL script. |
|
150 |
+GMQL programming consists of a series of select, union, project, difference (and so on...) commands. |
|
151 |
+ |
|
152 |
+If you want to persist result, you can materialize as last step. |
|
153 |
+Let's see a short example: |
|
154 |
+```{r,query, eval=FALSE} |
|
155 |
+initGMQL("gtf") |
|
156 |
+test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
157 |
+input = readDataset(test_path) |
|
158 |
+ |
|
159 |
+## it selects from input data samples of patients younger than 70 years old, |
|
160 |
+## based on filtering on sample metadata attribute Patient_age |
|
161 |
+s=select(input,"Patient_age < 70") |
|
162 |
+ |
|
163 |
+## it counts the regions in each sample and stores their number as value of the new metadata |
|
164 |
+## RegionCount attribute of the sample. |
|
165 |
+e = extend(input_data = s, list(RegionCount = COUNT())) |
|
166 |
+ |
|
167 |
+## materialize the result dataset on disk |
|
168 |
+m = materialize(e) |
|
169 |
+``` |
|
133 | 170 |
|
134 | 171 |
### Execution |
135 | 172 |
|
136 |
-## Remote Environment |
|
173 |
+GMQL processing does not store results: |
|
174 |
+They remain in the environment until you invoke *execute* function. |
|
175 |
+```{r,execute, eval=FALSE} |
|
176 |
+execute() |
|
177 |
+``` |
|
178 |
+*execute* can be issued only if at least one *materialize* is present in GMQL query, otherwise an error is generated. |
|
179 |
+Data are saved in the path specified in every *materialize*. |
|
180 |
+Beside *execute* we can use |
|
181 |
+```{r,take, eval=FALSE} |
|
182 |
+g <- take(input_data = m, rows = 45) |
|
183 |
+``` |
|
184 |
+to extract data as GRangesList format and execute all *materialize* commands. |
|
185 |
+NOTE: GRangesList are contained in R environment and are not saved on disk. |
|
186 |
+ |
|
187 |
+The *rows* parameter specifies how many rows will be exported. |
|
188 |
+ |
|
189 |
+## Remote Processing |
|
137 | 190 |
|
138 | 191 |
Query processing consumes computational power from remote clusters/system while |
139 | 192 |
managing datasets that are only GMQL dataset.\newline |
... | ... |
@@ -149,8 +202,8 @@ Remote processing exits in two flavour:\newline |
149 | 202 |
|
150 | 203 |
### REST web services |
151 | 204 |
|
152 |
-We talk about only for REST web service porocessing, because batch remote processing is |
|
153 |
-quite similar to local processing. |
|
205 |
+This package allows to invoke rest services implementing the commands specified at [link](http://130.186.13.219/gmql-rest/swagger). |
|
206 |
+ |
|
154 | 207 |
|
155 | 208 |
#### Initialization |
156 | 209 |
|
... | ... |
@@ -164,22 +217,28 @@ library("GMQL") |
164 | 217 |
test_url = "http://130.186.13.219/gmql-rest" |
165 | 218 |
login.GMQL(test_url) |
166 | 219 |
``` |
167 |
-that saves token in R environment.\newline |
|
168 |
- |
|
169 |
-### Datasource |
|
220 |
+that saves token in Global R environment with variable named *authToken*.\newline |
|
221 |
+With this token you can call all the functions in the web services suite. |
|
170 | 222 |
|
171 |
-### Queries |
|
172 |
- |
|
173 |
-### Execution |
|
174 |
- |
|
175 |
-Saved data will be stored in repository and eventually can be downloaded locally. |
|
223 |
+#### Execution |
|
176 | 224 |
|
225 |
+User can write the query as in the following example, as the second parameter of *runQuery*. |
|
226 |
+```{r, eval=FALSE} |
|
227 |
+test_url = "http://130.186.13.219/gmql-rest" |
|
228 |
+login.GMQL(test_url) |
|
229 |
+runQuery(test_url, "query_1", "DATA_SET_VAR = SELECT() HG19_TCGA_dnaseq; |
|
230 |
+ MATERIALIZE DATA_SET_VAR INTO RESULT_DS;", output_gtf = FALSE) |
|
231 |
+``` |
|
177 | 232 |
|
178 |
-# Biological Example |
|
233 |
+Once run, query continues on the server while *runQuery* returns immediately. |
|
234 |
+User can extract from result the job_id and status. |
|
235 |
+job_id can be used to invoke the log and trace calls, both available in this R package. |
|
179 | 236 |
|
180 |
-This section collects several examples where GMQL is used to answer practical questions/tasks of biological and clinical interest. |
|
181 |
-For each example, after an initial textual statement describing the question/task to be answered, |
|
182 |
-the GMQL query is reported with a detailed commented description of the query and its results. |
|
237 |
+### Batch execution |
|
183 | 238 |
|
239 |
+This function is similar to local processing (syntax, function and so on ...) except: |
|
240 |
+1. if data is local, it is uploaded to the repository implicitly |
|
241 |
+2. data are materialized only on the repository |
|
184 | 242 |
|
243 |
+# References |
|
185 | 244 |
|
... | ... |
@@ -4,8 +4,8 @@ |
4 | 4 |
\alias{readDataset} |
5 | 5 |
\title{GMQL Function: READ} |
6 | 6 |
\usage{ |
7 |
-readDataset(dataset, parser = "CustomParser", is_local = TRUE, url = NULL, |
|
8 |
- override = FALSE) |
|
7 |
+readDataset(dataset, parser = "CustomParser", is_local = TRUE, |
|
8 |
+ is_GMQL = TRUE, url = NULL) |
|
9 | 9 |
} |
10 | 10 |
\arguments{ |
11 | 11 |
\item{dataset}{single string folder path for GMQL dataset or datasetname on repository} |
... | ... |
@@ -23,17 +23,13 @@ The Parser's available are: |
23 | 23 |
} |
24 | 24 |
Default is CustomParser.} |
25 | 25 |
|
26 |
-\item{is_local}{single logical value indicating local or remote dataset |
|
27 |
-if the remote processing is off you cannot set is_local=FALSE (an error occures)} |
|
26 |
+\item{is_local}{single logical value indicating local or remote dataset} |
|
27 |
+ |
|
28 |
+\item{is_GMQL}{single logical value indicating if dataset is GMQL dataset or not} |
|
28 | 29 |
|
29 | 30 |
\item{url}{single string url of server: it must contain the server address and base url; |
30 | 31 |
service name will be added automatically |
31 | 32 |
useful only in remote processing} |
32 |
- |
|
33 |
-\item{override}{single logical value used in order to determine the overriding of reading |
|
34 |
-dataset into repository, if an other dataset with the same name already exist into repostiory |
|
35 |
-and override value is FALSE an error occures. |
|
36 |
-useful only in remote processing} |
|
37 | 33 |
} |
38 | 34 |
\value{ |
39 | 35 |
DAGgraph class object. It contains the value associated to the graph used |
... | ... |
@@ -58,6 +54,7 @@ test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
58 | 54 |
r = readDataset(test_path) |
59 | 55 |
|
60 | 56 |
\dontrun{ |
57 |
+ |
|
61 | 58 |
### local with other Parser |
62 | 59 |
initGMQL("gtf") |
63 | 60 |
test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
64 | 61 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,14 @@ |
1 |
+@article{IEEEACM7484654, |
|
2 |
+ title = {Data Management for Heterogeneous Genomic Datasets}, |
|
3 |
+ author = {Marco Masseroli and Stefano Ceri and Abdulrahman Kaitoua}, |
|
4 |
+ year = {2016}, |
|
5 |
+ url = {http://ieeexplore.ieee.org/document/7484654/} |
|
6 |
+} |
|
7 |
+ |
|
8 |
+@article{masseroli2015genometric, |
|
9 |
+ title={GenoMetric Query Language: a novel approach to large-scale genomic data management}, |
|
10 |
+ author={Masseroli, Marco and Pinoli, Pietro and Venco, Francesco and Kaitoua, Abdulrahman and Jalili, Vahid and Palluzzi, Fernando and Muller, Heiko and Ceri, Stefano}, |
|
11 |
+ journal={Bioinformatics}, |
|
12 |
+ year={2015}, |
|
13 |
+ publisher={Oxford Univ Press} |
|
14 |
+} |
|
0 | 15 |
\ No newline at end of file |
... | ... |
@@ -2,22 +2,26 @@ |
2 | 2 |
title: "GMQL: GenoMetrics Query Language" |
3 | 3 |
author: "Simone Pallotta" |
4 | 4 |
date: "`r Sys.Date()`" |
5 |
+bibliography: bibliography.bib |
|
5 | 6 |
output: BiocStyle::pdf_document |
6 | 7 |
vignette: > |
7 | 8 |
%\VignetteIndexEntry{GMQL: GenoMetrics Query Language} |
8 | 9 |
%\VignetteEngine{knitr::rmarkdown} |
9 | 10 |
%\VignetteEncoding{UTF-8} |
11 |
+link-citations: true |
|
10 | 12 |
--- |
11 | 13 |
|
12 | 14 |
# Introduction |
13 | 15 |
|
14 |
-Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions.\newline |
|
16 |
+Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions. |
|
15 | 17 |
For this purpose, a high-level, declarative GenoMetric Query Language (GMQL) and a toolkit for its use have been proposed. |
16 | 18 |
|
17 | 19 |
## Purpose |
18 | 20 |
|
19 |
-This package provides a set of functions to create, manipulate and extract genomic data from different datasources from local and remote datasets.\newline |
|
20 |
-Also, these functios allow performing complex queries without the knowledge of GMQL syntax. |
|
21 |
+GMQL operations focus on genomic domain-specific operations written as simple queries with implicit iterations over thousands of heterogeneous samples, computed in few minutes over servers [@IEEEACM7484654]. |
|
22 |
+This package provides a set of functions to create, manipulate and extract genomic data from different datasources both from local and remote datasets. |
|
23 |
+Also, these functions allow performing complex queries without knowledge of GMQL syntax. |
|
24 |
+ |
|
21 | 25 |
|
22 | 26 |
# Dataset |
23 | 27 |
|
... | ... |
@@ -25,19 +29,16 @@ We usually distinguish two kinds of dataset layout:\newline |
25 | 29 |
These contains large number of information describing regions of genome.\newline |
26 | 30 |
Data are encoded in human readable format using plain text file. |
27 | 31 |
|
28 |
-- GMQL standard layout :\newline\newline |
|
29 |
- Dataset is composed basically of three type of file: |
|
30 |
- |
|
31 |
- 1) region files usually terminating in .gtf or .gdm |
|
32 |
- 2) metadata files terminating in .meta |
|
33 |
- 3) schema XML file containing regions attributes |
|
34 |
-\newline\newline |
|
35 |
- Each region sample file owns its metadata file. |
|
36 |
- All these files must reside in unique folder called files. |
|
32 |
+* GMQL standard layout:\newline\newline |
|
33 |
+ A GMQL dataset is a collection of samples with the same region schema; it is basically composed of three types of files: |
|
34 |
+ 1. region files usually terminating in .gtf or .gdm |
|
35 |
+ 2. metadata files terminating in .meta |
|
36 |
+ 3. schema XML file containing regions attributes |
|
37 |
+ Each region sample file owns its metadata file. All these files must reside in unique folder called files. |
|
37 | 38 |
|
38 | 39 |
 |
39 | 40 |
|
40 |
-- Generic text based dataset:\newline\newline |
|
41 |
+* Generic text based dataset:\newline\newline |
|
41 | 42 |
Dataset composed by heterogeneous sample organised in simple text files probably |
42 | 43 |
stem from different medical, biological systems. |
43 | 44 |
Sample files are simply contained on a folder whose name must be |
... | ... |
@@ -45,17 +46,22 @@ Data are encoded in human readable format using plain text file. |
45 | 46 |
\newline |
46 | 47 |
|
47 | 48 |
In our package dataset files are considered read-only. |
48 |
-Once read genomic information is represented in abstract structure inside |
|
49 |
+Once read, genomic information is represented in abstract structure inside |
|
49 | 50 |
package. |
50 | 51 |
|
52 |
+# Genomic Data Model |
|
53 |
+ |
|
54 |
+The proposed Genomic Data Model (GDM) is based on the notions of datasets and samples; datasets are collections of samples, and each sample consists of two parts, the region data, which describe portions of the DNA, and the metadata, which describe sample general properties [@IEEEACM7484654]. |
|
51 | 55 |
|
52 | 56 |
# Basic Requirements |
57 |
+ |
|
58 |
+The GMQL package requires: |
|
53 | 59 |
|
54 |
-- javaSE version 8 |
|
55 |
-- java environment correctly set (i.e JAVA_HOME) |
|
56 |
-- scala version 2.11.8 |
|
57 |
-- scala environment correctly set (i.e SCALA_HOME) |
|
58 |
-- network connectivity to web services (if required) |
|
60 |
+* javaSE version 8 |
|
61 |
+* java environment correctly set (i.e JAVA_HOME) |
|
62 |
+* scala version 2.11.8 |
|
63 |
+* scala environment correctly set (i.e SCALA_HOME) |
|
64 |
+* network connectivity to web services (if required) |
|
59 | 65 |
|
60 | 66 |
# How to Install |
61 | 67 |
|
... | ... |
@@ -67,36 +73,42 @@ biocLite("GMQL") |
67 | 73 |
|
68 | 74 |
# Processing Environments |
69 | 75 |
|
70 |
-This package allow to create, manipulate and extract genomic data from |
|
71 |
-different datasets using different processing modes. |
|
76 |
+ This package allows users to create, manipulate and extract genomic data from |
|
77 |
+different datasets using different processing modes both local and remote. |
|
72 | 78 |
|
73 | 79 |
## Local Processing |
74 | 80 |
|
75 | 81 |
Query processing consumes computational power directly from local CPUs/system while |
76 | 82 |
managing datasets (both GMQL or generic text plain dataset). |
77 | 83 |
|
78 |
-### Initialisation |
|
84 |
+### Initialization |
|
79 | 85 |
|
86 |
+Load and attach the GMQL package in an R session using library function: |
|
87 |
+```{r, initialization, eval=FALSE} |
|
88 |
+library('GMQL') |
|
89 |
+``` |
|
80 | 90 |
Before starting using any GMQL operation we need to initialise the GMQL context |
81 | 91 |
with the following code: |
82 |
-```{r, initialization, eval=FALSE} |
|
92 |
+```{r, init, eval=FALSE} |
|
83 | 93 |
initGMQL() |
84 | 94 |
``` |
85 |
-No parameter means that we are initialising the context with GTF as output format for |
|
86 |
-our regions sample files and metadata files. |
|
87 |
-Of Course, other parameter are available. |
|
95 |
+Calling initGMQL() with no parameters means we are initialising the context with GTF as output format for sample and metadata files. |
|
96 |
+ Details on this and all other functions are provided in the R documentation for this package (e.g., help(GMQL)). |
|
88 | 97 |
|
89 | 98 |
### Datasource |
90 | 99 |
|
91 |
-After initialisation we need to read the dataset. |
|
92 |
-We present different source we can get the data from: \newline |
|
93 |
-we can read local GMQL dataset: |
|
100 |
+After initialization we need to read the dataset. |
|
101 |
+ In the following section we show how to get data from different sources.\newline |
|
102 |
+We have four different cases: |
|
103 |
+ |
|
104 |
+1. Local GMQL dataset:\newline |
|
105 |
+ As data are already on the user's computer, we simply execute: |
|
94 | 106 |
```{r,read GMQL dataset, eval=FALSE} |
95 | 107 |
gmql_dataset_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
96 | 108 |
data_out = readDataset(gmql_dataset_path) |
97 | 109 |
``` |
98 |
-In case of remote datasets, user have to download it locally using |
|
99 |
-specifying function: |
|
110 |
+2. Remote dataset / explicit download:\newline |
|
111 |
+User can download it locally using: |
|
100 | 112 |
```{r, download dataset, eval=FALSE} |
101 | 113 |
test_url <- "http://130.186.13.219/gmql-rest" |
102 | 114 |
login.GMQL(test_url) |
... | ... |
@@ -105,9 +117,16 @@ downloadDataset(test_url,"dataset_test",path = getwd()) |
105 | 117 |
where *test_url* is an R variable that defines the URL of the remote server where |
106 | 118 |
web services are located.\newline |
107 | 119 |
Once local, these datasets behave like local datasets as written above. |
108 |
-\newline |
|
120 |
+ |
|
121 |
+3. Remote dataset (/ implicit download): |
|
122 |
+```{r,read remote dataset, eval=FALSE} |
|
123 |
+data_out = readDataset("dataset_name_on_repo") |
|
124 |
+``` |
|
125 |
+ There is no need to explicitly download data since execution will trigger download automatically. |
|
126 |
+ |
|
127 |
+4. GrangesList:\newline |
|
109 | 128 |
Also, for better integration in R environment and with other packages, we provide a function |
110 |
-to read a GrangesList: |
|
129 |
+to read from GrangesList, for example: |
|
111 | 130 |
```{r, read GRangesList, eval=FALSE} |
112 | 131 |
gr1 <- GRanges(seqnames = "chr2", |
113 | 132 |
ranges = IRanges(103, 106), |
... | ... |
@@ -122,22 +141,52 @@ score = 3:4, GC = c(0.3, 0.5)) |
122 | 141 |
grl <- GRangesList("txA" = gr1, "txB" = gr2) |
123 | 142 |
data_out <- read(grl) |
124 | 143 |
``` |
125 |
-Every read function return a value, this value is used as first step |
|
126 |
-for execution the subsequent GMQL operation. |
|
144 |
+ Every read function returns a result object containing internal details used for executing the subsequent GMQL operations. |
|
127 | 145 |
|
128 | 146 |
### Queries |
129 | 147 |
|
130 |
-The core concept of GMQL package is build a query as the name *GMQL* suggest. |
|
131 |
-Unfortunatley is not the same as any query language (e.g SQL) where the query is composed by only |
|
132 |
-select statement with parameter |
|
133 |
-Thq query in GMQL is a set of operation che finiscoono con almeno una materialzie. |
|
134 |
-Vediamone un esempio: |
|
148 |
+ GMQL is not a traditional DDL/DML query language: |
|
149 |
+ By "query" we mean a group of operations that together produce a result; in that sense a GMQL query is more similar to an SQL script. |
|
150 |
+ GMQL programming consists of a series of select, union, project, difference (and so on...) commands. |
|
135 | 151 |
|
152 |
+ If you want to persist the result, you can materialize it as the last step. |
|
153 |
+Let's see a short example: |
|
154 |
+```{r,query, eval=FALSE} |
|
155 |
+initGMQL("gtf") |
|
156 |
+test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
157 |
+input = readDataset(test_path) |
|
136 | 158 |
|
159 |
+## it selects from input data samples of patients younger than 70 years old, |
|
160 |
+## based on filtering on sample metadata attribute Patient_age |
|
161 |
+s=select(input,"Patient_age < 70") |
|
162 |
+ |
|
163 |
+## it counts the regions in each sample and stores their number as value of the new metadata |
|
164 |
+## RegionCount attribute of the sample. |
|
165 |
+e = extend(input_data = s, list(RegionCount = COUNT())) |
|
166 |
+ |
|
167 |
+## materialize the result dataset on disk |
|
168 |
+m = materialize(e) |
|
169 |
+``` |
|
137 | 170 |
|
138 | 171 |
### Execution |
139 | 172 |
|
140 |
-## Remote Environment |
|
173 |
+GMQL processing does not store results: |
|
174 |
+They remain in the environment until you invoke *execute* function. |
|
175 |
+```{r,execute, eval=FALSE} |
|
176 |
+execute() |
|
177 |
+``` |
|
178 |
+*execute* can be issued only if at least one *materialize* is present in GMQL query, otherwise an error is generated. |
|
179 |
+Data are saved in the path specified in every *materialize*. |
|
180 |
+ Besides *execute* we can use |
|
181 |
+```{r,take, eval=FALSE} |
|
182 |
+g <- take(input_data = m, rows = 45) |
|
183 |
+``` |
|
184 |
+to extract data as GRangesList format and execute all *materialize* commands. |
|
185 |
+NOTE: GRangesList are contained in R environment and are not saved on disk. |
|
186 |
+ |
|
187 |
+ The *rows* parameter specifies how many rows will be exported. |
|
188 |
+ |
|
189 |
+## Remote Processing |
|
141 | 190 |
|
142 | 191 |
Query processing consumes computational power from remote clusters/system while |
143 | 192 |
managing datasets that are only GMQL dataset.\newline |
... | ... |
@@ -153,8 +202,8 @@ Remote processing exits in two flavour:\newline |
153 | 202 |
|
154 | 203 |
### REST web services |
155 | 204 |
|
156 |
-We talk about only for REST web service porocessing, because batch remote processing is |
|
157 |
-quite similar to local processing. |
|
205 |
+ This package allows users to invoke REST services implementing the commands specified at [link](http://130.186.13.219/gmql-rest/swagger). |
|
206 |
+ |
|
158 | 207 |
|
159 | 208 |
#### Initialization |
160 | 209 |
|
... | ... |
@@ -171,20 +220,25 @@ login.GMQL(test_url) |
171 | 220 |
that saves token in Global R environment with variable named *authToken*.\newline |
172 | 221 |
With this token you can call all the functions in the web services suite. |
173 | 222 |
|
174 |
-#### Queries |
|
175 |
- |
|
176 |
- |
|
177 |
- |
|
178 | 223 |
#### Execution |
179 | 224 |
|
180 |
-Saved data will be stored in repository and eventually can be downloaded locally. |
|
181 |
- |
|
225 |
+User can write the query as in the following example, as the second parameter of *runQuery*. |
|
226 |
+```{r, eval=FALSE} |
|
227 |
+test_url = "http://130.186.13.219/gmql-rest" |
|
228 |
+login.GMQL(test_url) |
|
229 |
+runQuery(test_url, "query_1", "DATA_SET_VAR = SELECT() HG19_TCGA_dnaseq; |
|
230 |
+ MATERIALIZE DATA_SET_VAR INTO RESULT_DS;", output_gtf = FALSE) |
|
231 |
+``` |
|
182 | 232 |
|
183 |
-# Biological Example |
|
233 |
+Once run, query continues on the server while *runQuery* returns immediately. |
|
234 |
+User can extract from result the job_id and status. |
|
235 |
+ job_id can be used to invoke the log and trace calls, both available in this R package. |
|
184 | 236 |
|
185 |
-This section collects several examples where GMQL is used to answer practical questions/tasks of biological and clinical interest. |
|
186 |
-For each example, after an initial textual statement describing the question/task to be answered, |
|
187 |
-the GMQL query is reported with a detailed commented description of the query and its results. |
|
237 |
+### Batch execution |
|
188 | 238 |
|
239 |
+This function is similar to local processing (syntax, function and so on ...) except: |
|
240 |
+ 1. if data is local, it is uploaded to the repository implicitly |
|
241 |
+ 2. data are materialized only on the repository |
|
189 | 242 |
|
243 |
+# References |
|
190 | 244 |
|