Browse code

add bibliography, update vignette

Simone authored on 07/10/2017 16:00:15
Showing 10 changed files

... ...
@@ -26,6 +26,16 @@
26 26
 #'
27 27
 execute <- function()
28 28
 {
29
+  remote_proc <- WrappeR$is_remote_processing()
30
+  array_dataset <- WrappeR$list_dataset_upload_download()
31
+  if(remote_proc)
32
+  {
33
+    
34
+  }
35
+  else
36
+  {
37
+    
38
+  }
29 39
   out <- WrappeR$execute()
30 40
   if(grepl("OK",out,ignore.case = TRUE))
31 41
     print("Executed")
... ...
@@ -66,7 +76,9 @@ materialize <- function(input_data, dir_out = getwd())
66 76
   if(grepl("No",out,ignore.case = TRUE))
67 77
     stop(out)
68 78
   else
79
+  {
69 80
     invisible(NULL)
81
+  }
70 82
 }
71 83
 
72 84
 #' GMQL Operation: TAKE
... ...
@@ -59,14 +59,10 @@ initGMQL <- function(output_format="gtf", remote_processing = FALSE)
59 59
 #' }
60 60
 #' Default is CustomParser.
61 61
 #' @param is_local single logical value indicating local or remote dataset
62
-#' if the remote processing is off you cannot set is_local=FALSE (an error occures)
62
+#' @param is_GMQL single logical value indicating if dataset is GMQL dataset or not 
63 63
 #' @param url single string url of server: it must contain the server address and base url;
64 64
 #' service name will be added automatically
65 65
 #' useful only in remote processing
66
-#' @param override single logical value used in order to determine the overriding of reading
67
-#' dataset into repository, if an other dataset with the same name already exist into repostiory 
68
-#' and override value is FALSE an error occures.
69
-#' useful only in remote processing
70 66
 #' 
71 67
 #' @importFrom methods is
72 68
 #' 
... ...
@@ -87,6 +83,7 @@ initGMQL <- function(output_format="gtf", remote_processing = FALSE)
87 83
 #' r = readDataset(test_path)
88 84
 #' 
89 85
 #' \dontrun{
86
+#' 
90 87
 #' ### local with other Parser
91 88
 #' initGMQL("gtf")
92 89
 #' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL")
... ...
@@ -95,52 +92,25 @@ initGMQL <- function(output_format="gtf", remote_processing = FALSE)
95 92
 #' 
96 93
 #' @export
97 94
 #'
98
-readDataset <- function(dataset, parser = "CustomParser",is_local=TRUE,url=NULL, override= FALSE)
95
+readDataset <- function(dataset, parser = "CustomParser",is_local=TRUE,
96
+                        is_GMQL=TRUE, url=NULL)
99 97
 {
100
-  remote_proc <- WrappeR$is_remote_processing()
101
-  if(!remote_proc && !is_local)
102
-    stop("you cannot use local processing with remote repository")
103
-  
104
-  if(!is.character(dataset) || length(dataset) >1)
105
-    stop("dataset: invalid input or length > 1")
106
-  
107
-  if(!is.logical(override) || length(override) >1)
108
-    stop("override: invalid input or length > 1")
109
-  
110
-  if(!is.logical(is_local) || length(is_local) >1)
111
-    stop("is_local: invalid input or length > 1")
98
+  .check_input(dataset)
99
+  .check_logical(is_local)
100
+  .check_logical(is_GMQL)
112 101
 
113 102
   if(is_local)
114 103
   {
115 104
     if(!dir.exists(dataset))
116 105
       stop("folder does not exist")
117
-
118
-    remote_proc <- WrappeR$is_remote_processing()
119
-    if(remote_proc)
120
-    {
121
-      if(override)
122
-      {
123
-        list <- showDatasets(url)
124
-        name_dataset <- basename(dataset)
125
-        if(name_dataset %in% unlist(list$datasets))
126
-          deleteDataset(url,name_dataset)
127
-      }
128
-      else
129
-      {
130
-        list <- showDatasets(url)
131
-        name_dataset <- basename(dataset)
132
-        if(name_dataset %in% unlist(list$datasets))
133
-          stop("dataset already exist in repository")
134
-      }
135
-      uploadSamples(url,name_dataset,dataset,isGMQL = TRUE)
136
-    }
137 106
     
138 107
     schema_matrix <- scalaNull("Array[Array[String]]")
139 108
     schema_type <- scalaNull("String")
140 109
   }
141 110
   else
142 111
   {
143
-    list <- showSchemaFromDataset(url,name_dataset)
112
+    #name_dataset <- basename(dataset)
113
+    list <- showSchemaFromDataset(url,dataset)
144 114
     schema_names <- sapply(list$fields, function(x){x$name})
145 115
     schema_type <- sapply(list$fields, function(x){x$fieldType})
146 116
     schema_matrix <- cbind(schema_type,schema_names)
... ...
@@ -149,7 +119,7 @@ readDataset <- function(dataset, parser = "CustomParser",is_local=TRUE,url=NULL,
149 119
 
150 120
   parser_name <- .check_parser(parser)
151 121
 
152
-  out <- WrappeR$readDataset(dataset,parser_name,is_local,schema_matrix)
122
+  out <- WrappeR$readDataset(dataset,parser_name,is_local,is_GMQL,schema_matrix)
153 123
   if(grepl("File",out,ignore.case = TRUE) || grepl("No",out,ignore.case = TRUE))
154 124
     stop(out)
155 125
   else
... ...
@@ -262,4 +232,23 @@ remote_processing<-function(is_remote)
262 232
 }
263 233
 
264 234
 
235
+# remote_proc <- WrappeR$is_remote_processing()
236
+# if(remote_proc)
237
+# {
238
+#   if(override)
239
+#   {
240
+#     list <- showDatasets(url)
241
+#     name_dataset <- basename(dataset)
242
+#     if(name_dataset %in% unlist(list$datasets))
243
+#       deleteDataset(url,name_dataset)
244
+#   }
245
+#   else
246
+#   {
247
+#     list <- showDatasets(url)
248
+#     name_dataset <- basename(dataset)
249
+#     if(name_dataset %in% unlist(list$datasets))
250
+#       stop("dataset already exist in repository")
251
+#   }
252
+#   uploadSamples(url,name_dataset,dataset,isGMQL = TRUE)
253
+# }
265 254
 
... ...
@@ -112,7 +112,23 @@
112 112
   envirs[xin] 
113 113
 }
114 114
 
115
+.check_input <- function(value)
116
+{
117
+  if(!is.character(value))
118
+    stop("no valid data")
119
+  
120
+  if(length(value)>1)
121
+    stop("no multiple string")
122
+}
115 123
 
124
+.check_logical <- function(value)
125
+{
126
+  if(!is.logical(value))
127
+    stop("no valid data")
128
+  
129
+  if(length(value)>1)
130
+    stop("no multiple string")
131
+}
116 132
 
117 133
 #  if(!is.null(groupBy))
118 134
 #{
... ...
@@ -3,6 +3,9 @@
3 3
 #  biocLite("GMQL")
4 4
 
5 5
 ## ---- initialization, eval=FALSE-----------------------------------------
6
+#  library('GMQL')
7
+
8
+## ---- init, eval=FALSE---------------------------------------------------
6 9
 #  initGMQL()
7 10
 
8 11
 ## ----read GMQL dataset, eval=FALSE---------------------------------------
... ...
@@ -14,6 +17,9 @@
14 17
 #  login.GMQL(test_url)
15 18
 #  downloadDataset(test_url,"dataset_test",path = getwd())
16 19
 
20
+## ----read remote dataset, eval=FALSE-------------------------------------
21
+#  data_out = readDataset("dataset_name_on_repo")
22
+
17 23
 ## ---- read GRangesList, eval=FALSE---------------------------------------
18 24
 #  gr1 <- GRanges(seqnames = "chr2",
19 25
 #  ranges = IRanges(103, 106),
... ...
@@ -28,9 +34,37 @@
28 34
 #  grl <- GRangesList("txA" = gr1, "txB" = gr2)
29 35
 #  data_out <- read(grl)
30 36
 
37
+## ----query, eval=FALSE---------------------------------------------------
38
+#  initGMQL("gtf")
39
+#  test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL")
40
+#  input = readDataset(test_path)
41
+#  
42
+#  ## it selects from input data samples of patients younger than 70 years old,
43
+#  ## based on filtering on sample metadata attribute Patient_age
44
+#  s=select(input,"Patient_age < 70")
45
+#  
46
+#  ## it counts the regions in each sample and stores their number as value of the new metadata
47
+#  ## RegionCount attribute of the sample.
48
+#  e = extend(input_data = s, list(RegionCount = COUNT()))
49
+#  
50
+#  ## materialize the result dataset on disk
51
+#  m = materialize(e)
52
+
53
+## ----execute, eval=FALSE-------------------------------------------------
54
+#  execute()
55
+
56
+## ----take, eval=FALSE----------------------------------------------------
57
+#  g <- take(input_data = m, rows = 45)
58
+
31 59
 ## ---- eval=TRUE----------------------------------------------------------
32 60
 library("GMQL")
33 61
 
34 62
 test_url = "http://130.186.13.219/gmql-rest"
35 63
 login.GMQL(test_url)
36 64
 
65
+## ---- eval=FALSE---------------------------------------------------------
66
+#  test_url = "http://130.186.13.219/gmql-rest"
67
+#  login.GMQL(test_url)
68
+#  runQuery(test_url, "query_1", "DATA_SET_VAR = SELECT() HG19_TCGA_dnaseq;
69
+#           MATERIALIZE DATA_SET_VAR INTO RESULT_DS;", output_gtf = FALSE)
70
+
... ...
@@ -2,22 +2,26 @@
2 2
 title: "GMQL: GenoMetrics Query Language"
3 3
 author: "Simone Pallotta"
4 4
 date: "`r Sys.Date()`"
5
+bibliography: bibliography.bib
5 6
 output: BiocStyle::pdf_document
6 7
 vignette: >
7 8
   %\VignetteIndexEntry{Vignette Title}
8 9
   %\VignetteEngine{knitr::rmarkdown}
9 10
   %\VignetteEncoding{UTF-8}
11
+link-citations: true
10 12
 ---
11 13
 
12 14
 # Introduction
13 15
 
14
-Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions.\newline
16
+Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions.
15 17
 For this purpose GMQL has been proposed a high-level, declarative GenoMetric Query Language (GMQL) and a toolkit for its use.
16 18
 
17 19
 ## Purpose
18 20
 
19
-This package provides a set of functions to create, manipulate and extract genomic data from different datasources from local and remote datasets.\newline
20
-Also, these functios allow performing complex queries without the knowledge of GMQL syntax.
21
+GMQL operations focus on genomic domain-specific operations written as simple queries with implicit iterations over thousands of heterogeneous samples, computed in few minutes over servers [@IEEEACM7484654].
22
+This package provides a set of functions to create, manipulate and extract genomic data from different datasources both from local and remote datasets.
23
+Also, these functions allow performing complex queries without knowledge of GMQL syntax.
24
+
21 25
 
22 26
 # Dataset
23 27
 
... ...
@@ -25,19 +29,16 @@ We usually distinguish two kinds of dataset layout:\newline
25 29
 These contains large number of information describing regions of genome.\newline
26 30
 Data are encoded in human readable format using plain text file.
27 31
 
28
-- GMQL standard layout :\newline\newline
29
-  Dataset is composed basically of three type of file:
30
-  
31
-	1) region files usually terminating in .gtf or .gdm
32
-	2) metadata files terminating in .meta
33
-	3) schema XML file containing regions attributes
34
-\newline\newline
35
-	Each region sample file owns its metadata file.
36
-	All these files must reside in unique folder called files.
32
+* GMQL standard layout:\newline\newline
33
+  A GMQL dataset is a collection of samples with the same region schema; it is basically composed of three types of files:
34
+    1. region files usually terminating in .gtf or .gdm
35
+	  2. metadata files terminating in .meta
36
+	  3. schema XML file containing regions attributes
37
+	Each region sample file owns its metadata file. All these files must reside in unique folder called files.
37 38
 	
38
-![GMQL dataset folder](../inst/doc/fig/dataset_gmql.png)
39
+![GMQL dataset folder](dataset_gmql.png)
39 40
 	
40
-- Generic text based dataset:\newline\newline
41
+* Generic text based dataset:\newline\newline
41 42
   Dataset composed by heterogeneous sample organised in simple text files probably 
42 43
 	stem from different medical, biological sytem
43 44
 	Sample files are simply contained on a folder whose name must be 
... ...
@@ -45,17 +46,22 @@ Data are encoded in human readable format using plain text file.
45 46
 	\newline
46 47
 
47 48
 In our package dataset files are considered read-only.
48
-Once read genomic information is represented in abstract structure inside 
49
+Once read, genomic information is represented in abstract structure inside 
49 50
 package.
50 51
 
52
+# Genomic Data Model
53
+
54
+The proposed Genomic Data Model (GDM) is based on the notions of datasets and samples; datasets are collections of samples, and each sample consists of two parts, the region data, which describe portions of the DNA, and the metadata, which describe sample general properties [@IEEEACM7484654].
51 55
 
52 56
 # Basic Requirements
57
+
58
+The GMQL package requires:
53 59
 	
54
-- javaSE version 8 
55
-- java environment correctly set (i.e JAVA_HOME)
56
-- scala version 2.11.8
57
-- scala environment correctly set (i.e SCALA_HOME)
58
-- network connectivity to web services (if required)
60
+* javaSE version 8 
61
+* java environment correctly set (i.e JAVA_HOME)
62
+* scala version 2.11.8
63
+* scala environment correctly set (i.e SCALA_HOME)
64
+* network connectivity to web services (if required)
59 65
 	
60 66
 # How to Install
61 67
 	
... ...
@@ -67,36 +73,42 @@ biocLite("GMQL")
67 73
   
68 74
 # Processing Environments
69 75
 
70
-This package allow to create, manipulate and extract genomic data from 
71
-different datasets using different processing modes.
76
+This package allows to create, manipulate and extract genomic data from 
77
+different datasets using different processing modes both local and remote.
72 78
 
73 79
 ## Local Processing
74 80
 
75 81
 Query processing consumes computational power directly from local CPUs/system while
76 82
 managing datasets (both GMQL or generic text plain dataset).
77 83
 
78
-### Initialisation 
84
+### Initialization
79 85
 
86
+Load and attach the GMQL package in an R session using library function:
87
+```{r, initialization, eval=FALSE}
88
+library('GMQL')
89
+```
80 90
 Before starting using any GMQL operation we need to initialise the GMQL context 
81 91
 with the following code:
82
-```{r, initialization, eval=FALSE}
92
+```{r, init, eval=FALSE}
83 93
 initGMQL()
84 94
 ```
85
-No parameter means that we are initialising the context with GTF as output format for
86
-our regions sample files and metadata files.
87
-Of Course, other parameter are available. 
95
+Calling initGMQL() with no parameters means we are initialising the context with GTF as output format for sample and metadata files.
96
+Details on this and all other functions are provided in the R documentation for this package (e.g., help(GMQL)).
88 97
 
89 98
 ### Datasource
90 99
 
91
-After initialisation we need to read the dataset.
92
-We present different source we can get the data from: \newline
93
-we can read local GMQL dataset:
100
+After initialization we need to read the dataset.
101
+In the following section we show how to get data from different sources.\newline
102
+We have four different cases:
103
+
104
+1. Local GMQL dataset:\newline
105
+As data are already on the user's computer, we simply execute:
94 106
 ```{r,read GMQL dataset, eval=FALSE}
95 107
 gmql_dataset_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL")
96 108
 data_out = readDataset("gmql_dataset_path")
97 109
 ```
98
-In case of remote datasets, user have to download it locally using 
99
-specifying function:
110
+2. Remote dataset / explicit download:\newline
111
+User can download it locally using:
100 112
 ```{r, download dataset, eval=FALSE}
101 113
 test_url <- "http://130.186.13.219/gmql-rest"
102 114
 login.GMQL(test_url)
... ...
@@ -105,9 +117,16 @@ downloadDataset(test_url,"dataset_test",path = getwd())
105 117
 where *test_url* is a R variable that define URL of remote server where 
106 118
 web services are located.\newline
107 119
 Once local, these datasets behave like local datasets, as described above.
108
-\newline
120
+
121
+3. Remote dataset (/ implicit download):
122
+```{r,read remote dataset, eval=FALSE}
123
+data_out = readDataset("dataset_name_on_repo")
124
+```
125
+There is no need to explicitly download data since execution will trigger the download automatically.
126
+
127
+4. GrangesList:\newline
109 128
 Also, for better integration in R environment and with other packages, we provide a function
110
-to read a GrangesList:
129
+to read from GrangesList, for example:
111 130
 ```{r, read GRangesList, eval=FALSE}
112 131
 gr1 <- GRanges(seqnames = "chr2",
113 132
 ranges = IRanges(103, 106),
... ...
@@ -122,18 +141,52 @@ score = 3:4, GC = c(0.3, 0.5))
122 141
 grl <- GRangesList("txA" = gr1, "txB" = gr2)
123 142
 data_out <- read(grl)
124 143
 ```
125
-Every read function return a value, this value is used as first step 
126
-for execution the subsequent GMQL operation.
144
+Every read function returns a result object containing internal details used for executing the subsequent GMQL operations.
127 145
 
128 146
 ### Queries
129 147
 
130
-Thwe core concept of GMQL package is build a query as the  name *GMQL* suggest.
131
-Unfortunatley is not the same as any query language.
132
-the building of query is more like a batch workflow
148
+GMQL is not a traditional DDL/DML query language:
149
+By "query" we mean a group of operations that together produce a result; in that sense a GMQL query is more similar to an SQL script.
150
+GMQL programming consists of a series of select, union, project, difference (and so on...) commands.
151
+
152
+If you want to persist result, you can materialize as last step.
153
+Let's see a short example:
154
+```{r,query, eval=FALSE}
155
+initGMQL("gtf")
156
+test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL")
157
+input = readDataset(test_path)
158
+
159
+## it selects from input data samples of patients younger than 70 years old, 
160
+## based on filtering on sample metadata attribute Patient_age
161
+s=select(input,"Patient_age < 70")
162
+
163
+## it counts the regions in each sample and stores their number as value of the new metadata 
164
+## RegionCount attribute of the sample.
165
+e = extend(input_data = s, list(RegionCount = COUNT()))
166
+
167
+## materialize the result dataset on disk
168
+m = materialize(e)
169
+```
133 170
 
134 171
 ### Execution
135 172
 
136
-## Remote Environment
173
+GMQL processing does not store results:
174
+They remain in the environment until you invoke *execute* function.
175
+```{r,execute, eval=FALSE}
176
+execute()
177
+```
178
+*execute* can be issued only if at least one *materialize* is present in GMQL query, otherwise an error is generated.
179
+Data are saved in the path specified in every *materialize*.
180
+Besides *execute* we can use 
181
+```{r,take, eval=FALSE}
182
+g <- take(input_data = m, rows = 45)
183
+```
184
+to extract data as GRangesList format and execute all *materialize* commands.
185
+NOTE: GRangesList are contained in R environment and are not saved on disk.
186
+
187
+The *rows* parameter specifies how many rows will be exported.
188
+
189
+## Remote Processing
137 190
 
138 191
 Query processing consumes computational power from remote clusters/system while
139 192
 managing datasets that are only GMQL dataset.\newline
... ...
@@ -149,8 +202,8 @@ Remote processing exits in two flavour:\newline
149 202
 
150 203
 ### REST web services
151 204
 
152
-We talk about only for REST web service porocessing, because batch remote processing is
153
-quite similar to local processing.
205
+This package allows to invoke rest services implementing the commands specified at [link](http://130.186.13.219/gmql-rest/swagger).
206
+
154 207
 
155 208
 #### Initialization
156 209
 
... ...
@@ -164,22 +217,28 @@ library("GMQL")
164 217
 test_url = "http://130.186.13.219/gmql-rest"
165 218
 login.GMQL(test_url)
166 219
 ```
167
-that saves token in R environment.\newline
168
-
169
-### Datasource
220
+that saves token in Global R environment with variable named *authToken*.\newline
221
+With this token you can call all the functions in the web services suite.
170 222
 
171
-### Queries
172
-
173
-### Execution
174
-
175
-Saved data will be stored in repository and eventually can be downloaded locally.
223
+#### Execution
176 224
 
225
+User can write the query as in the following example, as the second parameter of *runQuery*.
226
+```{r, eval=FALSE}
227
+test_url = "http://130.186.13.219/gmql-rest"
228
+login.GMQL(test_url)
229
+runQuery(test_url, "query_1", "DATA_SET_VAR = SELECT() HG19_TCGA_dnaseq; 
230
+         MATERIALIZE DATA_SET_VAR INTO RESULT_DS;", output_gtf = FALSE)
231
+```
177 232
 
178
-# Biological Example
233
+Once run, query continues on the server while *runQuery* returns immediately.
234
+User can extract from result the job_id and status.
235
+The job_id can be used to invoke the log and trace calls, both available in this R package.
179 236
 
180
-This section collects several examples where GMQL is used to answer practical questions/tasks of biological and clinical interest.
181
-For each example, after an initial textual statement describing the question/task to be answered, 
182
-the GMQL query is reported with a detailed commented description of the query and its results.
237
+### Batch execution
183 238
 
239
+This function is similar to local processing (syntax, function and so on ...) except:
240
+1. if data are local, they are implicitly uploaded to the repository
241
+2. materialized data are stored only on the repository
184 242
 
243
+# References
185 244
 
186 245
Binary files a/inst/doc/my-vignette.pdf and b/inst/doc/my-vignette.pdf differ
187 246
Binary files a/inst/java/scala-2.11/GMQL.jar and b/inst/java/scala-2.11/GMQL.jar differ
... ...
@@ -4,8 +4,8 @@
4 4
 \alias{readDataset}
5 5
 \title{GMQL Function: READ}
6 6
 \usage{
7
-readDataset(dataset, parser = "CustomParser", is_local = TRUE, url = NULL,
8
-  override = FALSE)
7
+readDataset(dataset, parser = "CustomParser", is_local = TRUE,
8
+  is_GMQL = TRUE, url = NULL)
9 9
 }
10 10
 \arguments{
11 11
 \item{dataset}{single string folder path for GMQL dataset or datasetname on repository}
... ...
@@ -23,17 +23,13 @@ The Parser's available are:
23 23
 }
24 24
 Default is CustomParser.}
25 25
 
26
-\item{is_local}{single logical value indicating local or remote dataset
27
-if the remote processing is off you cannot set is_local=FALSE (an error occures)}
26
+\item{is_local}{single logical value indicating local or remote dataset}
27
+
28
+\item{is_GMQL}{single logical value indicating if dataset is GMQL dataset or not}
28 29
 
29 30
 \item{url}{single string url of server: it must contain the server address and base url;
30 31
 service name will be added automatically
31 32
 useful only in remote processing}
32
-
33
-\item{override}{single logical value used in order to determine the overriding of reading
34
-dataset into repository, if an other dataset with the same name already exist into repostiory 
35
-and override value is FALSE an error occures.
36
-useful only in remote processing}
37 33
 }
38 34
 \value{
39 35
 DAGgraph class object. It contains the value associated to the graph used 
... ...
@@ -58,6 +54,7 @@ test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL")
58 54
 r = readDataset(test_path)
59 55
 
60 56
 \dontrun{
57
+
61 58
 ### local with other Parser
62 59
 initGMQL("gtf")
63 60
 test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL")
64 61
new file mode 100644
... ...
@@ -0,0 +1,14 @@
1
+@article{IEEEACM7484654,
2
+    title = {Data Management for Heterogeneous Genomic Datasets},
3
+    author = {Marco Masseroli and Stefano Ceri and Abdulrahman Kaitoua},
4
+    year = {2016},
5
+    url = {http://ieeexplore.ieee.org/document/7484654/}
6
+}
7
+
8
+@article{masseroli2015genometric,
9
+  title={GenoMetric Query Language: a novel approach to large-scale genomic data management},
10
+  author={Masseroli, Marco and Pinoli, Pietro and Venco, Francesco and Kaitoua, Abdulrahman and Jalili, Vahid and Palluzzi, Fernando and Muller, Heiko and Ceri, Stefano},
11
+  journal={Bioinformatics},
12
+  year={2015},
13
+  publisher={Oxford Univ Press}
14
+}
0 15
\ No newline at end of file
... ...
@@ -2,22 +2,26 @@
2 2
 title: "GMQL: GenoMetrics Query Language"
3 3
 author: "Simone Pallotta"
4 4
 date: "`r Sys.Date()`"
5
+bibliography: bibliography.bib
5 6
 output: BiocStyle::pdf_document
6 7
 vignette: >
7 8
   %\VignetteIndexEntry{Vignette Title}
8 9
   %\VignetteEngine{knitr::rmarkdown}
9 10
   %\VignetteEncoding{UTF-8}
11
+link-citations: true
10 12
 ---
11 13
 
12 14
 # Introduction
13 15
 
14
-Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions.\newline
16
+Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions.
15 17
 For this purpose, a high-level, declarative GenoMetric Query Language (GMQL) and a toolkit for its use have been proposed.
16 18
 
17 19
 ## Purpose
18 20
 
19
-This package provides a set of functions to create, manipulate and extract genomic data from different datasources from local and remote datasets.\newline
20
-Also, these functios allow performing complex queries without the knowledge of GMQL syntax.
21
+GMQL operations focus on genomic domain-specific operations written as simple queries with implicit iterations over thousands of heterogeneous samples, computed in few minutes over servers [@IEEEACM7484654].
22
+This package provides a set of functions to create, manipulate and extract genomic data from different datasources both from local and remote datasets.
23
+Also, these functions allow performing complex queries without knowledge of GMQL syntax.
24
+
21 25
 
22 26
 # Dataset
23 27
 
... ...
@@ -25,19 +29,16 @@ We usually distinguish two kinds of dataset layout:\newline
25 29
 These contains large number of information describing regions of genome.\newline
26 30
 Data are encoded in human readable format using plain text file.
27 31
 
28
-- GMQL standard layout :\newline\newline
29
-  Dataset is composed basically of three type of file:
30
-  
31
-	1) region files usually terminating in .gtf or .gdm
32
-	2) metadata files terminating in .meta
33
-	3) schema XML file containing regions attributes
34
-\newline\newline
35
-	Each region sample file owns its metadata file.
36
-	All these files must reside in unique folder called files.
32
+* GMQL standard layout:\newline\newline
33
+  A GMQL dataset is a collection of samples with the same region schema; it is basically composed of three types of files:
34
+    1. region files usually terminating in .gtf or .gdm
35
+	  2. metadata files terminating in .meta
36
+	  3. schema XML file containing regions attributes
37
+	Each region sample file owns its metadata file. All these files must reside in unique folder called files.
37 38
 	
38 39
 ![GMQL dataset folder](dataset_gmql.png)
39 40
 	
40
-- Generic text based dataset:\newline\newline
41
+* Generic text based dataset:\newline\newline
41 42
   Dataset composed by heterogeneous sample organised in simple text files probably 
42 43
 	stem from different medical, biological sytem
43 44
 	Sample files are simply contained on a folder whose name must be 
... ...
@@ -45,17 +46,22 @@ Data are encoded in human readable format using plain text file.
45 46
 	\newline
46 47
 
47 48
 In our package dataset files are considered read-only.
48
-Once read genomic information is represented in abstract structure inside 
49
+Once read, genomic information is represented in abstract structure inside 
49 50
 package.
50 51
 
52
+# Genomic Data Model
53
+
54
+The proposed Genomic Data Model (GDM) is based on the notions of datasets and samples; datasets are collections of samples, and each sample consists of two parts, the region data, which describe portions of the DNA, and the metadata, which describe sample general properties [@IEEEACM7484654].
51 55
 
52 56
 # Basic Requirements
57
+
58
+The GMQL package requires:
53 59
 	
54
-- javaSE version 8 
55
-- java environment correctly set (i.e JAVA_HOME)
56
-- scala version 2.11.8
57
-- scala environment correctly set (i.e SCALA_HOME)
58
-- network connectivity to web services (if required)
60
+* javaSE version 8 
61
+* java environment correctly set (i.e JAVA_HOME)
62
+* scala version 2.11.8
63
+* scala environment correctly set (i.e SCALA_HOME)
64
+* network connectivity to web services (if required)
59 65
 	
60 66
 # How to Install
61 67
 	
... ...
@@ -67,36 +73,42 @@ biocLite("GMQL")
67 73
   
68 74
 # Processing Environments
69 75
 
70
-This package allow to create, manipulate and extract genomic data from 
71
-different datasets using different processing modes.
76
+This package allows to create, manipulate and extract genomic data from 
77
+different datasets using different processing modes both local and remote.
72 78
 
73 79
 ## Local Processing
74 80
 
75 81
 Query processing consumes computational power directly from local CPUs/system while
76 82
 managing datasets (both GMQL or generic text plain dataset).
77 83
 
78
-### Initialisation 
84
+### Initialization
79 85
 
86
+Load and attach the GMQL package in an R session using library function:
87
+```{r, initialization, eval=FALSE}
88
+library('GMQL')
89
+```
80 90
 Before starting using any GMQL operation we need to initialise the GMQL context 
81 91
 with the following code:
82
-```{r, initialization, eval=FALSE}
92
+```{r, init, eval=FALSE}
83 93
 initGMQL()
84 94
 ```
85
-No parameter means that we are initialising the context with GTF as output format for
86
-our regions sample files and metadata files.
87
-Of Course, other parameter are available. 
95
+Calling initGMQL() with no parameters means we are initialising the context with GTF as output format for sample and metadata files.
96
+Details on this and all other functions are provided in the R documentation for this package (e.g., help(GMQL)).
88 97
 
89 98
 ### Datasource
90 99
 
91
-After initialisation we need to read the dataset.
92
-We present different source we can get the data from: \newline
93
-we can read local GMQL dataset:
100
+After initialization we need to read the dataset.
101
+In the following section we show how to get data from different sources.\newline
102
+We have four different cases:
103
+
104
+1. Local GMQL dataset:\newline
105
+As data are already on the user's computer, we simply execute:
94 106
 ```{r,read GMQL dataset, eval=FALSE}
95 107
 gmql_dataset_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL")
96 108
 data_out = readDataset("gmql_dataset_path")
97 109
 ```
98
-In case of remote datasets, user have to download it locally using 
99
-specifying function:
110
+2. Remote dataset / explicit download:\newline
111
+User can download it locally using:
100 112
 ```{r, download dataset, eval=FALSE}
101 113
 test_url <- "http://130.186.13.219/gmql-rest"
102 114
 login.GMQL(test_url)
... ...
@@ -105,9 +117,16 @@ downloadDataset(test_url,"dataset_test",path = getwd())
105 117
 where *test_url* is a R variable that define URL of remote server where 
106 118
 web services are located.\newline
107 119
 Once local, these datasets behave like local datasets, as described above.
108
-\newline
120
+
121
+3. Remote dataset (/ implicit download):
122
+```{r,read remote dataset, eval=FALSE}
123
+data_out = readDataset("dataset_name_on_repo")
124
+```
125
+There is no need to explicitly download data since execution will trigger the download automatically.
126
+
127
+4. GrangesList:\newline
109 128
 Also, for better integration in R environment and with other packages, we provide a function
110
-to read a GrangesList:
129
+to read from GrangesList, for example:
111 130
 ```{r, read GRangesList, eval=FALSE}
112 131
 gr1 <- GRanges(seqnames = "chr2",
113 132
 ranges = IRanges(103, 106),
... ...
@@ -122,22 +141,52 @@ score = 3:4, GC = c(0.3, 0.5))
122 141
 grl <- GRangesList("txA" = gr1, "txB" = gr2)
123 142
 data_out <- read(grl)
124 143
 ```
125
-Every read function return a value, this value is used as first step 
126
-for execution the subsequent GMQL operation.
144
+Every read function returns a result object containing internal details used for executing the subsequent GMQL operations.
127 145
 
128 146
 ### Queries
129 147
 
130
-The core concept of GMQL package is build a query as the  name *GMQL* suggest.
131
-Unfortunatley is not the same as any query language (e.g SQL) where the query is composed by only
132
-select statement with parameter 
133
-Thq query in GMQL is a set of operation che finiscoono con almeno una materialzie.
134
-Vediamone un esempio:
148
+GMQL is not a traditional DDL/DML query language:
149
+By "query" we mean a group of operations that together produce a result; in that sense a GMQL query is more similar to an SQL script.
150
+GMQL programming consists of a series of select, union, project, difference (and so on...) commands.
135 151
 
152
+If you want to persist the result, you can materialize it as the last step.
153
+Let's see a short example:
154
+```{r,query, eval=FALSE}
155
+initGMQL("gtf")
156
+test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL")
157
+input = readDataset(test_path)
136 158
 
159
+## it selects from input data samples of patients younger than 70 years old, 
160
+## based on filtering on sample metadata attribute Patient_age
161
+s=select(input,"Patient_age < 70")
162
+
163
+## it counts the regions in each sample and stores their number as value of the new metadata 
164
+## RegionCount attribute of the sample.
165
+e = extend(input_data = s, list(RegionCount = COUNT()))
166
+
167
+## materialize the result dataset on disk
168
+m = materialize(e)
169
+```
137 170
 
138 171
 ### Execution
139 172
 
140
-## Remote Environment
173
+GMQL processing does not store results:
174
+They remain in the environment until you invoke the *execute* function.
175
+```{r,execute, eval=FALSE}
176
+execute()
177
+```
178
+*execute* can be issued only if at least one *materialize* is present in the GMQL query; otherwise an error is generated.
179
+Data are saved in the path specified in every *materialize*.
180
+Besides *execute* we can use 
181
+```{r,take, eval=FALSE}
182
+g <- take(input_data = m, rows = 45)
183
+```
184
+to extract data in GRangesList format and execute all *materialize* commands.
185
+NOTE: GRangesList objects are kept in the R environment and are not saved on disk.
186
+
187
+The *rows* parameter specifies how many rows will be exported.
188
+
189
+## Remote Processing
141 190
 
142 191
 Query processing consumes computational power from remote clusters/system while
143 192
 managing datasets that are only GMQL dataset.\newline
... ...
@@ -153,8 +202,8 @@ Remote processing exits in two flavour:\newline
153 202
 
154 203
 ### REST web services
155 204
 
156
-We talk about only for REST web service porocessing, because batch remote processing is
157
-quite similar to local processing.
205
+This package allows invoking the REST services that implement the commands specified at [link](http://130.186.13.219/gmql-rest/swagger).
206
+
158 207
 
159 208
 #### Initialization
160 209
 
... ...
@@ -171,20 +220,25 @@ login.GMQL(test_url)
171 220
 that saves the token in the global R environment in a variable named *authToken*.\newline
172 221
 With this token you can call all the functions in the web services suite.
173 222
 
174
-#### Queries
175
-
176
-
177
-
178 223
 #### Execution
179 224
 
180
-Saved data will be stored in repository and eventually can be downloaded locally.
181
-
225
+The user can pass the query text to *runQuery*, as in the following example (where it is the third argument).
226
+```{r, eval=FALSE}
227
+test_url = "http://130.186.13.219/gmql-rest"
228
+login.GMQL(test_url)
229
+runQuery(test_url, "query_1", "DATA_SET_VAR = SELECT() HG19_TCGA_dnaseq; 
230
+         MATERIALIZE DATA_SET_VAR INTO RESULT_DS;", output_gtf = FALSE)
231
+```
182 232
 
183
-# Biological Example
233
+Once started, the query continues running on the server while *runQuery* returns immediately.
234
+The user can extract the job_id and status from the result.
235
+The job_id can be used to invoke the log and trace calls, both available in this R package.
184 236
 
185
-This section collects several examples where GMQL is used to answer practical questions/tasks of biological and clinical interest.
186
-For each example, after an initial textual statement describing the question/task to be answered, 
187
-the GMQL query is reported with a detailed commented description of the query and its results.
237
+### Batch execution
188 238
 
239
+Batch execution is similar to local processing (syntax, functions and so on...) except that:
240
+1. if the data are local, they are implicitly uploaded to the repository
241
+2. materialized data are stored only in the repository
189 242
 
243
+# References
190 244