Browse code

fix not

Simone authored on 09/11/2017 23:11:46
Showing 18 changed files

... ...
@@ -1,11 +1,10 @@
1 1
 Package: RGMQL
2 2
 Type: Package
3 3
 Title: GenoMetric Query Language for R/Bioconductor
4
-Version: 0.99.24
4
+Version: 0.99.25
5 5
 Author: Simone Pallotta, Marco Masseroli
6 6
 Maintainer: Simone Pallotta <simonepallotta@hotmail.com>
7
-Description: This RGMQL package brings the GenoMetric Query Language (GMQL) 
8
-  functionalities into the R environment.
7
+Description: This RGMQL package brings the GenoMetric Query Language (GMQL) functionalities into the R environment.
9 8
   GMQL is a high-level, declarative language to query and compare multiple and heterogeneous genomic 
10 9
   datasets for biomedical knowledge discovery. It allows expressing easily queries 
11 10
   and processing over genomic regions and their metadata, in a way similar to 
... ...
@@ -64,7 +64,7 @@ extend <-function(input_data, metadata = NULL)
64 64
                                     dispatch = TRUE)
65 65
     else
66 66
         metadata_matrix <- .jnull("java/lang/String")
67
-  
67
+    
68 68
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
69 69
     response <- WrappeR$extend(metadata_matrix,input_data$value)
70 70
     error <- strtoi(response[1])
... ...
@@ -47,7 +47,7 @@
47 47
 #' @export
48 48
 #'
49 49
 filter_and_extract <- function(data, metadata = NULL, 
50
-                               metadata_prefix = NULL, regions = NULL)
50
+                                metadata_prefix = NULL, regions = NULL)
51 51
 {
52 52
     if(is(data,"GRangesList"))
53 53
         .extract_from_GRangesList(data,metadata,metadata_prefix,regions)
... ...
@@ -38,7 +38,7 @@ import_gmql <- function(dataset_path, is_gtf)
38 38
     datasetName <- sub("/*[/]$","",datasetName)
39 39
     if(basename(datasetName) !="files")
40 40
         datasetName <- paste0(datasetName,"/files")
41
-  
41
+    
42 42
     if(!dir.exists(datasetName))
43 43
         stop("Directory does not exists")
44 44
 
... ...
@@ -101,7 +101,7 @@ join <- function(right_input_data, left_input_data,
101 101
                 warning("only 4 element per list, we cut the rest")
102 102
                 length(list_pred)=4
103 103
             }
104
-      
104
+            
105 105
             if(!all(sapply(list_pred, function(x) {is(x,"DISTAL")} )))
106 106
                 stop("All elements should be DISTAL object")
107 107
             })
... ...
@@ -120,18 +120,18 @@ join <- function(right_input_data, left_input_data,
120 120
     }
121 121
     else
122 122
         genomatrix <- .jnull("java/lang/String")
123
-      
123
+    
124 124
     if(!is.null(joinBy))
125
-        join_condition_matrix <- .jarray(.join_condition(joinBy),
125
+        join_condition_matrix <- .jarray(.join_condition(joinBy), 
126 126
                                             dispatch = TRUE)
127 127
     else
128 128
         join_condition_matrix <- .jnull("java/lang/String")
129
-  
129
+    
130 130
     ouput <- toupper(region_output)
131 131
     if(!identical(ouput,"CONTIG") && !identical(ouput,"LEFT") && 
132 132
         !identical(ouput,"RIGHT") && !identical(ouput,"INT"))
133 133
         stop("region_output must be contig,left,right or int (intersection)")
134
-  
134
+    
135 135
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
136 136
     response <- WrappeR$join(genomatrix,join_condition_matrix, 
137 137
                                 ouput,right_input_data$value,
... ...
@@ -88,13 +88,13 @@ map <- function(left_input_data, right_input_data, aggregates = NULL,
88 88
                                     dispatch = TRUE)
89 89
     else
90 90
         metadata_matrix = .jnull("java/lang/String")
91
-
91
+    
92 92
     if(!is.null(joinBy))
93 93
         join_condition_matrix <- .jarray(.join_condition(joinBy),
94 94
                                             dispatch = TRUE)
95 95
     else
96 96
         join_condition_matrix <- .jnull("java/lang/String")
97
-  
97
+    
98 98
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
99 99
     response<-WrappeR$map(join_condition_matrix, metadata_matrix, 
100 100
                             left_input_data$value, right_input_data$value)
... ...
@@ -28,7 +28,7 @@ execute <- function()
28 28
     remote_proc <- WrappeR$is_remote_processing()
29 29
     if(!remote_proc)
30 30
         .download_or_upload()
31
-  
31
+    
32 32
     response <- WrappeR$execute()
33 33
     error <- strtoi(response[1])
34 34
     data <- response[2]
... ...
@@ -143,14 +143,14 @@ take <- function(input_data, rows=0L)
143 143
     rows <- as.integer(rows[1])
144 144
     if(rows<0)
145 145
         stop("rows cannot be negative")
146
-  
146
+    
147 147
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
148 148
     response <- WrappeR$take(input_data$value,rows)
149 149
     error <- strtoi(response[1])
150 150
     data <- response[2]
151 151
     if(error!=0)
152 152
         stop(data)
153
-
153
+    
154 154
     reg <- .jevalArray(WrappeR$get_reg(),simplify = TRUE)
155 155
     if(is.null(reg))
156 156
         stop("no regions defined")
... ...
@@ -160,11 +160,11 @@ take <- function(input_data, rows=0L)
160 160
     schema <- .jevalArray(WrappeR$get_schema(),simplify = TRUE)
161 161
     if(is.null(schema))
162 162
         stop("no schema defined")
163
-
163
+    
164 164
     reg_data_frame <- as.data.frame(reg)
165 165
     list <- split(reg_data_frame, reg_data_frame[1])
166 166
     names <- c("seqname","start","end","strand",schema)
167
-  
167
+    
168 168
     sampleList <- lapply(list, function(x){
169 169
         x <- x[-1]
170 170
         names(x) <- names
... ...
@@ -175,7 +175,7 @@ take <- function(input_data, rows=0L)
175 175
         })
176 176
     gRange_list <- GRangesList(sampleList)
177 177
     meta_list <- .metadata_from_frame_to_list(meta)
178
-  
178
+    
179 179
     S4Vectors::metadata(gRange_list) <- meta_list
180 180
     return(gRange_list)
181 181
 }
... ...
@@ -57,7 +57,7 @@ merge <- function(input_data, groupBy = NULL)
57 57
                                             dispatch = TRUE)
58 58
     else
59 59
         join_condition_matrix <- .jnull("java/lang/String")
60
-  
60
+    
61 61
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
62 62
     response <- WrappeR$merge(join_condition_matrix,input_data$value)
63 63
     error <- strtoi(response[1])
... ...
@@ -70,22 +70,22 @@ order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0,
70 70
     if(!is.numeric(mtop) || !is.numeric(mtopg) || !is.numeric(rtop) || 
71 71
         !is.numeric(rtopg) || !is.numeric(mtopp)|| !is.numeric(rtopp))
72 72
         stop("mtop, rtop, rtopg and mtopg must be integer")
73
-
73
+    
74 74
     if(length(mtop)>1 || length(mtopg)>1 || length(rtop)>1 || length(rtopg)>1
75 75
         || length(mtopp)>1 || length(rtopp)>1)
76 76
         warning("only first element: rtop, mtop, mtopg, rtopg, rtopp, mtopp")
77
-
77
+    
78 78
     # we consider only the first element even if input is a vector of Int
79 79
     # we cut the other arguments
80
-
80
+    
81 81
     mtop = as.integer(mtop[1])
82 82
     mtopg = as.integer(mtopg[1])
83 83
     mtopp = as.integer(mtopp[1])
84
-
84
+    
85 85
     rtop = as.integer(rtop[1])
86 86
     rtopg = as.integer(rtopg[1])
87 87
     rtopp = as.integer(rtopp[1])
88
-
88
+    
89 89
     if(mtop > 0 && mtopg >0)
90 90
     {
91 91
         warning("cannot be used together.\nWe set mtopg = 0")
... ...
@@ -97,43 +97,43 @@ order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0,
97 97
         warning("cannot be used together.\nWe set mtopp = 0")
98 98
         mtopp = 0L
99 99
     }
100
-
100
+    
101 101
     if(mtopg >0 && mtopp>0)
102 102
     {
103 103
         warning("cannot be used together.\nWe set mtopp = 0")
104 104
         mtopp = 0L
105 105
     }
106
-
106
+    
107 107
     if(rtop > 0 && rtopg >0)
108 108
     {
109 109
         warning("cannot be used together.\nWe set rtopg = 0")
110 110
         rtopg = 0L
111 111
     }
112
-
112
+    
113 113
     if(rtop >0 && rtopp>0)
114 114
     {
115 115
         warning("cannot be used together.\nWe set rtopp = 0")
116 116
         rtopp = 0L
117 117
     }
118
-
118
+    
119 119
     if(rtopg >0 && rtopp>0)
120 120
     {
121 121
         warning("cannot be used together.\nWe set rtopp = 0")
122 122
         rtopp = 0L
123 123
     }
124
-
124
+    
125 125
     if(!is.null(metadata_ordering))
126 126
         meta_matrix <- .jarray(.ordering_meta(metadata_ordering),
127 127
                                     dispatch = TRUE)
128 128
     else
129 129
         meta_matrix <- .jnull("java/lang/String")
130
-
130
+    
131 131
     if(!is.null(regions_ordering))
132 132
         region_matrix <- .jarray(.ordering_meta(regions_ordering),
133 133
                                     dispatch = TRUE)
134 134
     else
135 135
         region_matrix <- .jnull("java/lang/String")
136
-  
136
+    
137 137
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
138 138
     response <- WrappeR$order(meta_matrix, mtopg, mtop, mtopp, region_matrix,
139 139
                                 rtopg, rtop, rtopp, input_data$value)
... ...
@@ -169,6 +169,7 @@ order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0,
169 169
     }
170 170
     else
171 171
         stop("only list or character")
172
+    
172 173
 }
173 174
 
174 175
 
... ...
@@ -95,34 +95,34 @@ project <-function(input_data, metadata = NULL, metadata_update=NULL,
95 95
     {
96 96
         if(!is.character(metadata))
97 97
             stop("metadata: no valid input")
98
-
98
+        
99 99
         metadata <- metadata[!metadata %in% ""]
100 100
         metadata <- metadata[!duplicated(metadata)]
101
-
101
+        
102 102
         if(length(metadata)==0)
103 103
             metadata <- .jnull("java/lang/String")
104
-    
104
+        
105 105
         metadata <- .jarray(metadata)
106 106
     }
107 107
     else
108 108
         metadata <- .jnull("java/lang/String")
109
-
109
+    
110 110
     if(!is.null(regions))
111 111
     {
112 112
         if(!is.character(regions))
113 113
             stop("regions: no valid input")
114
-
114
+        
115 115
         regions = regions[!regions %in% ""]
116 116
         regions = regions[!duplicated(regions)]
117
-
117
+        
118 118
         if(length(regions)==0)
119 119
             regions <- .jnull("java/lang/String")
120
-    
120
+        
121 121
         regions <- .jarray(regions)
122 122
     }
123 123
     else
124 124
         regions <- .jnull("java/lang/String")
125
-
125
+    
126 126
     reg_update <- substitute(regions_update)
127 127
     if(!is.null(reg_update))
128 128
     {
... ...
@@ -131,7 +131,7 @@ project <-function(input_data, metadata = NULL, metadata_update=NULL,
131 131
     }
132 132
     else
133 133
         regions_update <- .jnull("java/lang/String")
134
-  
134
+    
135 135
     meta_update <- substitute(metadata_update)
136 136
     if(!is.null(meta_update))
137 137
     {
... ...
@@ -140,15 +140,15 @@ project <-function(input_data, metadata = NULL, metadata_update=NULL,
140 140
     }
141 141
     else
142 142
         metadata_update <- .jnull("java/lang/String")
143
-  
143
+    
144 144
     if(length(all_but_meta)>1)
145 145
         warning("all_but_meta: no multiple values")
146
-  
146
+    
147 147
     if(length(all_but_reg)>1)
148 148
         warning("all_but_reg: no multiple values")
149 149
     all_but_reg <- all_but_reg[1]
150 150
     all_but_meta <- all_but_meta[1]
151
-  
151
+    
152 152
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
153 153
     response <- WrappeR$project(metadata,metadata_update,all_but_meta,
154 154
                                 regions,regions_update,
... ...
@@ -52,12 +52,12 @@ init_gmql <- function(output_format = "gtf", remote_processing = FALSE,
52 52
         !identical(out_format,"COLLECT"))
53 53
         stop("output_format must be TAB, GTF or COLLECT")
54 54
     .check_logical(remote_processing)
55
-  
55
+    
56 56
     # mettere attesa da input keyboard, controllare se token già esiste 
57 57
     # da sessione precedente
58 58
     if(!is.null(url) && !exists("authToken",envir = .GlobalEnv))
59 59
         login_gmql(url,username,password)
60
-  
60
+    
61 61
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
62 62
     WrappeR$initGMQL(out_format,remote_processing)
63 63
 }
... ...
@@ -132,7 +132,7 @@ read_dataset <- function(dataset, parser = "CustomParser", is_local=TRUE,
132 132
     {
133 133
         if(!dir.exists(dataset))
134 134
             stop("folder does not exist")
135
-    
135
+        
136 136
         dataset <- sub("/*[/]$","",dataset)
137 137
         if(basename(dataset) !="files")
138 138
             dataset <- paste0(dataset,"/files")
... ...
@@ -145,16 +145,16 @@ read_dataset <- function(dataset, parser = "CustomParser", is_local=TRUE,
145 145
         url <- WrappeR$get_url()
146 146
         if(is.null(url))
147 147
             stop("You have to log on using login function")
148
-    
148
+        
149 149
         if(!exists("authToken",envir = .GlobalEnv))
150 150
             stop("You have to log on using login function")
151
-   
151
+        
152 152
         list <- show_schema(url,dataset)
153 153
         schema_names <- sapply(list$fields, function(x){x$name})
154 154
         schema_type <- sapply(list$fields, function(x){x$type})
155 155
         schema_matrix <- cbind(schema_type,schema_names)
156 156
         #schema_type <- list$type
157
-    
157
+        
158 158
         if(is.null(schema_matrix) || length(schema_matrix)==0)
159 159
             schema_matrix <- .jnull("java/lang/String")
160 160
         else
... ...
@@ -203,7 +203,7 @@ read <- function(samples)
203 203
 {
204 204
     if(!is(samples,"GRangesList"))
205 205
         stop("only GrangesList")
206
-
206
+    
207 207
     meta <- S4Vectors::metadata(samples)
208 208
     if(is.null(meta) || length(meta)==0) {
209 209
         #repeat meta for each sample in samples list
... ...
@@ -212,17 +212,19 @@ read <- function(samples)
212 212
 We provide two metadata for you")
213 213
         index_meta <- rep(1:len,each = len)
214 214
         rep_meta <- rep(c("Provider","Polimi", "Application", "R-GMQL"),
215
-                        times=len)
215
+                            times=len)
216 216
         meta_matrix <- matrix(rep_meta,ncol = 2,byrow = TRUE)
217 217
         meta_matrix <- cbind(index_meta,meta_matrix)
218 218
     }
219
-    else {
219
+    else 
220
+    {
220 221
         unlist_meta <- unlist(meta)
221 222
         names_meta <- names(unlist_meta)
222 223
         group_names <- gsub(".*_([0-9]*)\\..*","\\1", names_meta)
223 224
         names(unlist_meta) <- NULL
224 225
         meta_matrix <- cbind(group_names,names_meta,unlist_meta)
225 226
     }
227
+    
226 228
     df <- data.frame(samples)
227 229
     df <- df[-2] #delete group_name
228 230
     region_matrix <- as.matrix(sapply(df, as.character))
... ...
@@ -249,11 +251,11 @@ We provide two metadata for you")
249 251
     }
250 252
     rownames(schema_matrix) <- NULL
251 253
     colnames(schema_matrix) <- NULL
252
-  
254
+    
253 255
     schema_matrix <- .jarray(schema_matrix,dispatch = TRUE)
254 256
     meta_matrix <- .jarray(meta_matrix,dispatch = TRUE)
255 257
     region_matrix <- .jarray(region_matrix,dispatch = TRUE)
256
-  
258
+    
257 259
     WrappeR <- J("it/polimi/genomics/r/Wrapper")
258 260
     response <- WrappeR$read(meta_matrix,region_matrix,schema_matrix)
259 261
     DataSet(response)
... ...
@@ -98,7 +98,7 @@ select <- function(input_data, predicate = NULL, region_predicate = NULL,
98 98
     }
99 99
     else
100 100
         predicate <- .jnull("java/lang/String")
101
-
101
+    
102 102
     reg_pred <- substitute(region_predicate)
103 103
     if(!is.null(reg_pred))
104 104
     {
... ...
@@ -107,7 +107,7 @@ select <- function(input_data, predicate = NULL, region_predicate = NULL,
107 107
     }
108 108
     else
109 109
         region_predicate <- .jnull("java/lang/String")
110
-
110
+    
111 111
     if(is.null(semi_join) && is.null(semi_join_dataset))
112 112
     {
113 113
         join_condition_matrix <- .jnull("java/lang/String")
... ...
@@ -22,7 +22,7 @@
22 22
                                 full.names = TRUE)
23 23
     if(length(schema_name)==0)
24 24
         stop("schema not present")
25
-
25
+    
26 26
     xml_schema <- xml2::read_xml(schema_name)
27 27
     list_field <- xml2::as_list(xml_schema)
28 28
     vector_field <- unlist(list_field)
... ...
@@ -33,10 +33,10 @@
33 33
 {
34 34
     if(!is.list(meta_data))
35 35
         stop("meta_data: invalid input")
36
-
36
+    
37 37
     if(!all(sapply(meta_data, function(x) is(x,class))))
38 38
         stop("All elements must be META_AGGREGATES object")
39
-
39
+    
40 40
     names <- names(meta_data)
41 41
     if(is.null(names))
42 42
     {
... ...
@@ -46,12 +46,13 @@
46 46
     else
47 47
     {
48 48
         if("" %in% names)
49
-        stop("No partial names assignment is allowed")
49
+            stop("No partial names assignment is allowed")
50 50
     }
51 51
     aggregate_matrix <- t(sapply(meta_data, function(x) {
52 52
         new_value = as.character(x)
53 53
         matrix <- matrix(new_value)
54 54
     }))
55
+    
55 56
     m_names <- matrix(names)
56 57
     metadata_matrix <- cbind(m_names,aggregate_matrix)
57 58
 }
... ...
@@ -94,7 +95,7 @@
94 95
 {
95 96
     if(!is.character(value))
96 97
         stop("no valid data")
97
-  
98
+    
98 99
     if(length(value)>1)
99 100
         stop("no multiple string")
100 101
 }
... ...
@@ -103,7 +104,7 @@
103 104
 {
104 105
     if(!is.logical(value))
105 106
         stop("no valid data")
106
-  
107
+    
107 108
     if(length(value)>1)
108 109
         stop("no multiple string")
109 110
 }
... ...
@@ -5,7 +5,6 @@
5 5
     .jpackage(pkgname, lib.loc = libname)
6 6
     # tools::vignetteEngine("knitr", pattern = "[.]Rmd$", package = "knitr")
7 7
     .jinit(force.init = TRUE)
8
- 
9 8
 }
10 9
 
11 10
 .onAttach <- function(libname, pkgname) {
... ...
@@ -17,7 +17,7 @@ check.OPERATOR <- function(value)
17 17
     {
18 18
         if(is.character(value) && length(value)>1)
19 19
             stop("value: no multiple string")
20
-    
20
+        
21 21
         if(!is.character(value))
22 22
             stop("value: is not a string")
23 23
     }
... ...
@@ -5,9 +5,7 @@
5 5
 
6 6
 ORDER <- function(value)
7 7
 {
8
-  op_list <- list(
9
-    value = value
10
-  )
8
+  op_list <- list(value = value)
11 9
   ## Set the name for the class
12 10
   class(op_list) <- "ORDER"
13 11
   return(op_list)
... ...
@@ -17,7 +15,7 @@ check.ORDER <- function(value)
17 15
 {
18 16
   if(is.character(value) && length(value)>1)
19 17
     stop("value: no multiple string")
20
-
18
+    
21 19
   if(!is.character(value))
22 20
     stop("value: is not a string")
23 21
 }
... ...
@@ -26,8 +24,7 @@ print.ORDER <- function(obj) {
26 24
   as.character(obj)
27 25
 }
28 26
 
29
-c.ORDER <- function(...)
30
-{
27
+c.ORDER <- function(...) {
31 28
   a <- list(...)
32 29
 }
33 30
 
... ...
@@ -63,7 +60,7 @@ as.character.ORDER <- function(obj) {
63 60
 DESC <- function(value)
64 61
 {
65 62
     check.ORDER(value)
66
-
63
+    
67 64
     list <- list(value = value)
68 65
     ## Set the name for the class
69 66
     class(list) <- c("DESC","ORDER")
... ...
@@ -98,7 +95,7 @@ DESC <- function(value)
98 95
 ASC <- function(value)
99 96
 {
100 97
     check.ORDER(value)
101
-
98
+    
102 99
     list <- list(value = value)
103 100
     ## Set the name for the class
104 101
     class(list) <- c("ASC","ORDER")
... ...
@@ -201,11 +201,11 @@ run_query <- function(url, fileName, query, output_gtf = TRUE)
201 201
         out <- "GTF"
202 202
     else
203 203
         out <- "TAB"
204
-
204
+    
205 205
     URL <- paste0(url,"/queries/run/",fileName,"/",out)
206 206
     h <- c('Accept' = "Application/json",
207 207
             'Content-Type' = 'text/plain','X-Auth-Token' = authToken)
208
-
208
+    
209 209
     req <- httr::POST(URL,body = query ,httr::add_headers(h),encode = "json")
210 210
     content <- httr::content(req,"parsed")
211 211
     if(req$status_code !=200)
... ...
@@ -257,7 +257,7 @@ run_query_fromfile <- function(url, fileName, filePath, output_gtf = TRUE)
257 257
 {
258 258
     if(!file.exists(filePath))
259 259
         stop("file does not exist")
260
-
260
+    
261 261
     query <- readLines(filePath)
262 262
     run_query(url,fileName,query,output_gtf)
263 263
 }
... ...
@@ -325,7 +325,7 @@ compile_query_fromfile <- function(url ,filePath)
325 325
 {
326 326
     if(!file.exists(filePath))
327 327
         stop("file does not exist")
328
-
328
+    
329 329
     query <- readLines(filePath)
330 330
     compile_query(url,query)
331 331
 }
... ...
@@ -338,11 +338,11 @@ serialize_query <- function(url,output_gtf,base64)
338 338
         out <- "gtf"
339 339
     else
340 340
         out <- "tab"
341
-  
341
+    
342 342
     URL <- paste0(url,"/queries/dag/",out)
343 343
     h <- c('Accept' = "Application/json",
344 344
             'Content-Type' = 'text/plain','X-Auth-Token' = authToken)
345
-  
345
+    
346 346
     req <- httr::POST(URL,body = base64 ,httr::add_headers(h),encode = "json")
347 347
     content <- httr::content(req,"parsed")
348 348
     if(req$status_code !=200)
... ...
@@ -15,73 +15,131 @@ link-citations: true
15 15
 
16 16
 # Introduction
17 17
 
18
-Recent years have seen a tremendous increase in the volume of data generated in the life sciences, especially propelled by the rapid progress of Next Generation Sequencing (NGS) technologies. 
19
-This high-throughput technologies can produce billions of short DNA or RNA fragments in excess of a few terabytes of data in a single run.
20
-Next-generation sequencing refers to the deep, in-parallel DNA sequencing technologies providing massively parallel analysis and extremely high-throughput from multiple samples at much reduced cost. Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions. 
21
-To make effective use of the produced data, the design of big data algorithms and their efficient implementation on modern high performance computing infrastructures,
22
-such as clouds, CPU clusters and network infrastructures, is required in order to achieve scalability and performance. 
23
-For this purpose the GenoMetric Query Language (GMQL) has been proposed as high-level, declarative language to process, query, and compare multiple and heterogeneous genomic datasets for biomedical knowledge discovery [@Bioinformatics2015]
18
+Recent years have seen a tremendous increase in the volume of data generated 
19
+in the life sciences, especially propelled by the rapid progress of 
20
+Next Generation Sequencing (NGS) technologies. 
21
+These high-throughput technologies can produce billions of short DNA or RNA 
22
+fragments in excess of a few terabytes of data in a single run.
23
+Next-generation sequencing refers to the deep, in-parallel DNA sequencing 
24
+technologies providing massively parallel analysis and extremely 
25
+high-throughput from multiple samples at much reduced cost. 
26
+Improvement of sequencing technologies and data processing pipelines 
27
+is rapidly providing sequencing data, with associated high-level features, 
28
+of many individual genomes in multiple biological and clinical conditions. 
29
+To make effective use of the produced data, the design of big data algorithms 
30
+and their efficient implementation on modern high performance 
31
+computing infrastructures, such as clouds, CPU clusters 
32
+and network infrastructures, is required in order to achieve scalability 
33
+and performance. 
34
+For this purpose the GenoMetric Query Language (GMQL) has been proposed 
35
+as a high-level, declarative language to process, query, 
36
+and compare multiple and heterogeneous genomic datasets for biomedical 
37
+knowledge discovery [@Bioinformatics2015].
24 38
 
25 39
 ## Purpose
26 40
 
27
-A very important emerging problem is to make sense of the enormous amount and variety of NGS data becoming available, i.e. to discover how different genomic regions and their products interact and cooperate with each other. 
28
-To this aim, the integration of several heterogeneous DNA feature data is required.
29
-Such big genomic feature data are collected within numerous and heterogeneous files, usually distributed within different repositories, lacking an attribute-based organization and a systematic description of their metadata. 
30
-These heterogeneous data can contain the hidden answer to very important biomedical questions.
31
-To inveil them, standard tools already available for knowledge extraction are too specialized or present powerful features, but have a rough interface not well-suited for scientists/biologists.
32
-GMQL addresses these aspects using cloud-based technologies (including Apache Hadoop, mapReduce, and Spark), and focusing on genomic data operations written as simple queries with implicit iterations over thousands of heterogeneous samples, computed efficiently [@IEEE7484654].
33
-This RGMQL package makes easy to take advantage of GMQL functionalities also to scientists and biologists with limited knowledge of query and programming languages, but used to the R/Bioconductor environment. 
34
-This package is built over a GMQL scalable data management engine written in Scala programming language, released as Scala API [@githubrepo] providing a set of functions to combine, manipulate, compare, and extract genomic data from different datasources both from local and remote datasets.
35
-These functions allow performing complex GMQL processing and queries without knowledge of GMQL syntax, but leveraging on R idiomatic paradigm and logic.
41
+A very important emerging problem is to make sense of the enormous amount and 
42
+variety of NGS data becoming available, i.e. to discover how different genomic 
43
+regions and their products interact and cooperate with each other. 
44
+To this aim, the integration of several heterogeneous DNA feature data 
45
+is required.
46
+Such big genomic feature data are collected within numerous and 
47
+heterogeneous files, usually distributed within different repositories, 
48
+lacking an attribute-based organization and a systematic description 
49
+of their metadata. 
50
+These heterogeneous data can contain the hidden answer to very important 
51
+biomedical questions.
52
+To unveil them, standard tools already available for knowledge extraction 
53
+are too specialized or present powerful features, but have a rough interface 
54
+not well-suited for scientists/biologists.
55
+GMQL addresses these aspects using cloud-based technologies 
56
+(including Apache Hadoop, mapReduce, and Spark), and focusing on genomic data 
57
+operations written as simple queries with implicit iterations over thousands 
58
+of heterogeneous samples, computed efficiently [@IEEE7484654].
59
+This RGMQL package makes it easy to take advantage of GMQL functionalities also 
60
+to scientists and biologists with limited knowledge of query and 
61
+programming languages, but used to the R/Bioconductor environment. 
62
+This package is built over a GMQL scalable data management engine 
63
+written in Scala programming language, released as Scala API [@githubrepo] 
64
+providing a set of functions to combine, manipulate, compare, and extract 
65
+genomic data from different datasources both from local and remote datasets.
66
+These functions allow performing complex GMQL processing and queries without 
67
+knowledge of GMQL syntax, but leveraging on R idiomatic paradigm and logic.
36 68
 
37 69
 
38 70
 # Genomic Data Model
39 71
 
40
-The Genomic Data Model (GDM) is based on the notions of datasets and samples[@modeling2016] 
41
-Datasets are collections of samples, and each sample consists of two parts, the region data, which describe portions of the genome, and the metadata, which describe sample general properties and how observations are collected.
42
-In contrast to other data models, it clearly divides, and comprehensively manages, observations about genomic regions and metadata.
43
-GDM provides a flat attribute based organization, just requiring that each dataset is associated with a given data schema, which specifies the attributes and their type of region data.
44
-The first attributes of such schema are fixed (chr, start, end, strand); they represent the genomic region identifying coordinates.
72
+The Genomic Data Model (GDM) is based on the notions of datasets 
73
+and samples [@modeling2016]. 
74
+Datasets are collections of samples, and each sample consists of two parts, 
75
+the region data, which describe portions of the genome, and the metadata, 
76
+which describe sample general properties and how observations are collected.
77
+In contrast to other data models, it clearly divides, and comprehensively 
78
+manages, observations about genomic regions and metadata.
79
+GDM provides a flat attribute based organization, just requiring that 
80
+each dataset is associated with a given data schema, which specifies 
81
+the attributes and their type of region data.
82
+The first attributes of such schema are fixed (chr, start, end, strand); 
83
+they represent the genomic region identifying coordinates.
45 84
 In addition, metadata have free attribute-value pair format.
46 85
 
47 86
 ## Genomic Region 
48 87
 
49
-Genomic region data describe a broad variety of biomolecular aspects and are very valuable for biomolecular investigation.
50
-A genomic region is a portion of a genome, qualified by a quadruple of values called region coordinates:
88
+Genomic region data describe a broad variety of biomolecular aspects and are 
89
+very valuable for biomolecular investigation.
90
+A genomic region is a portion of a genome, qualified by a quadruple of values 
91
+called region coordinates:
51 92
 $$< chr, left, right, strand >$$
52
-Regions can have an arbitrary number of associated values, according to the processing of DNA, RNA or epigenomic sequencing reads that determined the region.
93
+Regions can have an arbitrary number of associated values, according to 
94
+the processing of DNA, RNA or epigenomic sequencing reads that determined 
95
+the region.
53 96
 
54 97
 ## Metadata
55 98
 
56
-Metadata describe the biological and clinical properties associated with each sample.
57
-They are usually collected in a broad variety of data structures and formats that constitute barriers to their use and comparison GDM models metadata simply as arbitrary semi-structured attribute-value pairs, where attributes may have multiple values.
99
+Metadata describe the biological and clinical properties associated with 
100
+each sample.
101
+They are usually collected in a broad variety of data structures and formats 
102
+that constitute barriers to their use and comparison. GDM models metadata 
103
+simply as arbitrary semi-structured attribute-value pairs, 
104
+where attributes may have multiple values.
58 105
 
59 106
 ## Genomic Sample
60 107
 
61
-Formally, a sample s is a collection of genomic regions modeled as the following triple: $$< id, {< r_i,v_i >}, {m_j} >$$ where:
108
+Formally, a sample s is a collection of genomic regions modeled as 
109
+the following triple: $$< id, {< r_i,v_i >}, {m_j} >$$ where:
62 110
 
63 111
 * id is the sample identifier
64 112
 * Each region is a pair of coordinates $r_i$ and values $v_i$
65 113
 * Metadata $m_j$ are attribute-value pairs $< a_j,v_j >$
66 114
 
67
-Note that the sample id attribute provides a many-to-many connection between regions and metadata of a sample.
68
-Through the use of a data type system to express region data, and of arbitrary attribute-value pairs for metadata, GDM provides interoperability across datasets in multiple formats produced by different experimental techniques.
115
+Note that the sample id attribute provides a many-to-many connection between 
116
+regions and metadata of a sample.
117
+Through the use of a data type system to express region data, and of arbitrary 
118
+attribute-value pairs for metadata, GDM provides interoperability across 
119
+datasets in multiple formats produced by different experimental techniques.
69 120
 
70 121
 ## Dataset
71 122
 
72
-A dataset is a collection of samples uniquely identified, with the same region schema and with each sample consisting of two parts:
123
+A dataset is a collection of samples uniquely identified, with the same region 
124
+schema and with each sample consisting of two parts:
73 125
 
74 126
 * region data: describing characteristics and location of genomic portions
75 127
 * metadata: expressing general properties of the sample
76 128
 
77
-Each dataset is typically produced within the same project by using the same or equivalent technology and tools, but with different experimental conditions, described by metadata.
129
+Each dataset is typically produced within the same project by using the same 
130
+or equivalent technology and tools, but with different experimental 
131
+conditions, described by metadata.
78 132
 
79
-Datasets contain large number of information describing regions of a genome, with data encoded in human readable format using plain text files.
133
+Datasets contain a large amount of information describing regions of a genome, 
134
+with data encoded in human readable format using plain text files.
80 135
 
81
-GMQL datasets are materialized in a standard layout composed of three types of files:
136
+GMQL datasets are materialized in a standard layout composed of three 
137
+types of files:
82 138
   
83
-1. genomic region tab-delimited text files with extension .gdm, or .gtf if in standard GTF format
84
-2. metadata attribute-value tab-delimited text files with the same fullname (name and extension) of the correspondent region file and extension .meta
139
+1. genomic region tab-delimited text files with extension .gdm, or .gtf 
140
+if in standard GTF format
141
+2. metadata attribute-value tab-delimited text files with the same fullname 
142
+(name and extension) of the correspondent region file and extension .meta
85 143
 3. schema XML file containing region attribute names and types
86 144
 
87 145
 All these files reside in a unique folder called files.
... ...
@@ -89,14 +147,22 @@ All these files reside in unique folder called files.
89 147
 <!-- ![GMQL dataset folder](dataset_gmql.png) -->
90 148
 
91 149
 In RGMQL package dataset files are considered read-only.
92
-Once read, genomic information is represented in abstract structure inside the package, mapped to a R GRanges data structure at occurency.
150
+Once read, genomic information is represented in abstract structure inside 
151
+the package, mapped to an R GRanges data structure when needed.
93 152
 
94 153
 
95 154
 # GenoMetric Query Language
96 155
 
97
-The GenoMetric Query Language name stems from the language ability to deal with genomic distances, which are measured as number of nucleotide bases between genomic regions (aligned to the same reference genome) and computed using arithmetic operations between region coordinates.
98
-GMQL is a high-level, declarative language that allows expressing queries easily over genomic regions and their metadata, in a way similar to what can be done with the Structured Query Language (SQL) over a relational database.
99
-GMQL approach exhibits two main differences with respect to other tools based on Hadoop, mapReduce framework, and Spark engine technologies to address similar biomedical problems:\newline
156
+The GenoMetric Query Language name stems from the language ability to deal 
157
+with genomic distances, which are measured as number of nucleotide bases 
158
+between genomic regions (aligned to the same reference genome) and computed 
159
+using arithmetic operations between region coordinates.
160
+GMQL is a high-level, declarative language that allows expressing queries 
161
+easily over genomic regions and their metadata, in a way similar to what can 
162
+be done with the Structured Query Language (SQL) over a relational database.
163
+GMQL approach exhibits two main differences with respect to other tools 
164
+based on Hadoop, mapReduce framework, and Spark engine technologies 
165
+to address similar biomedical problems:\newline
100 166
 
101 167
 * GMQL:
102 168
 
... ...
@@ -108,29 +174,43 @@ GMQL approach exhibits two main differences with respect to other tools based on
108 174
     1. read generally from raw or aligned data from NGS machines
109 175
     2. provide no support for metadata management
110 176
 
111
-GMQL is the appropriate tool for querying numerous processed genomic datasets and very many samples that are becoming available.
112
-Note however that GMQL performs worse than some other available systems on a small number of small-scale datasets, but these other systems are not cloud-based; hence, they are not adequate for efficient big data processing and, in some cases, they are inherently limited in their data management capacity, as they only work as RAM memory resident processes.
177
+GMQL is the appropriate tool for querying numerous processed genomic datasets 
178
+and very many samples that are becoming available.
179
+Note however that GMQL performs worse than some other available systems on a 
180
+small number of small-scale datasets, but these other systems are not 
181
+cloud-based; hence, they are not adequate for efficient big data processing 
182
+and, in some cases, they are inherently limited in their 
183
+data management capacity, as they only work as RAM memory resident processes.
113 184
 
114 185
 ## Query structure
115 186
 
116
-A GMQL operation is expressed as a sequence of GMQL operations with the following structure:
187
+A GMQL query is expressed as a sequence of GMQL operations with the 
188
+following structure:
117 189
 $$< variable > = operator(< parameters >) < variable >;$$
118 190
 where each $< variable >$ stands for a GDM dataset
119 191
 
120
-This RGMQL package brings GMQL functionalities into R environemnt, allowing users to build directly a GMQL query without knowing the GMQL syntax.
121
-In RGMQL every GMQL operations is translated into a R function and expressed as:
192
+This RGMQL package brings GMQL functionalities into the R environment, 
193
+allowing users to build directly a GMQL query without knowing the GMQL syntax.
194
+In RGMQL every GMQL operation is translated into an R function 
195
+and expressed as:
122 196
 $$ variable = operator(variable, parameters)$$
123 197
 
124
-It is very similar to the GMQL syntax for operation expression although expressed with the R idiomatic paradigm and logic, with parameters totaly builded using R native data structures such as lists, matrices, vectors or R logic conditions.
198
+It is very similar to the GMQL syntax for operation expression although 
199
+expressed with the R idiomatic paradigm and logic, with parameters totally 
200
+built using R native data structures such as lists, matrices, 
201
+vectors or R logic conditions.
125 202
 
126 203
 
127 204
 # Processing Environments
128 205
 
129
-In this section, we show how GMQL processing is built in R, which operations are available in RGMQL, and the difference beetween local and remote dataset processing.
206
+In this section, we show how GMQL processing is built in R, which operations 
207
+are available in RGMQL, and the difference between local 
208
+and remote dataset processing.
130 209
 
131 210
 ## Local Processing
132 211
 
133
-RGMQL local processing consumes computational power directly from local CPUs/system while managing datasets (both GMQL or generic text plain datasets).
212
+RGMQL local processing consumes computational power directly from local 
213
+CPUs/system while managing datasets (both GMQL or generic text plain datasets).
134 214
 
135 215
 ### Initialization
136 216
 
... ...
@@ -138,22 +218,27 @@ Load and attach the GMQL package in a R session using library function:
138 218
 ```{r, initialization, eval = TRUE}
139 219
 library('RGMQL')
140 220
 ```
141
-Before starting using any GMQL operation we need to initialise the GMQL context with the following code:
221
+Before starting using any GMQL operation we need to initialise the GMQL 
222
+context with the following code:
142 223
 ```{r, init, eval = TRUE}
143 224
 init_gmql()
144 225
 ```
145
-The function *init_gmql()* initializes the context of scalable data management engine laid upon Spark and Hadoop.
146
-Details on this and all other functions are provided in the R documentation for this package (e.g., help(RGMQL)).
226
+The function *init_gmql()* initializes the context of scalable data management 
227
+engine laid upon Spark and Hadoop.
228
+Details on this and all other functions are provided in the R documentation 
229
+for this package (e.g., help(RGMQL)).
147 230
 
148 231
 ### Read Dataset
149 232
 
150 233
 After initialization we need to read datasets.
151
-We already defined above the formal definition of dataset and the power of GMQL to deal with data in a variety of standard tab-delimited text formats.
234
+We already defined above the formal definition of dataset and the power of 
235
+GMQL to deal with data in a variety of standard tab-delimited text formats.
152 236
 In the following, we show how to get data from different sources.\newline
153 237
 We distinguish two different cases:
154 238
 
155 239
 1. Local dataset:\newline
156
-A local dataset is a folder with sample files (region files and correspondent metadata files) on the user computer.
240
+A local dataset is a folder with sample files (region files and correspondent 
241
+metadata files) on the user computer.
157 242
 As data are already in the user computer, we simply execute:
158 243
 
159 244
 ```{r, read GMQL dataset, eval = TRUE}
... ...
@@ -161,10 +246,13 @@ gmql_dataset_path <- system.file("example", "EXON", package = "RGMQL")
161 246
 data_out = read_dataset(gmql_dataset_path)
162 247
 ```
163 248
 In this case we are reading a dataset named EXON specified by path.
164
-It doens't matter what kind of format the data are, *read_dataset()* read many standard tab-delimited text formats without specified any paramter at input.
249
+It doesn't matter what format the data are in: *read_dataset()* reads many 
250
+standard tab-delimited text formats without specifying any parameter at input.
165 251
 
166 252
 2. GRangesList:\newline
167
-For better integration in the R environment and with other packages, we provide a *read()* function to read directly from R memory/environment using GRangesList as input.
253
+For better integration in the R environment and with other packages, 
254
+we provide a *read()* function to read directly from R memory/environment 
255
+using GRangesList as input.
168 256
 
169 257
 ```{r, read GRangesList, eval = TRUE}
170 258
 library("GenomicRanges")
... ...
@@ -182,19 +270,27 @@ grl <- GRangesList("txA" = gr1, "txB" = gr2)
182 270
 data_out <- read(grl)
183 271
 ```
184 272
 In this example we show how versatile the RGMQL package is.
185
-As specified above, we can directly read a list of GRanges previously created starting from two GRanges.
186
-Both *read()* and *read_dataset()* functions returns a result object, in this case *data_out* containing an internal R representation of the dataset used as input for executing the subsequent GMQL operation.
273
+As specified above, we can directly read a list of GRanges previously created 
274
+starting from two GRanges.
275
+Both *read()* and *read_dataset()* functions return a result object, 
276
+in this case *data_out* containing an internal R representation of the dataset 
277
+used as input for executing the subsequent GMQL operation.
187 278
 
188 279
 ### Queries
189 280
 
190 281
 GMQL is not a traditional query language:
191
-With "query" we intend a group of operations that together produce a result; in this sense GMQL queries are more similar to SQL scripts.
192
-GMQL programming consists of a series of select, union, project, difference (and so on ...) commands.
282
+By "query" we mean a group of operations that together produce a result; 
283
+in this sense GMQL queries are more similar to SQL scripts.
284
+GMQL programming consists of a series of select, union, project, 
285
+difference (and so on ...) commands.
193 286
 
194 287
 Let's see some short examples:
195 288
 
196 289
 1) Find somatic mutations in exons.
197
-???Consider mutation data samples of human breast cancer cases. For each sample, quantify the mutations in each exon and select the exons with at least one mutation. Return the list of samples ordered by the number of such exons.???
290
+Consider mutation data samples of human breast cancer cases. 
291
+For each sample, quantify the mutations in each exon and select the exons 
292
+with at least one mutation. Return the list of samples ordered by 
293
+the number of such exons.
198 294
 ```{r, query, eval = TRUE}
199 295
 ## Read EXON dataset containing a single sample with exon regions 
200 296
 ## and MUT dataset containing multiple samples with mutation regions
... ...
@@ -207,7 +303,8 @@ mut_ds  <- read_dataset(mut_path)
207 303
 mut = select(mut_ds, manually_curated__dataType == 'dnaseq' & 
208 304
                 clinical_patient__tumor_tissue_site == 'breast')
209 305
 
210
-exon =  select(exon_ds, annotation_type == 'exons' & original_provider == 'RefSeq')
306
+exon =  select(exon_ds, annotation_type == 'exons' & 
307
+                    original_provider == 'RefSeq')
211 308
 
212 309
 
213 310
 ## For each mutation sample, count mutations within each exon while 
... ...
@@ -230,7 +327,8 @@ exon3 <- extend(exon2, list(exon_count = COUNT()))
230 327
 exon_res = RGMQL::order(exon3, list(DESC("exon_count")))
231 328
 ```
232 329
 
233
-If you want to store persistently the result, you can materialize it into specific path defined as input parameter.
330
+If you want to store persistently the result, you can materialize it into 
331
+specific path defined as input parameter.
234 332
 ```{r, materialize, eval = TRUE}
235 333
 ## Materialize the result dataset on disk
236 334
 materialize(exon_res)
... ...
@@ -239,55 +337,72 @@ by default *materialize()* has R workig directoy as stored path.
239 337
 
240 338
 ### Execution
241 339
 
242
-GMQL processing does not store results: They remain in the environment until you invoke the *execute()* function.
340
+GMQL processing does not store results: They remain in the environment until 
341
+you invoke the *execute()* function.
243 342
 ```{r, execute, eval = FALSE}
244 343
 execute()
245 344
 ```
246
-*execute()* can be issued only if at least one *read()* and one *materialize()* are present in the GMQL query, otherwise an error is generated.
247
-Data are saved in the path specified in every *materialize()* present in the query.
345
+*execute()* can be issued only if at least one *read()* and one *materialize()* 
346
+are present in the GMQL query, otherwise an error is generated.
347
+Data are saved in the path specified in every *materialize()* present 
348
+in the query.
248 349
 
249 350
 Beside *execute()* we can use: 
250 351
 ```{r, take, eval = TRUE}
251 352
 g <- take(input_data = exon_res, rows = 45)
252 353
 ```
253
-to execute all *materialize()* commands in the GMQL query and extract data as GRangesList format, a GRanges for each materialize.
354
+to execute all *materialize()* commands in the GMQL query and extract data 
355
+as GRangesList format, a GRanges for each materialize.
254 356
 NOTE: GRangesList are contained in the R environment and are not saved on disk.
255 357
 
256
-with *rows* parameter is possible to specified how many rows, for each sample inside input dataset, will be exported; by default is $0$, that means all rows will be exported.
257
-Note that, since we are working with big data, exported all row could be very time and space consuming.
358
+With the *rows* parameter you can specify how many rows, for each sample 
359
+inside the input dataset, will be exported; the default is $0$, 
360
+that means all rows will be exported.
361
+Note that, since we are working with big data, 
362
+exporting all rows could be very time and space consuming.
258 363
 
259 364
 ## Remote Processing
260 365
 
261
-RGMQL remote processing consumes computational power from remote cluster/system while managing GMQL datasets.\newline
366
+RGMQL remote processing consumes computational power from 
367
+remote cluster/system while managing GMQL datasets.\newline
262 368
 
263 369
 Remote processing exists in two flavours:\newline
264 370
 
265 371
 - REST web services: \newline
266
-  User can write GMQL queries (using original GMQL syntax) to be executed remotely on remote data (or local data previously uploaded).
372
+  User can write GMQL queries (using original GMQL syntax) to be executed 
373
+  remotely on remote data (or local data previously uploaded).
267 374
 
268 375
 - BATCH execution: \newline
269
-  Similar to local execution; user reads data and the system automatically uploads them on the
270
-  remote system: once loaded, RGMQL functions can be issued to manage remote data.\newline
376
+  Similar to local execution; user reads data and the system automatically 
377
+  uploads them on the
378
+  remote system: once loaded, RGMQL functions can be issued to manage 
379
+  remote data.\newline
271 380
 
272 381
 ### REST web services
273 382
 
274
-This RGMQL package allows to invoke REST services implementing the commands specified at [link](http://130.186.13.219/gmql-rest/swagger/).
383
+This RGMQL package allows to invoke REST services implementing the 
384
+commands specified at [link](http://130.186.13.219/gmql-rest/swagger/).
275 385
 
276 386
 
277 387
 #### Initialization
278 388
 
279
-GMQL REST services require login; so, the first step is to perform logon with user and password, or as guest.
280
-Upon succesfull logon, you get a request token that you must use in every subsequent REST call.
389
+GMQL REST services require login; so, the first step is to perform logon 
390
+with user and password, or as guest.
391
+Upon successful logon, you get a request token that you must use 
392
+in every subsequent REST call.
281 393
 Login can be performed using the function:
282 394
 ```{r, eval = TRUE}
283 395
 test_url = "http://130.186.13.219/gmql-rest"
284 396
 login_gmql(test_url)
285 397
 ```
286
-that saves the token in the Global R environment within the variable named *authToken*. With this token you can call all the functions in the GMQL REST web services suite.
398
+that saves the token in the Global R environment within the variable 
399
+named *authToken*. With this token you can call all the functions 
400
+in the GMQL REST web services suite.
287 401
 
288 402
 #### Execution
289 403
 
290
-User can write a GMQL query as in the following example, and run it as second parameter of the *run_query()* function.
404
+User can write a GMQL query as in the following example, and run it as 
405
+second parameter of the *run_query()* function.
291 406
 ```{r, run, eval = TRUE}
292 407
 test_url = "http://130.186.13.219/gmql-rest"
293 408
 login_gmql(test_url)
... ...
@@ -302,18 +417,23 @@ query_path <- system.file("example","query1.txt", package = "RGMQL")
302 417
 job <- run_query_fromfile(test_url, "query1", query_path, output_gtf = FALSE)
303 418
 ```
304 419
 
305
-Once run, query continues on the remote server while *run_query()* and *run_query_fromfile()* returns immediately.
420
+Once run, query continues on the remote server while *run_query()* 
421
+and *run_query_fromfile()* returns immediately.
306 422
 User can extract from result (job) the job_id and status.
307
-jod_id can then be used to continuosly invoke log and trace calls, both in this RGMQL package, to check for job completed status.
423
+jod_id can then be used to continuously invoke log and trace calls, 
424
+both in this RGMQL package, to check for job completed status.
308 425
 
309 426
 ```{r, trace, eval = TRUE}
310 427
 jod_id <- job$id
311 428
 trace_job(test_url,jod_id)
312 429
 ```
313 430
 
314
-Then, results materialized on the remote repository can by downloaded locally and imported in a GRangesList using the functions in this RGMQL package.[see import/export](# Utilities)
431
+Then, results materialized on the remote repository can be downloaded 
432
+locally and imported in a GRangesList using the functions 
433
+in this RGMQL package.[see import/export](# Utilities)
315 434
 
316
-the returned *job* contains also the name of dataset, it will be materialized with  
435
+the returned *job* contains also the name of dataset, 
436
+it will be materialized with  
317 437
 ```{r, download, eval = FALSE}
318 438
 name_dataset <- job$datasets[[1]]$name
319 439
 download_dataset(test_url,name_dataset)
... ...
@@ -321,10 +441,14 @@ download_dataset(test_url,name_dataset)
321 441
 
322 442
 ### Batch execution
323 443
 
324
-This execution type is similar to local processing (syntax, functions, and so on ...) except:\newline
325
-1. materialized data are stored only on the remote repository, from where they can be download locally and imported in a GRangesList using the functions in this RGMQL package.[see import/export](# Utilities)
444
+This execution type is similar to local processing 
445
+(syntax, functions, and so on ...) except:\newline
446
+1. materialized data are stored only on the remote repository, 
447
+from where they can be downloaded locally and imported in a GRangesList 
448
+using the functions in this RGMQL package.[see import/export](# Utilities)
326 449
 
327
-Before starting with examples, note that we have to log into remote infrastructure with login function:
450
+Before starting with examples, note that we have to log into remote 
451
+infrastructure with login function:
328 452
 ```{r, login remote, eval = TRUE}
329 453
 login_gmql(test_url)
330 454
 ```
... ...
@@ -346,7 +470,8 @@ mut_ds  <- read_dataset("public.HG19_BED_ANNOTATION",is_local = FALSE)
346 470
 mut = select(mut_ds, manually_curated__dataType == 'dnaseq' & 
347 471
                 clinical_patient__tumor_tissue_site == 'breast')
348 472
 
349
-exon =  select(exon_ds, annotation_type == 'exons' & original_provider == 'RefSeq')
473
+exon =  select(exon_ds, annotation_type == 'exons' & 
474
+                    original_provider == 'RefSeq')
350 475
 
351 476
 ## For each mutation sample, count mutations within each exon while 
352 477
 ## mapping the mutations to the exon regions using the map() function 
... ...
@@ -385,7 +510,9 @@ remote_processing(TRUE)
385 510
 ```
386 511
 A user can switch mode until the first *materialize()* has been performed.
387 512
 
388
-This kind of processing comes from the fact that the *read()* function can accept both local dataset and repository dataset, even in the same query as in the following example:
513
+This kind of processing comes from the fact that the *read()* function can 
514
+accept both local dataset and repository dataset, 
515
+even in the same query as in the following example:
389 516
 ```{r, mixed query, eval = TRUE}
390 517
 
391 518
 ## Read EXON dataset containing a single sample with exon regions 
... ...
@@ -398,7 +525,8 @@ mut_ds  <- read_dataset("public.HG19_BED_ANNOTATION",is_local = FALSE)
398 525
 mut = select(mut_ds, manually_curated__dataType == 'dnaseq' & 
399 526
                 clinical_patient__tumor_tissue_site == 'breast')
400 527
 
401
-exon =  select(exon_ds, annotation_type == 'exons' & original_provider == 'RefSeq')
528
+exon =  select(exon_ds, annotation_type == 'exons' & 
529
+                    original_provider == 'RefSeq')
402 530
 
403 531
 ## For each mutation sample, count mutations within each exon while 
404 532
 ## mapping the mutations to the exon regions using the map() function 
... ...
@@ -429,9 +557,12 @@ materialize(exon_res)
429 557
 execute()
430 558
 ```
431 559
 
432
-As we can see the two *read()* functions read from different sources: *exon_ds* from local dataset, *mut_ds* from repository.
560
+As we can see the two *read()* functions read from different sources: 
561
+*exon_ds* from local dataset, *mut_ds* from repository.
433 562
 
434
-If we set local processing to false (*remote_processing(FALSE)*), the execution is performed locally downloading all remote repositories, otherwise all local dataset are automatically uploaded.
563
+If we set remote processing to false (*remote_processing(FALSE)*), 
564
+the execution is performed locally downloading all remote repositories, 
565
+otherwise all local datasets are automatically uploaded.
435 566
 
436 567
 NOTE:
437 568
 
... ...
@@ -439,8 +570,10 @@ The public dataset cannot be downloaded from repositories by design
439 570
 
440 571
 # Utilities
441 572
 
442
-RGMQL package contains functions that allow the user to interface with other packages available in R/Bioconductor repository, e.g. TFARM, GenomicRanges.
443
-These functions return GRangesList or GRanges with metadata associated, if present, as data structure suitable to further processing in other packages.
573
+RGMQL package contains functions that allow the user to interface with other 
574
+packages available in R/Bioconductor repository, e.g. TFARM, GenomicRanges.
575
+These functions return GRangesList or GRanges with metadata associated, 
576
+if present, as data structure suitable to further processing in other packages.
444 577
 
445 578
 ## Import/Export
446 579
 
... ...
@@ -450,21 +583,21 @@ dataset_path <- system.file("example", "EXON", package = "RGMQL")
450 583
 data <- import_gmql(dataset_path, is_gtf = FALSE)
451 584
 data
452 585
 ```
453
-the second parameter *is_gtf* must be specified if the file format are .GTF or .GDM.
586
+The second parameter *is_gtf* must specify the file format: .GTF or .GDM.
454 587
 
455 588
 We can export a GRangesList as GMQL dataset as follows:
456 589
 ```{r, export, eval = FALSE}
457 590
 
458 591
 dir_out <- system.file("example", "dir_out", package = "RGMQL")
459
-## The third parameter TRUE indicates that the GRangesList data is exported in GTF format
460 592
 export_gmql(data, dir_out, is_gtf = TRUE)
461 593
 
462 594
 ```
463
-the second parameter *is_gtf* specified if the file format are .GTF or .GDM.
595
+The third parameter *is_gtf* specifies the file format: .GTF or .GDM.
464 596
 
465 597
 ## Filter and extract
466 598
 
467
-We can also import only a part of a GMQL dataset into R environment, by filtering its content as follows:
599
+We can also import only a part of a GMQL dataset into R environment, 
600
+by filtering its content as follows:
468 601
 ```{r, filter_extract, eval = TRUE}
469 602
 
470 603
 data_in <- system.file("example", "TEAD", package = "RGMQL")
... ...
@@ -472,11 +605,16 @@ matrix <- filter_and_extract(data_in, metadata = NULL,regions = c("count"))
472 605
 matrix
473 606
 
474 607
 ```
475
-*filter_and_extract()* filters the samples in dataset based of their specified *metadata*, and then extracts as metadata columns of GRanges the vector of region attributes you want to retrieve.
608
+*filter_and_extract()* filters the samples in dataset based on their specified 
609
+*metadata*, and then extracts as metadata columns of GRanges the vector 
610
+of region attributes you want to retrieve.
476 611
 If the argument is NULL, all samples will be taken.
477
-the number of columns will be equal to the number of samples left after filtering.
478
-If *regions* is not specified, only the foundamental elements of GRanges will be shown.
479
-Note that this function works only if every sample in dataset has the same region coordinates, in terms of value and total number.
612
+the number of columns will be equal to the number of samples left after 
613
+filtering.
614
+If *regions* is not specified, only the fundamental elements of GRanges 
615
+will be shown.
616
+Note that this function works only if every sample in dataset has the same 
617
+region coordinates, in terms of value and total number.
480 618
 
481 619
 # References
482 620