Browse code

passing to rJava

Simone authored on 16/10/2017 16:18:37
Showing 73 changed files

... ...
@@ -1,7 +1,7 @@
1 1
 Package: RGMQL
2 2
 Type: Package
3 3
 Title: GenoMetric Query Language for R/Bioconductor
4
-Version: 0.99.2
4
+Version: 0.99.3
5 5
 Author: Simone Pallotta, Marco Masseroli
6 6
 Maintainer: Simone Pallotta <simonepallotta@hotmail.com>
7 7
 Description: This package brings GMQL functionalities into the R environment.
... ...
@@ -1,30 +1,76 @@
1 1
 # Generated by roxygen2: do not edit by hand
2 2
 
3
+export(ASC)
4
+export(AVG)
5
+export(BAG)
6
+export(COUNT)
7
+export(DEF)
8
+export(DESC)
9
+export(DG)
10
+export(DGE)
11
+export(DL)
12
+export(DLE)
13
+export(DOWN)
14
+export(EXACT)
15
+export(FULL)
16
+export(MAX)
17
+export(MD)
18
+export(MEDIAN)
19
+export(MIN)
20
+export(Q1)
21
+export(Q2)
22
+export(Q3)
23
+export(STD)
24
+export(SUM)
3 25
 export(TFARMatrix)
4 26
 export(TFARMemAtrix)
27
+export(UP)
28
+export(compileQuery)
29
+export(compileQuery.fromfile)
30
+export(cover)
5 31
 export(deleteDataset)
32
+export(difference)
6 33
 export(downloadDataset)
7 34
 export(downloadDatasetToGrangesList)
35
+export(execute)
8 36
 export(exportGMQL.gdm)
9 37
 export(exportGMQL.gtf)
10 38
 export(extend)
39
+export(flat)
40
+export(histogram)
11 41
 export(importGMQL.gdm)
12 42
 export(importGMQL.gtf)
13 43
 export(initGMQL)
44
+export(join)
14 45
 export(login.GMQL)
15 46
 export(logout.GMQL)
47
+export(map)
48
+export(materialize)
49
+export(merge)
16 50
 export(metadataFromSample)
51
+export(order)
52
+export(project)
17 53
 export(read)
18 54
 export(readDataset)
19 55
 export(regionFromSample)
20 56
 export(register.GMQL)
21 57
 export(remote_processing)
58
+export(runQuery)
59
+export(runQuery.fromfile)
22 60
 export(saveQuery)
23 61
 export(saveQuery.fromfile)
62
+export(select)
24 63
 export(showDatasets)
64
+export(showJobLog)
65
+export(showJobs)
25 66
 export(showQueries)
26 67
 export(showSamplesFromDataset)
27 68
 export(showSchemaFromDataset)
69
+export(stopJob)
70
+export(summit)
71
+export(take)
72
+export(traceJob)
73
+export(union)
28 74
 export(uploadSamples)
29 75
 import(GenomicRanges)
30 76
 import(httr)
... ...
@@ -36,12 +82,15 @@ importFrom(data.table,fread)
36 82
 importFrom(dplyr,bind_cols)
37 83
 importFrom(methods,is)
38 84
 importFrom(plyr,revalue)
85
+importFrom(rJava,.jarray)
86
+importFrom(rJava,.jevalArray)
39 87
 importFrom(rJava,.jinit)
40 88
 importFrom(rJava,.jnull)
41 89
 importFrom(rJava,.jpackage)
42 90
 importFrom(rJava,J)
43 91
 importFrom(rtracklayer,export)
44 92
 importFrom(rtracklayer,import)
93
+importFrom(stats,setNames)
45 94
 importFrom(utils,read.delim)
46 95
 importFrom(utils,unzip)
47 96
 importFrom(utils,write.table)
48 97
new file mode 100644
... ...
@@ -0,0 +1,360 @@
1
+#' GMQL Operation: COVER
2
+#'
3
+#' it takes as input a dataset and returns another dataset (with a single sample, if no \emph{groupby} option is specified)
4
+#' by “collapsing” the input dataset samples and their regions according to certain rules specified by the input parameters.
5
+#' The attributes of the output genomic regions are only the region coordinates, and Jaccard indexes (JaccardIntersect and JaccardResult).
6
+#' Jaccard Indexes are standard measures of similarity of the contributing regions, added as default region attributes.
7
+#' The JaccardIntersect index is calculated as the ratio between the lengths of the intersection
8
+#' and of the union of the contributing regions; the JaccardResult index is calculated as the ratio
9
+#' between the lengths of region and the union of the contributing regions.
10
+#' If aggregate functions are specified, new attributes are added.
11
+#' Output metadata are the union of the input ones.
12
+#' If \emph{groupby} clause is specified, the input samples are partitioned in groups,
13
+#' each with distinct values of the grouping metadata attributes, and the COVER operation is separately
14
+#' applied to each group, yielding one sample in the result for each group.
15
+#' Input samples that do not satisfy the \emph{groupby} condition are disregarded.
16
+#'
17
+#' @importFrom methods is
18
+#' @importFrom rJava J
19
+#' @importFrom rJava .jnull
20
+#' @importFrom rJava .jarray
21
+#' 
22
+#' @param input_data returned object from any GMQL function
23
+#' @param minAcc minimum number of overlapping regions to be considered during execution.
24
+#' Is a single integer number, declared also as string.
25
+#' minAcc accept ALL and string like (ALL+N)/K as special keyword 
26
+#' ALL sets the minimum to the number of samples in the input dataset
27
+#' @param maxAcc maximum number of overlapping regions to be considered during execution.
28
+#' Is a single integer number, declared also as string.
29
+#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword 
30
+#' ALL sets the maximum to the number of samples in the input dataset
31
+#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping 
32
+#' @param groupBy list of CONDITION objects, or simple string concatenation 
33
+#' (i.e c("cell_type","attribute_tag","size")).
34
+#' Every object contains the name of metadata to be used in \emph{groupby}.
35
+#' For details of CONDITION objects see:
36
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}}
37
+#' 
38
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") )
39
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
40
+#' 
41
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}.
42
+#' The \emph{function_aggregate} is an object of class OPERATOR
43
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}},
44
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}},
45
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, 
46
+#' \code{\link{Q3}}.
47
+#' Every operator accepts a string value, except for COUNT that cannot have a value.
48
+#' Argument of 'function_aggregate' must exist in schema
49
+#' Two style are allowed:
50
+#' \itemize{
51
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue")
52
+#' \item list of values: e.g. SUM("pvalue")
53
+#' }
54
+#' "mixed style" is not allowed
55
+#'
56
+#' @return DAGgraph class object. It contains the value associated to the graph used 
57
+#' as input for the subsequent GMQL function
58
+#' 
59
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
60
+#'
61
+#' @seealso  \code{\link{summit}} \code{\link{flat}} \code{\link{histogram}}
62
+#'
63
+#' @examples
64
+#' 
65
+#' ## This GMQL statement produces an output dataset with a single output sample. 
66
+#' ## The COVER operation considers all areas defined by a minimum of two overlapping regions 
67
+#' ## in the input samples, up to any amount of overlapping regions.
68
+#' 
69
+#' initGMQL("gtf")
70
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
71
+#' exp = readDataset(test_path)
72
+#' res = cover(input_data = exp,2,"ANY")
73
+#'
74
+#' \dontrun{
75
+#' ## This GMQL statement computes the result grouping the input exp samples by the values of 
76
+#' ## their cell metadata attribute, 
77
+#' ## thus one output res sample is generated for each cell type; 
78
+#' ## output regions are produced where at least 2 and at most 3 regions of grouped exp samples 
79
+#' ## overlap, setting as attributes of the resulting regions the minimum pvalue of the overlapping regions 
80
+#' ## (min_pvalue) and their Jaccard indexes (JaccardIntersect and JaccardResult).
81
+#' 
82
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
83
+#' exp = read(test_path)
84
+#' res = cover(input_data = exp,2,3, c("cell"), list(min_pValue = MIN("pvalue")))
85
+#' }
86
+#' @export
87
+#'
88
cover <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) {
  # Public entry point for the COVER operation: every argument is handed
  # over unchanged to the internal .doVariant() helper, which is told to
  # run the "COVER" variant.
  .doVariant(
    flag = "COVER",
    minAcc = minAcc,
    maxAcc = maxAcc,
    groupBy = groupBy,
    aggregates = aggregates,
    input_data = input_data
  )
}
92
+
93
+#' GMQL Operation: HISTOGRAM
94
+#'
95
+#' returns the non-overlapping regions contributing to the cover,
96
+#' each with its accumulation index value, which is assigned to the AccIndex region attribute.
97
+#'
98
+#' @importFrom methods is
99
+#' @importFrom rJava J
100
+#' @importFrom rJava .jnull
101
+#' @importFrom rJava .jarray
102
+#' 
103
+#' @param input_data returned object from any GMQL function
104
+#' @param minAcc minimum number of overlapping regions to be considered during execution
105
+#' normally is a single integer number, declared also as string.
106
+#' minAcc accept ALL and string like (ALL+N)/K as special keyword 
107
+#' ALL sets the minimum to the number of samples in the input dataset
108
+#' @param maxAcc maximum number of overlapping regions to be considered during execution
109
+#' normally is a single integer number, declared also as string.
110
+#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword 
111
+#' ALL sets the maximum to the number of samples in the input dataset
112
+#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping 
113
+#' @param groupBy list of CONDITION objects, or simple string concatenation 
114
+#' (i.e c("cell_type","attribute_tag","size")).
115
+#' Every object contains the name of metadata to be used in \emph{groupby}.
116
+#' For details of CONDITION objects see:
117
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}}
118
+#' 
119
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") )
120
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
121
+#' 
122
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}.
123
+#' The \emph{function_aggregate} is an object of class OPERATOR
124
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}},
125
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}},
126
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, 
127
+#' \code{\link{Q3}}.
128
+#' Every operator accepts a string value, except for COUNT that cannot have a value.
129
+#' Argument of 'function_aggregate' must exist in schema
130
+#' Two style are allowed:
131
+#' \itemize{
132
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue")
133
+#' \item list of values: e.g. SUM("pvalue")
134
+#' }
135
+#' "mixed style" is not allowed
136
+#'
137
+#' @return DAGgraph class object. It contains the value associated to the graph used 
138
+#' as input for the subsequent GMQL function
139
+#' 
140
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
141
+#' @seealso \code{\link{flat}} \code{\link{cover}} \code{\link{summit}}
142
+#'
143
+#' @examples
144
+#'
145
+#' ## This GMQL statement computes the result grouping the input \emph{exp} samples 
146
+#' ## by the values of their \emph{cell} metadata attribute, 
147
+#' ## thus one output \emph{res} sample is generated for each cell type. 
148
+#' ## Output regions are produced by dividing results from COVER in contiguous subregions 
149
+#' ## according to the varying accumulation values (from 2 to 4 in this case): 
150
+#' ## one region for each accumulation value;
151
+#'
152
+#' initGMQL("gtf")
153
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
154
+#' exp = readDataset(test_path)
155
+#' res = histogram(exp, 2,4,groupBy = c("cell"))
156
+#' 
157
+#' @export
158
+#'
159
histogram <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) {
  # Public entry point for the HISTOGRAM operation: every argument is handed
  # over unchanged to the internal .doVariant() helper, which is told to
  # run the "HISTOGRAM" variant.
  .doVariant(
    flag = "HISTOGRAM",
    minAcc = minAcc,
    maxAcc = maxAcc,
    groupBy = groupBy,
    aggregates = aggregates,
    input_data = input_data
  )
}
163
+
164
+#' GMQL Operation: SUMMIT
165
+#'
166
+#' returns regions that start from a position
167
+#' where the number of intersecting regions is not increasing afterwards and stops
168
+#' at a position where either the number of intersecting regions decreases,
169
+#' or it violates the max accumulation index.
170
+#'
171
+#' @importFrom methods is
172
+#' @importFrom rJava J
173
+#' @importFrom rJava .jnull
174
+#' @importFrom rJava .jarray
175
+#' 
176
+#' @param input_data returned object from any GMQL function
177
+#' @param minAcc minimum number of overlapping regions to be considered during execution
178
+#' normally is a single integer number, declared also as string.
179
+#' minAcc accept ALL and string like (ALL+N)/K as special keyword 
180
+#' ALL sets the minimum to the number of samples in the input dataset
181
+#' @param maxAcc maximum number of overlapping regions to be considered during execution
182
+#' normally is a single integer number, declared also as string.
183
+#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword 
184
+#' ALL sets the maximum to the number of samples in the input dataset
185
+#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping 
186
+#' @param groupBy list of CONDITION objects, or simple string concatenation 
187
+#' (i.e c("cell_type","attribute_tag","size")).
188
+#' Every object contains the name of metadata to be used in \emph{groupby}.
189
+#' For details of CONDITION objects see:
190
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}}
191
+#' 
192
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") )
193
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
194
+#' 
195
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}.
196
+#' The \emph{function_aggregate} is an object of class OPERATOR
197
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}},
198
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}},
199
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, 
200
+#' \code{\link{Q3}}.
201
+#' Every operator accepts a string value, except for COUNT that cannot have a value.
202
+#' Argument of 'function_aggregate' must exist in schema
203
+#' Two style are allowed:
204
+#' \itemize{
205
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue")
206
+#' \item list of values: e.g. SUM("pvalue")
207
+#' }
208
+#' "mixed style" is not allowed
209
+#'
210
+#' @return DAGgraph class object. It contains the value associated to the graph used 
211
+#' as input for the subsequent GMQL function
212
+#' 
213
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
214
+#' @seealso \code{\link{flat}} \code{\link{cover}} \code{\link{histogram}}
215
+#'
216
+#' @examples
217
+#'
218
+#' ## This GMQL statement computes the result grouping the input \emph{exp} samples by the values 
219
+#' ## of their \emph{cell} metadata attribute, thus one output \emph{res} sample is generated 
220
+#' ## for each cell type.
221
+#' ## Output regions are produced by extracting the highest accumulation overlapping 
222
+#' ## (sub)regions according to the methodologies described above;
223
+#'
224
+#'
225
+#' initGMQL("gtf")
226
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
227
+#' exp = readDataset(test_path)
228
+#' res = summit(input_data = exp,2,4, c("cell"))
229
+#' 
230
+#' @export
231
+#'
232
summit <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) {
  # Public entry point for the SUMMIT operation: every argument is handed
  # over unchanged to the internal .doVariant() helper, which is told to
  # run the "SUMMIT" variant.
  .doVariant(
    flag = "SUMMIT",
    minAcc = minAcc,
    maxAcc = maxAcc,
    groupBy = groupBy,
    aggregates = aggregates,
    input_data = input_data
  )
}
236
+
237
+#' GMQL Operation: FLAT
238
+#'
239
+#' returns the contiguous region that starts from the first end and stops at
240
+#' the last end of the regions which would contribute to each region of the COVER
241
+#'
242
+#' @importFrom methods is
243
+#' @importFrom rJava J
244
+#' @importFrom rJava .jnull
245
+#' @importFrom rJava .jarray
246
+#' 
247
+#' @param input_data returned object from any GMQL function
248
+#' @param minAcc minimum number of overlapping regions to be considered during execution
249
+#' normally is a single integer number, declared also as string.
250
+#' minAcc accept ALL and string like (ALL+N)/K as special keyword 
251
+#' ALL sets the minimum to the number of samples in the input dataset
252
+#' @param maxAcc maximum number of overlapping regions to be considered during execution
253
+#' normally is a single integer number, declared also as string.
254
+#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword 
255
+#' ALL sets the maximum to the number of samples in the input dataset
256
+#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping 
257
+#' @param groupBy list of CONDITION objects, or simple string concatenation 
258
+#' (i.e c("cell_type","attribute_tag","size")).
259
+#' Every object contains the name of metadata to be used in \emph{groupBy}.
260
+#' For details of CONDITION objects see:
261
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}}
262
+#' 
263
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") )
264
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
265
+#' 
266
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}.
267
+#' The \emph{function_aggregate} is an object of class OPERATOR
268
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}},
269
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}},
270
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, 
271
+#' \code{\link{Q3}}.
272
+#' Every operator accepts a string value, except for COUNT that cannot have a value.
273
+#' Argument of 'function_aggregate' must exist in schema
274
+#' Two style are allowed:
275
+#' \itemize{
276
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue")
277
+#' \item list of values: e.g. SUM("pvalue")
278
+#' }
279
+#' "mixed style" is not allowed
280
+#'
281
+#' @return DAGgraph class object. It contains the value associated to the graph used 
282
+#' as input for the subsequent GMQL function
283
+#' 
284
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
285
+#' @seealso \code{\link{summit}} \code{\link{cover}} \code{\link{histogram}}
286
+#'
287
+#' @examples
288
+#' 
289
+#' ## This GMQL statement computes the result grouping the input \emph{exp} samples by 
290
+#' ## the values of their \emph{cell} metadata attribute, thus one output \emph{res} sample 
291
+#' ## is generated for each cell type. 
292
+#' ## Output regions are produced by concatenating all regions which would have been used 
293
+#' ## to construct a COVER(2,4) statement on the same dataset; 
294
+#' 
295
+#' initGMQL("gtf")
296
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
297
+#' exp = readDataset(test_path)
298
+#' res = flat(input_data = exp,2,4, c("cell"))
299
+#'
300
+#' @export
301
+#'
302
flat <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) {
  # Public entry point for the FLAT operation: every argument is handed
  # over unchanged to the internal .doVariant() helper, which is told to
  # run the "FLAT" variant.
  .doVariant(
    flag = "FLAT",
    minAcc = minAcc,
    maxAcc = maxAcc,
    groupBy = groupBy,
    aggregates = aggregates,
    input_data = input_data
  )
}
306
+
307
.doVariant <- function(flag,minAcc,maxAcc,groupBy,aggregates,input_data)
{
  # Shared back end of cover(), flat(), summit() and histogram().
  #
  # flag        one of "COVER", "FLAT", "SUMMIT", "HISTOGRAM"; selects which
  #             method of the Java wrapper is invoked
  # minAcc      lower accumulation bound, validated by .check_cover_param()
  # maxAcc      upper accumulation bound, validated by .check_cover_param()
  # groupBy     grouping conditions or NULL
  # aggregates  aggregate operators or NULL
  # input_data  object whose `value` element is handed to the wrapper
  #
  # Returns DAGgraph(<second element of the wrapper response>) when the first
  # element of the response parses to 0; otherwise raises an error carrying
  # the second element as its message.

  # Local names deliberately avoid `min`/`max` so the base functions are not
  # shadowed inside this body.
  min_acc <- .check_cover_param(minAcc,TRUE)
  max_acc <- .check_cover_param(maxAcc,FALSE)

  # A missing groupBy / aggregates is sent to Java as a null reference.
  if(!is.null(groupBy))
    join_condition_matrix <- .jarray(.join_condition(groupBy),dispatch = TRUE)
  else
    join_condition_matrix <- .jnull("java/lang/String")

  if(!is.null(aggregates))
    metadata_matrix <- .jarray(.aggregates(aggregates,"OPERATOR"),dispatch = TRUE)
  else
    metadata_matrix <- .jnull("java/lang/String")

  WrappeR <- J("it/polimi/genomics/r/Wrapper")
  response <- switch(flag,
                "COVER" = WrappeR$cover(min_acc,max_acc,join_condition_matrix,metadata_matrix,input_data$value),
                "FLAT" = WrappeR$flat(min_acc,max_acc,join_condition_matrix,metadata_matrix,input_data$value),
                "SUMMIT" = WrappeR$summit(min_acc,max_acc,join_condition_matrix,metadata_matrix,input_data$value),
                "HISTOGRAM" = WrappeR$histogram(min_acc,max_acc,join_condition_matrix,metadata_matrix,input_data$value),
                # Without a default branch switch() would return NULL and the
                # status check below would fail with "argument is of length
                # zero"; fail with an explicit message instead.
                stop("unknown cover variant: ", flag))

  error <- strtoi(response[1])
  data <- response[2]

  if(error!=0)
    stop(data)
  else
    DAGgraph(data)
}
337
+
338
.check_cover_param <- function(param,is_min)
{
  # Validates one accumulation bound (minAcc / maxAcc) and returns it as a
  # single string.
  #
  # param   a single number greater than 0, or a single string
  # is_min  TRUE when `param` is the lower bound; the string "ANY" is then
  #         rejected
  #
  # Raises an error for inputs longer than one element, for zero-length or
  # NA inputs, for numbers <= 0, for "ANY" used as minimum, and for any
  # other type.

  if(length(param)>1)
    stop("length > 1")

  # Zero-length vectors and NA used to either crash inside `if()` with an
  # unrelated message or slip through unchanged; reject them up front.
  if(length(param)==0 || is.na(param))
    stop("invalid input data")

  if(is.numeric(param))
  {
    # Zero is rejected as well as negatives, so say so in the message.
    if(param<=0)
      stop("value must be greater than 0")

    # as.character() may switch to scientific notation for doubles
    # (100000 -> "1e+05"); format() with scientific = FALSE always yields
    # plain digits.
    return(format(param, scientific = FALSE, trim = TRUE))
  }
  else if(is.character(param))
  {
    if(is_min && identical(param,"ANY"))
      stop("min cannot assume ANY as value")
    return(param)
  }
  else
    stop("invalid input data")
}
359
+
360
+
0 361
new file mode 100644
... ...
@@ -0,0 +1,83 @@
1
+#' GMQL Operation: DIFFERENCE
2
+#'
3
+#' It produces one sample in the result for each sample of the left operand,
4
+#' by keeping the same metadata of the left input sample and only those regions
5
+#' (with their schema and values) of the left input sample which do not intersect with any region
6
+#' in the right operand sample.
7
+#' The optional \emph{joinby} clause is used to extract a subset of couples
8
+#' from the cartesian product of two dataset \emph{left_input_data} x \emph{right_input_data}
9
+#' on which to apply the DIFFERENCE operator:
10
+#' only those samples that have the same value for each attribute
11
+#' are considered when performing the difference.
12
+#'
13
+#' @importFrom rJava J
14
+#' @importFrom rJava .jnull
15
+#' @importFrom rJava .jarray
16
+#' 
17
+#' @param right_input_data returned object from any GMQL function
18
+#' @param left_input_data returned object from any GMQL function
19
+#' @param joinBy list of CONDITION objects, or simple string concatenation 
20
+#' (i.e c("cell_type","attribute_tag","size")).
21
+#' Every object contains the name of metadata to be used in \emph{joinby}.
22
+#' For details of CONDITION objects see:
23
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}}
24
+#' 
25
+#' Every condition accepts only one string value (e.g. DEF("cell_type") )
26
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
27
+#' 
28
+#' @param is_exact single logical value: TRUE means that the region difference is executed only 
29
+#' on regions in left_input_data with exactly the same coordinates of at least one region present 
30
+#' in right_input_data; if is_exact = FALSE, the difference is executed on all regions in 
31
+#' left_input_data that overlap with at least one region in right_input_data (even just one base).
32
+#'
33
+#' @return DAGgraph class object. It contains the value associated to the graph used 
34
+#' as input for the subsequent GMQL function
35
+#'
36
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
37
+#'
38
+#' @examples
39
+#'
40
+#' ## This GMQL statement returns all the regions in the first dataset that do not 
41
+#' ## overlap any region in the second dataset.
42
+#' 
43
+#' initGMQL("gtf")
44
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
45
+#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "RGMQL")
46
+#' r_left = readDataset(test_path)
47
+#' r_right = readDataset(test_path2)
48
+#' out = difference(r_left,r_right)
49
+#' 
50
+#' \dontrun{
51
+#' ## This GMQL statement extracts for every pair of samples s1 in EXP1 and s2 in EXP2
52
+#' ## having the same value of the metadata attribute 'antibody_target'
53
+#' ## the regions that appear in s1 but do not overlap any region in s2; 
54
+#' ## metadata of the result are the same as the metadata of s1.
55
+#' 
56
+#' initGMQL("gtf")
57
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
58
+#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "RGMQL")
59
+#' exp1 = readDataset(test_path)
60
+#' exp2 = readDataset(test_path2)
61
+#' out = difference(exp1,exp2, c("antibody_target"))
62
+#'
63
+#' }
64
+#'
65
+#' @export
66
+#'
67
difference <- function(left_input_data, right_input_data, joinBy = NULL,is_exact = FALSE) {
  # Builds the DIFFERENCE step: converts the optional joinBy conditions to a
  # Java array (or a Java null when joinBy is NULL), calls the Java wrapper
  # and turns its two-element response into either an error or a DAGgraph.

  if (is.null(joinBy)) {
    join_condition_matrix <- .jnull("java/lang/String")
  } else {
    join_condition_matrix <- .jarray(.join_condition(joinBy),dispatch = TRUE)
  }

  wrapper <- J("it/polimi/genomics/r/Wrapper")

  # NOTE(review): the right operand is passed before the left one here;
  # confirm this matches the parameter order of Wrapper.difference.
  response <- wrapper$difference(
    join_condition_matrix,
    right_input_data$value,
    left_input_data$value,
    is_exact
  )

  # First element: status code (0 means success); second: payload / message.
  status <- strtoi(response[1])
  payload <- response[2]

  if (status != 0) {
    stop(payload)
  }
  DAGgraph(payload)
}
83
+
... ...
@@ -6,6 +6,7 @@
6 6
 #'
7 7
 #' @importFrom rJava .jnull
8 8
 #' @importFrom rJava J
9
+#' @importFrom rJava .jarray
9 10
 #'
10 11
 #' @param input_data returned object from any GMQL function
11 12
 #' @param metadata list of element in the form \emph{key} = \emph{function_aggregate}.
... ...
@@ -49,7 +50,7 @@
49 50
 #' ##  2. MinP is the minimum pvalue of the sample regions.
50 51
 #' ## res sample regions are the same as the ones in exp.
51 52
 #' 
52
-#' res = extend(input_data = exp, list(RegionCount = COUNT(),MinP = MIN(pvalue)))
53
+#' res = extend(input_data = exp, list(RegionCount = COUNT(),MinP = MIN("pvalue")))
53 54
 #' 
54 55
 #' }
55 56
 #' 
... ...
@@ -58,7 +59,7 @@
58 59
 extend <-function(input_data, metadata = NULL)
59 60
 {
60 61
   if(!is.null(metadata))
61
-    metadata_matrix <- .aggregates(metadata,"META_OPERATOR")
62
+    metadata_matrix <- .jarray(.aggregates(metadata,"META_OPERATOR"),dispatch = TRUE)
62 63
   else
63 64
     metadata_matrix <- .jnull("java/lang/String")
64 65
   
65 66
new file mode 100644
... ...
@@ -0,0 +1,124 @@
1
+#' GMQL Operation: JOIN
2
+#'
3
+#' It takes as input two datasets, respectively known as anchor (left) and experiment (right), and returns
4
+#' a dataset of samples consisting of regions extracted from the operands according to the specified condition
5
+#' (a.k.a genometric_predicate).
6
+#' The number of generated output samples is the Cartesian product of the number of samples
7
+#' in the anchor and in the experiment dataset (if joinBy is not specified).
8
+#' The output metadata are the union of the input metadata, with their attribute names prefixed with
9
+#' left or right respectively.
10
+#'
11
+#' @importFrom rJava .jnull
12
+#' @importFrom rJava J
13
+#' @importFrom rJava .jarray
14
+#' 
15
+#' @param left_input_data returned object from any GMQL function
16
+#' @param right_input_data returned object from any GMQL function
17
+#' @param genometric_predicate is a list of lists of DISTAL object by means of logical ANDs
18
+#' For details of DISTAL objects see:
19
+#' \code{\link{DLE}}, \code{\link{DGE}}, \code{\link{MD}}, \code{\link{UP}}, 
20
+#' \code{\link{DOWN}}, \code{\link{DL}}, \code{\link{DG}}
21
+#' 
22
+#' @param joinBy list of CONDITION objects, or simple string concatenation 
23
+#' (i.e c("cell_type","attribute_tag","size")).
24
+#' Every object contains the name of metadata to be used in \emph{joinby}.
25
+#' For details of CONDITION objects see:
26
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}}
27
+#' 
28
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") )
29
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
30
+#' 
31
+#' @param region_output single string that declare which region is given in output for each input pair of left dataset
32
+#' right dataset regions satisfying the genometric predicate:
33
+#' \itemize{
34
+#' \item{left: outputs the anchor regions from left_input_data that satisfy the genometric predicate}
35
+#' \item{right: outputs the experiment regions from right_input_data that satisfy the genometric predicate}
36
+#' \item{int (intersection): outputs the overlapping part (intersection) of the left_input_data and right_input_data
37
+#' regions that satisfy the genometric predicate; if the intersection is empty, no output is produced}
38
+#' \item{contig: outputs the concatenation between the left_input_data and right_input_data regions that satisfy
39
+#' the genometric predicate, (i.e. the output region is defined as having left (right) coordinates
40
+#' equal to the minimum (maximum) of the corresponding coordinate values in the left_input_data and right_input_data
41
+#' regions satisfying the genometric predicate)}
42
+#' }
43
+#'
44
+#' @return DAGgraph class object. It contains the value associated to the graph used 
45
+#' as input for the subsequent GMQL function
46
+#'
47
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
48
+#'
49
+#'
50
+#' @examples
51
+#' 
52
+#' ## Given a dataset 'hm' and one called 'tss' with a sample including Transcription Start Site annotations,
53
+#' ## it searches for those regions of hm that are at a minimal distance from a transcription start site (TSS) 
54
+#' ## and takes the first/closest one for each TSS, 
55
+#' ## provided that such distance is lesser than 120K bases and joined 'tss' and 'hm' samples are obtained 
56
+#' ## from the same provider (joinby clause).
57
+#' 
58
+#' initGMQL("gtf")
59
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
60
+#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "RGMQL")
61
+#' TSS = readDataset(test_path)
62
+#' HM = readDataset(test_path2)
63
+#' join_data = join(TSS,HM,genometric_predicate=list(list(MD(1),DLE(120000))),c("provider"),region_output="RIGHT")
64
+#'
65
+#' @export
66
+#'
67
join <- function(right_input_data, left_input_data, genometric_predicate = NULL,
                 joinBy = NULL, region_output="contig")
{
  # Builds the JOIN step and returns a DAGgraph for the resulting node.
  #
  # right_input_data, left_input_data
  #     objects whose `value` element is handed to the Java wrapper.
  #     NOTE(review): the first formal is named right_input_data, while the
  #     roxygen @param list and the example put the left (anchor) dataset
  #     first; confirm which order Wrapper.join expects.
  # genometric_predicate
  #     NULL, or a non-empty list of lists of DISTAL objects; at most four
  #     DISTAL objects per inner list are used (extra ones are dropped with
  #     a warning).
  # joinBy
  #     NULL, or conditions converted by .join_condition().
  # region_output
  #     one of "contig", "left", "right", "int" (case-insensitive).
  #
  # Raises an error on malformed arguments or when the wrapper reports a
  # non-zero status.

  if(!is.null(genometric_predicate))
  {
    if(!is.list(genometric_predicate) || length(genometric_predicate)==0)
      stop("genometric_predicate must be list of lists")

    if(!all(vapply(genometric_predicate, is.list, logical(1))))
      stop("genometric_predicate must be list of lists")

    # Validate each inner list and keep the (possibly truncated) result, so
    # the truncation announced by the warning is what actually gets used.
    genometric_predicate <- lapply(genometric_predicate, function(list_pred) {
      if(length(list_pred)>4)
      {
        warning("only 4 element per list, we cut the rest")
        list_pred <- list_pred[seq_len(4)]
      }

      if(!all(vapply(list_pred, function(x) is(x,"DISTAL"), logical(1))))
        stop("All elements should be DISTAL object")

      list_pred
    })

    # One row per inner list: the character form of every DISTAL object,
    # padded with the string "NA" and cut to exactly 8 cells.
    # presumably as.character() yields two strings per DISTAL object
    # (hence 4 objects -> 8 cells) -- TODO confirm against the DISTAL class.
    genomatrix <- do.call(rbind, lapply(genometric_predicate, function(list_pred) {
      dist_array <- unlist(lapply(list_pred, as.character), use.names = FALSE)
      c(dist_array, rep("NA", 8))[seq_len(8)]
    }))

    genomatrix <- .jarray(genomatrix, dispatch = TRUE)
  }
  else
    genomatrix <- .jnull("java/lang/String")

  if(!is.null(joinBy))
    join_condition_matrix <- .jarray(.join_condition(joinBy),dispatch = TRUE)
  else
    join_condition_matrix <- .jnull("java/lang/String")

  output <- toupper(region_output)
  if(length(output)!=1 || !(output %in% c("CONTIG","LEFT","RIGHT","INT")))
    stop("region_output must be contig,left,right or int (intersection)")

  WrappeR <- J("it/polimi/genomics/r/Wrapper")
  response <- WrappeR$join(genomatrix,join_condition_matrix, output,right_input_data$value, left_input_data$value)

  # First element: status code (0 means success); second: payload / message.
  error <- strtoi(response[1])
  data <- response[2]
  if(error!=0)
    stop(data)
  else
    DAGgraph(data)
}
0 125
new file mode 100644
... ...
@@ -0,0 +1,88 @@
1
+#' GMQL Operation: MAP
2
+#'
3
+#' It computes, for each sample in the right dataset, aggregates over the values of the right regions
4
+#' that intersect with a region in a left sample, for each region of each sample in the left dataset;
5
+#' The number of generated output samples is the Cartesian product of the samples in the two input datasets;
6
+#' each output sample has the same regions as the related input left sample, with their attributes and values,
7
+#' plus the attributes computed as aggregates over right region values.
8
+#' Output sample metadata are the union of the related input sample metadata,
9
+#' whose attribute names are prefixed with "left" or "right" respectively.
10
+#'
11
+#' When the joinby clause is present, only pairs of samples of left_input_data and of right_input_data with
12
+#' metadata M1 and M2 respectively that satisfy the joinby condition are considered.
13
+#'
14
+#' The clause consists of a list of metadata attribute names that must be present with equal values
15
+#' in both M1 and  M2
16
+#'
17
+#'
18
+#' @param left_input_data returned object from any GMQL function
19
+#' @param right_input_data returned object from any GMQL function
20
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}.
21
+#' The \emph{function_aggregate} is an object of class OPERATOR
22
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}},
23
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}},
24
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, \code{\link{Q3}}.
25
+#' Every operator accepts a string value, except for COUNT that cannot have a value.
26
+#' Argument of 'function_aggregate' must exist in schema
27
+#' Two style are allowed:
28
+#' \itemize{
29
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue")
30
+#' \item list of values: e.g. SUM("pvalue")
31
+#' }
32
+#' "mixed style" is not allowed
33
+#'
34
+#' @param joinBy list of CONDITION objects, or simple string concatenation 
35
+#' (i.e c("cell_type","attribute_tag","size")).
36
+#' Every object contains the name of metadata to be used in \emph{groupby}.
37
+#' For details of CONDITION objects see:
38
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}}
39
+#' 
40
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") )
41
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
42
+#' 
43
+#' @return DAGgraph class object. It contains the value associated to the graph used 
44
+#' as input for the subsequent GMQL function
45
+#'
46
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
47
+#'
48
+#' @examples
49
+#'
50
+#' ## it counts the number of regions in each sample from exp that overlap with a ref region, 
51
+#' ## and for each ref region it computes the minimum score of all the regions in each exp sample 
52
+#' ## that overlap with it. 
53
+#' ## The MAP joinby option ensures that only the exp samples referring to the same 'cell_tissue' 
54
+#' ## of a ref sample are mapped on such ref sample; 
55
+#' ## exp samples with no cell_tissue metadata attribute, or with such metadata 
56
+#' ## but with a different value from the one(s) of ref sample(s), are disregarded.
57
+#' 
58
+#' initGMQL("gtf")
59
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
60
+#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "RGMQL")
61
+#' exp = readDataset(test_path)
62
+#' ref = readDataset(test_path2)
63
+#' out = map(ref,exp, list(minScore = MIN("score")), joinBy = c("cell_tissue") )
64
+#' 
65
+#' 
66
+#' @export
67
+#'
68
+map <- function(left_input_data, right_input_data, aggregates = NULL, joinBy = NULL)
69
+{
70
+  if(!is.null(aggregates))
71
+    metadata_matrix <- .jarray(.aggregates(aggregates,"OPERATOR"),dispatch = TRUE)
72
+  else
73
+    metadata_matrix = .jnull("java/lang/String")
74
+
75
+  if(!is.null(joinBy))
76
+    join_condition_matrix <- .jarray(.join_condition(joinBy),dispatch = TRUE)
77
+  else
78
+    join_condition_matrix <- .jnull("java/lang/String")
79
+  
80
+  WrappeR <- J("it/polimi/genomics/r/Wrapper")
81
+  response<-WrappeR$map(join_condition_matrix,metadata_matrix,left_input_data$value,right_input_data$value)
82
+  error <- strtoi(response[1])
83
+  data <- response[2]
84
+  if(error!=0)
85
+    stop(data)
86
+  else
87
+    DAGgraph(data)
88
+}
0 89
new file mode 100644
... ...
@@ -0,0 +1,197 @@
1
+#' GMQL Function: EXECUTE
2
+#'
3
+#' execute GMQL query.
4
+#' The function works only after invoking at least one materialize
5
+#'
6
+#' @details 
7
+#' 
8
+#' After invoking execution function, all variables associated to DAG will be removed
9
+#' from Scala environment, although the associated R variable will remain stored in R environment
10
+#'
11
+#' @importFrom rJava J
12
+#' 
13
+#' @return None
14
+#'
15
+#' @examples
16
+#'
17
+#' initGMQL()
18
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
19
+#' r = readDataset(test_path)
20
+#' s = select(input_data = r)
21
+#' m = merge(groupBy = c("antibody_target","cell_karyotype"),input_data = s)
22
+#' materialize(input_data = m, dir_out = test_path)
23
+#' 
24
+#' \dontrun{
25
+#' execute()
26
+#' }
27
+#' @export
28
+#'
29
+execute <- function()
30
+{
31
+  WrappeR <- J("it/polimi/genomics/r/Wrapper")
32
+  remote_proc <- WrappeR$is_remote_processing()
33
+  if(!remote_proc)
34
+    .download_or_upload()
35
+  
36
+  response <- WrappeR$execute()
37
+  error <- strtoi(response[1])
38
+  data <- response[2]
39
+  if(error!=0)
40
+    stop(data)
41
+  else
42
+  {
43
+    if(remote_proc)
44
+    {
45
+      url <- WrappeR$get_url()
46
+      .download_or_upload()
47
+      serializeQuery(url,FALSE,data)
48
+    }
49
+  }
50
+}
51
+
52
+.download_or_upload <- function()
53
+{
54
+  WrappeR <- J("it/polimi/genomics/r/Wrapper")
55
+  data <- WrappeR$get_dataset_list()
56
+  data_list <- apply(data, 1, as.list)
57
+  url <- WrappeR$get_url()
58
+  remote <- WrappeR$is_remote_processing()
59
+  if(remote)
60
+  {
61
+    sapply(data_list,function(x){
62
+      uploadSamples(url,x[[2]],x[[1]],x[[3]],FALSE)
63
+    })
64
+  }
65
+  else
66
+  {
67
+    sapply(data_list,function(x){
68
+      downloadDataset(url,x[[2]],x[[1]])
69
+    })
70
+  }
71
+}
72
+
73
+#' GMQL Operation: MATERIALIZE
74
+#'
75
+#' It saves the contents of a dataset that contains samples metadata and samples regions.
76
+#' It is normally used to persist the contents of any dataset generated during a GMQL query.
77
+#' Any dataset can be materialized, but the operation can be very time-consuming.
78
+#' For best performance, materialize the relevant data only.
79
+#'
80
+#' @importFrom rJava J
81
+#' 
82
+#' @param input_data returned object from any GMQL function
83
+#' @param dir_out destination folder path.
84
+#' by default is current working directory of the R process
85
+#'
86
+#' @return None
87
+#'
88
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
89
+#'
90
+#' @examples
91
+#'
92
+#' initGMQL("gtf")
93
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
94
+#' r = readDataset(test_path)
95
+#' s = select(input_data = r)
96
+#' m = merge(groupBy = c("antibody_target","cell_karyotype"),input_data = s)
97
+#' materialize(input_data = m, dir_out = test_path)
98
+#' 
99
+#' @export
100
+#'
101
+materialize <- function(input_data, dir_out = getwd())
102
+{
103
+  WrappeR <- J("it/polimi/genomics/r/Wrapper")
104
+  response <- WrappeR$materialize(input_data$value,dir_out)
105
+  error <- strtoi(response[1])
106
+  data <- response[2]
107
+  if(error!=0)
108
+    stop(data)
109
+  else
110
+    invisible(NULL)
111
+}
112
+
113
+
114
+#' GMQL Operation: TAKE
115
+#'
116
+#' It saves the contents of a dataset that contains samples metadata and samples regions.
117
+#' It is normally used to store in memory the contents of any dataset generated during a GMQL query.
118
+#' the operation can be very time-consuming.
119
+#' If you have invoked any materialization before take function, all those dataset will be materialized 
120
+#' as folder (like if execution was invoked)
121
+#'
122
+#' @import GenomicRanges
123
+#' @importFrom stats setNames
124
+#' @importFrom rJava J
125
+#' @importFrom rJava .jevalArray
126
+#' 
127
+#' @param input_data returned object from any GMQL function
128
+#' @param rows number of rows of each sample's regions that you want to retrieve and store in memory
129
+#' by default is 0 that means take all rows for each sample
130
+#'
131
+#' @return GrangesList with associated metadata
132
+#'
133
+#' @examples
134
+#'
135
+#' initGMQL()
136
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
137
+#' r = readDataset(test_path)
138
+#' m = merge(groupBy = c("antibody_target","cell_karyotype"),input_data = r)
139
+#' g <- take(input_data = m, rows = 45)
140
+#' 
141
+#' @export
142
+#'
143
+take <- function(input_data, rows=0L)
144
+{
145
+  rows <- as.integer(rows[1])
146
+  if(is.na(rows) || rows<0)
147
+    stop("rows must be a non-negative integer")
148
+  # as.integer() yields NA for non-numeric input, hence the explicit NA guard above
149
+  WrappeR <- J("it/polimi/genomics/r/Wrapper")
150
+  response <- WrappeR$take(input_data$value,rows)
151
+  error <- strtoi(response[1])
152
+  data <- response[2]
153
+  if(error!=0)
154
+    stop(data)
155
+  # fetch regions, metadata and schema from the wrapper; each one is mandatory
156
+  reg <- .jevalArray(WrappeR$get_reg(),simplify = TRUE)
157
+  if(is.null(reg))
158
+    stop("no regions defined")
159
+  meta <- .jevalArray(WrappeR$get_meta(),simplify = TRUE)
160
+  if(is.null(meta))
161
+    stop("no metadata defined")
162
+  schema <- .jevalArray(WrappeR$get_schema(),simplify = TRUE)
163
+  if(is.null(schema))
164
+    stop("no schema defined")
165
+  # group region rows by their first column (presumably the sample id - TODO confirm)
166
+  reg_data_frame <- as.data.frame(reg)
167
+  regions_by_sample <- split(reg_data_frame, reg_data_frame[1])
168
+  column_names <- c("seqname","start","end","strand",schema)
169
+  # build one GRanges per group after dropping the grouping column
170
+  sampleList <- lapply(regions_by_sample, function(x){
171
+    x <- x[-1]
172
+    names(x) <- column_names
173
+    GenomicRanges::makeGRangesFromDataFrame(x,keep.extra.columns = TRUE,
174
+                                            start.field = "start",end.field = "end")
175
+  })
176
+  gRange_list <- GRangesList(sampleList)
177
+  
178
+  meta_list <- .metadata_from_frame_to_list(meta)
179
+  # attach the metadata list to the returned GRangesList
180
+  S4Vectors::metadata(gRange_list) <- meta_list
181
+  return(gRange_list)
182
+}
183
+
184
+.metadata_from_frame_to_list <- function(metadata_frame)
185
+{
186
+  meta_frame <- as.data.frame(metadata_frame)
187
+  list <- split(meta_frame, meta_frame[1])
188
+  name_value_list <- lapply(list, function(x){
189
+    x <- x[-1]
190
+  })
191
+  meta_list <- lapply(name_value_list, function(x){
192
+    stats::setNames(as.list(as.character(x[[2]])), x[[1]])
193
+  })
194
+}
195
+
196
+
197
+
0 198
new file mode 100644
... ...
@@ -0,0 +1,60 @@
1
+#' GMQL Operation: MERGE
2
+#'
3
+#' It builds a dataset consisting of a single sample having as many regions
4
+#' as the number of regions of the input data and as many metadata as the union of
5
+#' the 'attribute-value' tuples of the input samples.
6
+#' A groupby clause can be specified on metadata: the samples are then partitioned in groups,
7
+#' each with a distinct value of the grouping metadata attributes.
8
+#' The operation is separately applied to each group, yielding one sample in the result for each group.
9
+#' Samples whose names are not present in the grouping metadata parameter are disregarded.
10
+#'
11
+#' @importFrom rJava J
12
+#' @importFrom rJava .jnull
13
+#' @importFrom rJava .jarray
14
+#'  
15
+#' @param input_data returned object from any GMQL function
16
+#' @param groupBy list of CONDITION objects, or simple string concatenation 
17
+#' (i.e c("cell_type","attribute_tag","size")).
18
+#' Every object contains the name of metadata to be used in \emph{groupBy}.
19
+#' For details of CONDITION objects see:
20
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}}
21
+#' 
22
+#' Every condition accepts only one string value (e.g. DEF("cell_type") )
23
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
24
+#' 
25
+#' @return DAGgraph class object. It contains the value associated to the graph used 
26
+#' as input for the subsequent GMQL function
27
+#'
28
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
29
+#'
30
+#' @examples
31
+#' 
32
+#' ## it creates a dataset called merged which contains one sample for each antibody_target value 
33
+#' ## found within the metadata of the exp dataset sample; 
34
+#' ## each created sample contains all regions from all 'exp' samples with a specific value for their 
35
+#' ## antibody_target metadata attribute.
36
+#' 
37
+#' initGMQL("gtf")
38
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
39
+#' exp = readDataset(test_path)
40
+#' merged = merge(input_data = exp, groupBy = c("antibody_target"))
41
+#' 
42
+#' @export
43
+#'
44
+merge <- function(input_data, groupBy = NULL)
45
+{
46
+  if(!is.null(groupBy))
47
+    join_condition_matrix <- .jarray(.join_condition(groupBy),dispatch = TRUE)
48
+  else
49
+    join_condition_matrix <- .jnull("java/lang/String")
50
+  
51
+  WrappeR <- J("it/polimi/genomics/r/Wrapper")
52
+  response <- WrappeR$merge(join_condition_matrix,input_data$value)
53
+  error <- strtoi(response[1])
54
+  data <- response[2]
55
+  if(error!=0)
56
+    stop(data)
57
+  else
58
+    DAGgraph(data)
59
+}
60
+
0 61
new file mode 100644
... ...
@@ -0,0 +1,166 @@
1
+#' GMQL operation: ORDER
2
+#'
3
+#' It is used to order either samples or sample regions or both,
4
+#' according to a set of metadata and/or region attributes, and/or region coordinates.
5
+#' Order can be specified as ascending / descending for every attribute
6
+#' The number of samples and their regions remain the same (unless mtop/rtop parameters specified)
7
+#' but a new ordering metadata and/or region attribute is added.
8
+#' Sorted samples or regions have a new attribute "order", added to either metadata, or regions,
9
+#' or both of them as specified in input
10
+#' The input mtop = k and rtop = m extracts the first k samples and m regions respectively,
11
+#' the clause mtopg = k and rtopg = m performs grouping operation,
12
+#' grouping by identical values of ordering attributes
13
+#' and then selects the first k samples or regions of each group
14
+#'
15
+#' @importFrom rJava J
16
+#' @importFrom rJava .jnull
17
+#' @importFrom rJava .jarray
18
+#' 
19
+#' @param input_data "url-like" string taken from GMQL function
20
+#' @param metadata_ordering list of ORDER objects where every object contains the name of metadata
21
+#' The ORDER's available are: \code{\link{ASC}}, \code{\link{DESC}}
22
+#' Every condition accepts only one string value. (e.g. ASC("cell_type") )
23
+#' @param mtop integer value specifying the first k samples.
24
+#' default is 0 that means every sample must be considered
25
+#' @param mtopg integer value specifying the first j samples in each group.
26
+#' default is 0 that means every sample must be considered
27
+#' @param mtopp integer value specifying the first j samples in each group.
28
+#' default is 0 that means every sample must be considered
29
+#' @param regions_ordering list of ORDER objects where every object contains the name of region schema value
30
+#' The ORDER's available are: ASC, DESC.
31
+#' Every condition accepts only one string value. (e.g. DESC("pvalue") )
32
+#' @param rtop integer value specifying the first m samples in each group.
33
+#' default is 0 that means every sample must be considered
34
+#' @param rtopg integer value specifying the first i samples in each group.
35
+#' default is 0 that means every sample must be considered
36
+#' @param rtopp integer value specifying the first i samples in each group.
37
+#' default is 0 that means every sample must be considered
38
+#'
39
+#'
40
+#' @return DAGgraph class object. It contains the value associated to the graph used 
41
+#' as input for the subsequent GMQL function
42
+#' 
43
+#' @details
44
+#' mtop, mtopg,mtopp, rtop, rtopg and rtopp are normally numbers: if you specify a vector,
45
+#' only the first element will be used
46
+#' mtop, mtopg and mtopp are mutually exclusive, and so are rtop, rtopg and rtopp
47
+#'
48
+#'
49
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
50
+#'
51
+#' @examples
52
+#' 
53
+#' ## it orders the samples according to the Region_count metadata attribute and takes the two samples 
54
+#' ## that have the highest count. 
55
+#'
56
+#' initGMQL("gtf")
57
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
58
+#' r = readDataset(test_path)
59
+#' o = order(r,list(DESC("Region_Count")), mtop = 2)
60
+#'
61
+#' @export
62
+#'
63
+order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0,mtopp = 0,
64
+                  regions_ordering = NULL,rtop = 0,rtopg = 0,rtopp = 0)
65
+{
66
+  if(!is.numeric(mtop) || !is.numeric(mtopg) || !is.numeric(rtop) || !is.numeric(rtopg)
67
+     || !is.numeric(mtopp)|| !is.numeric(rtopp))
68
+    stop("mtop, mtopg, mtopp, rtop, rtopg and rtopp must be integer")
69
+  # warn only when a vector longer than one element was actually supplied
70
+  if(length(mtop)>1 || length(mtopg)>1 || length(rtop)>1 || length(rtopg)>1
71
+     || length(mtopp)>1 || length(rtopp)>1)
72
+    warning("only the first element is taken by rtop, mtop, mtopg, rtopg, rtopp, mtopp")
73
+
74
+  # only the first element of each limit is used, even if a vector is given;
75
+  # the values are coerced to integer before being passed to the Java wrapper
76
+
77
+  mtop <- as.integer(mtop[1])
78
+  mtopg <- as.integer(mtopg[1])
79
+  mtopp <- as.integer(mtopp[1])
80
+
81
+  rtop <- as.integer(rtop[1])
82
+  rtopg <- as.integer(rtopg[1])
83
+  rtopp <- as.integer(rtopp[1])
84
+  # metadata limits are mutually exclusive: mtop wins over mtopg, which wins over mtopp
85
+  if(mtop > 0 && mtopg >0)
86
+  {
87
+    warning("mtop and mtopg cannot be used together.\nWe set mtopg = 0")
88
+    mtopg <- 0L
89
+  }
90
+
91
+  if(mtop >0 && mtopp>0)
92
+  {
93
+    warning("mtop and mtopp cannot be used together.\nWe set mtopp = 0")
94
+    mtopp <- 0L
95
+  }
96
+
97
+  if(mtopg >0 && mtopp>0)
98
+  {
99
+    warning("mtopg and mtopp cannot be used together.\nWe set mtopp = 0")
100
+    mtopp <- 0L
101
+  }
102
+  # region limits follow the same rule: rtop wins over rtopg, which wins over rtopp
103
+  if(rtop > 0 && rtopg >0)
104
+  {
105
+    warning("rtop and rtopg cannot be used together.\nWe set rtopg = 0")
106
+    rtopg <- 0L
107
+  }
108
+
109
+  if(rtop >0 && rtopp>0)
110
+  {
111
+    warning("rtop and rtopp cannot be used together.\nWe set rtopp = 0")
112
+    rtopp <- 0L
113
+  }
114
+
115
+  if(rtopg >0 && rtopp>0)
116
+  {
117
+    warning("rtopg and rtopp cannot be used together.\nWe set rtopp = 0")
118
+    rtopp <- 0L
119
+  }
120
+  # a missing ordering is passed to Java as a null String reference
121
+  if(!is.null(metadata_ordering))
122
+    meta_matrix <- .jarray(.ordering_meta(metadata_ordering),dispatch = TRUE)
123
+  else
124
+    meta_matrix <- .jnull("java/lang/String")
125
+
126
+  if(!is.null(regions_ordering))
127
+    region_matrix <- .jarray(.ordering_meta(regions_ordering),dispatch = TRUE)
128
+  else
129
+    region_matrix <- .jnull("java/lang/String")
130
+  
131
+  WrappeR <- J("it/polimi/genomics/r/Wrapper")
132
+  response <- WrappeR$order(meta_matrix,mtopg,mtop,mtopp,region_matrix,rtopg,rtop,rtopp,input_data$value)
133
+  error <- strtoi(response[1])
134
+  data <- response[2]
135
+  if(error!=0)
136
+    stop(data)
137
+  else
138
+    DAGgraph(data)
139
+}
140
+
141
+
142
+.ordering_meta <- function(ordering)
143
+{
144
+  if(is.list(ordering))
145
+  {
146
+    order_matrix <- t(sapply(ordering,function(x){
147
+      new_value <- as.character(x)
148
+      if(length(new_value)==1)
149
+        new_value = c("ASC",new_value)
150
+      else if(!identical("ASC",new_value[1]) && !identical("DESC",new_value[1]))
151
+        stop("no more than one value")
152
+      matrix <- matrix(new_value)
153
+    }))
154
+  }
155
+  else if(is.character(ordering))
156
+  {
157
+    order_matrix <- t(sapply(ordering, function(x) {
158
+      new_value = c("ASC",x)
159
+      matrix <- matrix(new_value)
160
+    }))
161
+  }
162
+  else
163
+    stop("only list or character")
164
+}
165
+
166
+
0 167
new file mode 100644
... ...
@@ -0,0 +1,132 @@
1
+#' GMQL Operation: PROJECT
2
+#'
3
+#' It creates, from an existing dataset, a new dataset with all the samples from input dataset
4
+#' but keeping for each sample in the input dataset only those metadata and/or region attributes
5
+#' expressed in the operator parameter list.
6
+#' Region coordinates and values of the remaining metadata remain equal to those in the input dataset.
7
+#' It allows to:
8
+#' \itemize{
9
+#' \item{Remove existing metadata and/or region attributes from a dataset}
10
+#' \item{Create new metadata and/or region attributes in the result}
11
+#' }
12
+#'
13
+#' @importFrom rJava J
14
+#' @importFrom rJava .jnull
15
+#' @importFrom rJava .jarray
16
+#' 
17
+#' @param input_data string pointer taken from GMQL function
18
+#' @param metadata vector of string made up by metadata attribute
19
+#' @param regions vector of string made up by schema field attribute
20
+#' @param all_but_reg logical value indicating which schema field attribute you want to exclude.
21
+#' If FALSE only the regions you choose is kept in the output of the project operation,
22
+#' if TRUE all the schema regions are kept except the ones included in the regions parameter.
23
+#' if regions is not defined \emph{all_but_reg} is not considered.
24
+#' @param all_but_meta logical value indicating which metadata you want to exclude.
25
+#' If FALSE only the metadata you choose is kept in the output of the project operation,
26
+#' if TRUE all the metadata are kept except the ones included in the metadata parameter.
27
+#' if metadata is not defined \emph{all_but_meta} is not considered.
28
+#' @param regions_update single string predicate made up by operation on schema field attribute
29
+#' @param metadata_update single string predicate made up by operation on metadata attribute
30
+#'
31
+#' @return DAGgraph class object. It contains the value associated to the graph used 
32
+#' as input for the subsequent GMQL function
33
+#'
34
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
35
+#'
36
+#'
37
+#' @examples
38
+#' 
39
+#' ## it creates a new dataset called CTCF_NORM_SCORE by preserving all region attributes apart from score,
40
+#' ## and creating a new region attribute called new_score by dividing the existing score value 
41
+#' ## of each region by 1000.0 and incrementing it by 100.
42
+#' ## It also generates, for each sample of the new dataset, 
43
+#' ## a new metadata attribute called normalized with value 1, which can be used in future selections.
44
+#' 
45
+#' initGMQL("gtf")
46
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
47
+#' input = readDataset(test_path)
48
+#' CTCF_NORM_SCORE = project(input,metadata_update="normalized AS 1", regions_update="new_score AS (score / 1000.0) + 100" , regions=c("score"), all_but_reg=TRUE)
49
+#' 
50
+#' 
51
+#' \dontrun{
52
+#' 
53
+#' ## it produces an output dataset that contains the same samples as the input dataset. 
54
+#' ## Each output sample only contains, as region attributes, 
55
+#' ## the four basic coordinates (chr, left, right, strand) and the specified region attributes 
56
+#' ## 'variant_classification' and 'variant_type', and as metadata attributes only the specified ones, 
57
+#' ## i.e. manually_curated__tissue_status and manually_curated__tumor_tag.
58
+#' 
59
+#' initGMQL("gtf")
60
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
61
+#' DS_in = readDataset(test_path)
62
+#' DS_out = project(DS_in,regions=c("variant_classification", "variant_type"), 
63
+#' metadata=c("manually_curated__tissue_status","manually_curated__tumor_tag"))
64
+#' 
65
+#' }
66
+#' 
67
+#' @export
68
+#'
69
+#'
70
+project <-function(input_data, metadata = NULL,metadata_update=NULL,all_but_meta = FALSE,
71
+                   regions = NULL, regions_update = NULL,all_but_reg=FALSE)
72
+{
73
+  if(!is.null(metadata))
74
+  {
75
+    if(!is.character(metadata))
76
+      stop("metadata: no valid input")
77
+    # drop empty strings and duplicated names
78
+    metadata <- metadata[!metadata %in% ""]
79
+    metadata <- metadata[!duplicated(metadata)]
80
+    # nothing left after filtering: pass a null reference instead of an array
81
+    if(length(metadata)==0)
82
+      metadata <- .jnull("java/lang/String")
83
+    else
84
+      metadata <- .jarray(metadata)
85
+  }
86
+  else
87
+    metadata <- .jnull("java/lang/String")
88
+
89
+  if(!is.null(regions))
90
+  {
91
+    if(!is.character(regions))
92
+      stop("regions: no valid input")
93
+    # drop empty strings and duplicated names
94
+    regions <- regions[!regions %in% ""]
95
+    regions <- regions[!duplicated(regions)]
96
+    # nothing left after filtering: pass a null reference instead of an array
97
+    if(length(regions)==0)
98
+      regions <- .jnull("java/lang/String")
99
+    else
100
+      regions <- .jarray(regions)
101
+  }
102
+  else
103
+    regions <- .jnull("java/lang/String")
104
+  # update predicates are checked when given, otherwise replaced by a null reference
105
+  if(!is.null(regions_update))
106
+    .check_predicate(regions_update)
107
+  else
108
+    regions_update <- .jnull("java/lang/String")
109
+  
110
+  if(!is.null(metadata_update))
111
+    .check_predicate(metadata_update)
112
+  else
113
+    metadata_update <- .jnull("java/lang/String")
114
+  
115
+  if(length(all_but_meta)>1)
116
+    warning("all_but_meta: no multiple values")
117
+  
118
+  if(length(all_but_reg)>1)
119
+    warning("all_but_reg: no multiple values")
120
+  all_but_reg <- all_but_reg[1]
121
+  all_but_meta <- all_but_meta[1]
122
+  
123
+  WrappeR <- J("it/polimi/genomics/r/Wrapper")
124
+  response <- WrappeR$project(metadata,metadata_update,all_but_meta,
125
+                              regions,regions_update,all_but_reg,input_data$value)
126
+  error <- strtoi(response[1])
127
+  data <- response[2]
128
+  if(error!=0)
129
+    stop(data)
130
+  else
131
+    DAGgraph(data)
132
+}
... ...
@@ -120,7 +120,7 @@ readDataset <- function(dataset, parser = "CustomParser", is_local=TRUE,
120 120
   }
121 121
   else
122 122
   {
123
-    url <- Wrapper$get_url()
123
+    url <- WrappeR$get_url()
124 124
     if(is.null(url))
125 125
       stop("You have to log on using login function")
126 126
     
127 127
new file mode 100644
... ...
@@ -0,0 +1,120 @@
1
+#' GMQL Operation: SELECT
2
+#'
3
+#' It extracts a subset of samples from the input dataset.
4
+#' It returns all the samples satisfying the predicate on metadata.
5
+#' If regions are specified, returns regions satisfying the predicate on regions.
6
+#' If semijoin clauses are specified they are applied, too.
7
+#' When semijoin is defined, it extracts those samples containing all metadata attribute defined in semijoin clause
8
+#' with at least one metadata value in common with semi join dataset