... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: RGMQL |
2 | 2 |
Type: Package |
3 | 3 |
Title: GenoMetric Query Language for R/Bioconductor |
4 |
-Version: 0.99.2 |
|
4 |
+Version: 0.99.3 |
|
5 | 5 |
Author: Simone Pallotta, Marco Masseroli |
6 | 6 |
Maintainer: Simone Pallotta <simonepallotta@hotmail.com> |
7 | 7 |
Description: This package brings GMQL functionalities into the R environment. |
... | ... |
@@ -1,30 +1,76 @@ |
1 | 1 |
# Generated by roxygen2: do not edit by hand |
2 | 2 |
|
3 |
+export(ASC) |
|
4 |
+export(AVG) |
|
5 |
+export(BAG) |
|
6 |
+export(COUNT) |
|
7 |
+export(DEF) |
|
8 |
+export(DESC) |
|
9 |
+export(DG) |
|
10 |
+export(DGE) |
|
11 |
+export(DL) |
|
12 |
+export(DLE) |
|
13 |
+export(DOWN) |
|
14 |
+export(EXACT) |
|
15 |
+export(FULL) |
|
16 |
+export(MAX) |
|
17 |
+export(MD) |
|
18 |
+export(MEDIAN) |
|
19 |
+export(MIN) |
|
20 |
+export(Q1) |
|
21 |
+export(Q2) |
|
22 |
+export(Q3) |
|
23 |
+export(STD) |
|
24 |
+export(SUM) |
|
3 | 25 |
export(TFARMatrix) |
4 | 26 |
export(TFARMemAtrix) |
27 |
+export(UP) |
|
28 |
+export(compileQuery) |
|
29 |
+export(compileQuery.fromfile) |
|
30 |
+export(cover) |
|
5 | 31 |
export(deleteDataset) |
32 |
+export(difference) |
|
6 | 33 |
export(downloadDataset) |
7 | 34 |
export(downloadDatasetToGrangesList) |
35 |
+export(execute) |
|
8 | 36 |
export(exportGMQL.gdm) |
9 | 37 |
export(exportGMQL.gtf) |
10 | 38 |
export(extend) |
39 |
+export(flat) |
|
40 |
+export(histogram) |
|
11 | 41 |
export(importGMQL.gdm) |
12 | 42 |
export(importGMQL.gtf) |
13 | 43 |
export(initGMQL) |
44 |
+export(join) |
|
14 | 45 |
export(login.GMQL) |
15 | 46 |
export(logout.GMQL) |
47 |
+export(map) |
|
48 |
+export(materialize) |
|
49 |
+export(merge) |
|
16 | 50 |
export(metadataFromSample) |
51 |
+export(order) |
|
52 |
+export(project) |
|
17 | 53 |
export(read) |
18 | 54 |
export(readDataset) |
19 | 55 |
export(regionFromSample) |
20 | 56 |
export(register.GMQL) |
21 | 57 |
export(remote_processing) |
58 |
+export(runQuery) |
|
59 |
+export(runQuery.fromfile) |
|
22 | 60 |
export(saveQuery) |
23 | 61 |
export(saveQuery.fromfile) |
62 |
+export(select) |
|
24 | 63 |
export(showDatasets) |
64 |
+export(showJobLog) |
|
65 |
+export(showJobs) |
|
25 | 66 |
export(showQueries) |
26 | 67 |
export(showSamplesFromDataset) |
27 | 68 |
export(showSchemaFromDataset) |
69 |
+export(stopJob) |
|
70 |
+export(summit) |
|
71 |
+export(take) |
|
72 |
+export(traceJob) |
|
73 |
+export(union) |
|
28 | 74 |
export(uploadSamples) |
29 | 75 |
import(GenomicRanges) |
30 | 76 |
import(httr) |
... | ... |
@@ -36,12 +82,15 @@ importFrom(data.table,fread) |
36 | 82 |
importFrom(dplyr,bind_cols) |
37 | 83 |
importFrom(methods,is) |
38 | 84 |
importFrom(plyr,revalue) |
85 |
+importFrom(rJava,.jarray) |
|
86 |
+importFrom(rJava,.jevalArray) |
|
39 | 87 |
importFrom(rJava,.jinit) |
40 | 88 |
importFrom(rJava,.jnull) |
41 | 89 |
importFrom(rJava,.jpackage) |
42 | 90 |
importFrom(rJava,J) |
43 | 91 |
importFrom(rtracklayer,export) |
44 | 92 |
importFrom(rtracklayer,import) |
93 |
+importFrom(stats,setNames) |
|
45 | 94 |
importFrom(utils,read.delim) |
46 | 95 |
importFrom(utils,unzip) |
47 | 96 |
importFrom(utils,write.table) |
48 | 97 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,360 @@ |
1 |
+#' GMQL Operation: COVER |
|
2 |
+#' |
|
3 |
+#' it takes as input a dataset and returns another dataset (with a single sample, if no \emph{groupby} option is specified) |
|
4 |
+#' by “collapsing” the input dataset samples and their regions according to certain rules specified by the input parameters. |
|
5 |
+#' The attributes of the output genomic regions are only the region coordinates, and Jaccard indexes (JaccardIntersect and JaccardResult). |
|
6 |
+#' Jaccard Indexes are standard measures of similarity of the contributing regions, added as default region attributes. |
|
7 |
+#' The JaccardIntersect index is calculated as the ratio between the lengths of the intersection |
|
8 |
+#' and of the union of the contributing regions; the JaccardResult index is calculated as the ratio |
|
9 |
+#' between the lengths of region and the union of the contributing regions. |
|
10 |
+#' If aggregate functions are specified, new attributes are added. |
|
11 |
+#' Output metadata are the union of the input ones. |
|
12 |
+#' If \emph{groupby} clause is specified, the input samples are partitioned in groups, |
|
13 |
+#' each with distinct values of the grouping metadata attributes, and the COVER operation is separately |
|
14 |
+#' applied to each group, yielding one sample in the result for each group. |
|
15 |
+#' Input samples that do not satisfy the \emph{groupby} condition are disregarded. |
|
16 |
+#' |
|
17 |
+#' @importFrom methods is |
|
18 |
+#' @importFrom rJava J |
|
19 |
+#' @importFrom rJava .jnull |
|
20 |
+#' @importFrom rJava .jarray |
|
21 |
+#' |
|
22 |
+#' @param input_data returned object from any GMQL function |
|
23 |
+#' @param minAcc minimum number of overlapping regions to be considered during execution. |
|
24 |
+#' Is a single integer number, declared also as string. |
|
25 |
+#' minAcc accept ALL and string like (ALL+N)/K as special keyword |
|
26 |
+#' ALL sets the minimum to the number of samples in the input dataset |
|
27 |
+#' @param maxAcc maximum number of overlapping regions to be considered during execution. |
|
28 |
+#' Is a single integer number, declared also as string. |
|
29 |
+#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword |
|
30 |
+#' ALL sets the maximum to the number of samples in the input dataset |
|
31 |
+#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping |
|
32 |
+#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
33 |
+#' (i.e c("cell_type","attribute_tag","size")). |
|
34 |
+#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
35 |
+#' For details of CONDITION objects see: |
|
36 |
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
37 |
+#' |
|
38 |
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
39 |
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF |
|
40 |
+#' |
|
41 |
+#' @param aggregates list of elements in the form \emph{key} = \emph{function_aggregate}. |
|
42 |
+#' The \emph{function_aggregate} is an object of class OPERATOR |
|
43 |
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
44 |
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
45 |
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, |
|
46 |
+#' \code{\link{Q3}}. |
|
47 |
+#' Every operator accepts a string value, except for COUNT, which cannot have a value. |
|
48 |
+#' Argument of 'function_aggregate' must exist in schema |
|
49 |
+#' Two styles are allowed: |
|
50 |
+#' \itemize{ |
|
51 |
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
52 |
+#' \item list of values: e.g. SUM("pvalue") |
|
53 |
+#' } |
|
54 |
+#' "mixed style" is not allowed |
|
55 |
+#' |
|
56 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
57 |
+#' as input for the subsequent GMQL function |
|
58 |
+#' |
|
59 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
60 |
+#' |
|
61 |
+#' @seealso \code{\link{summit}} \code{\link{flat}} \code{\link{histogram}} |
|
62 |
+#' |
|
63 |
+#' @examples |
|
64 |
+#' |
|
65 |
+#' ## This GMQL statement produces an output dataset with a single output sample. |
|
66 |
+#' ## The COVER operation considers all areas defined by a minimum of two overlapping regions |
|
67 |
+#' ## in the input samples, up to any amount of overlapping regions. |
|
68 |
+#' |
|
69 |
+#' initGMQL("gtf") |
|
70 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
71 |
+#' exp = readDataset(test_path) |
|
72 |
+#' res = cover(input_data = exp,2,"ANY") |
|
73 |
+#' |
|
74 |
+#' \dontrun{ |
|
75 |
+#' ## This GMQL statement computes the result grouping the input exp samples by the values of |
|
76 |
+#' ## their cell metadata attribute, |
|
77 |
+#' ## thus one output res sample is generated for each cell type; |
|
78 |
+#' ## output regions are produced where at least 2 and at most 3 regions of grouped exp samples |
|
79 |
+#' ## overlap, setting as attributes of the resulting regions the minimum pvalue of the overlapping regions |
|
80 |
+#' ## (min_pvalue) and their Jaccard indexes (JaccardIntersect and JaccardResult). |
|
81 |
+#' |
|
82 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
83 |
+#' exp = read(test_path) |
|
84 |
+#' res = cover(input_data = exp,2,3, c("cell"), list(min_pValue = MIN("pvalue"))) |
|
85 |
+#' } |
|
86 |
+#' @export |
|
87 |
+#' |
|
88 |
+cover <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) |
|
89 |
+{ |
|
90 |
+ .doVariant("COVER",minAcc,maxAcc,groupBy,aggregates,input_data) |
|
91 |
+} |
|
92 |
+ |
|
93 |
+#' GMQL Operation: HISTOGRAM |
|
94 |
+#' |
|
95 |
+#' returns the non-overlapping regions contributing to the cover, |
|
96 |
+#' each with its accumulation index value, which is assigned to the AccIndex region attribute. |
|
97 |
+#' |
|
98 |
+#' @importFrom methods is |
|
99 |
+#' @importFrom rJava J |
|
100 |
+#' @importFrom rJava .jnull |
|
101 |
+#' @importFrom rJava .jarray |
|
102 |
+#' |
|
103 |
+#' @param input_data returned object from any GMQL function |
|
104 |
+#' @param minAcc minimum number of overlapping regions to be considered during execution |
|
105 |
+#' normally is a single integer number, declared also as string. |
|
106 |
+#' minAcc accept ALL and string like (ALL+N)/K as special keyword |
|
107 |
+#' ALL sets the minimum to the number of samples in the input dataset |
|
108 |
+#' @param maxAcc maximum number of overlapping regions to be considered during execution |
|
109 |
+#' normally is a single integer number, declared also as string. |
|
110 |
+#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword |
|
111 |
+#' ALL sets the maximum to the number of samples in the input dataset |
|
112 |
+#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping |
|
113 |
+#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
114 |
+#' (i.e c("cell_type","attribute_tag","size")). |
|
115 |
+#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
116 |
+#' For details of CONDITION objects see: |
|
117 |
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
118 |
+#' |
|
119 |
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
120 |
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF |
|
121 |
+#' |
|
122 |
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
123 |
+#' The \emph{function_aggregate} is an object of class OPERATOR |
|
124 |
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
125 |
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
126 |
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, |
|
127 |
+#' \code{\link{Q3}}. |
|
128 |
+#' Every operator accepts a string value, except for COUNT, which cannot have a value. |
|
129 |
+#' Argument of 'function_aggregate' must exist in schema |
|
130 |
+#' Two style are allowed: |
|
131 |
+#' \itemize{ |
|
132 |
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
133 |
+#' \item list of values: e.g. SUM("pvalue") |
|
134 |
+#' } |
|
135 |
+#' "mixed style" is not allowed |
|
136 |
+#' |
|
137 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
138 |
+#' as input for the subsequent GMQL function |
|
139 |
+#' |
|
140 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
141 |
+#' @seealso \code{\link{flat}} \code{\link{cover}} \code{\link{summit}} |
|
142 |
+#' |
|
143 |
+#' @examples |
|
144 |
+#' |
|
145 |
+#' ## This GMQL statement computes the result grouping the input \emph{exp} samples |
|
146 |
+#' ## by the values of their \emph{cell} metadata attribute, |
|
147 |
+#' ## thus one output \emph{res} sample is generated for each cell type. |
|
148 |
+#' ## Output regions are produced by dividing results from COVER in contiguous subregions |
|
149 |
+#' ## according to the varying accumulation values (from 2 to 4 in this case): |
|
150 |
+#' ## one region for each accumulation value; |
|
151 |
+#' |
|
152 |
+#' initGMQL("gtf") |
|
153 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
154 |
+#' exp = readDataset(test_path) |
|
155 |
+#' res = histogram(exp, 2,4,groupBy = c("cell")) |
|
156 |
+#' |
|
157 |
+#' @export |
|
158 |
+#' |
|
159 |
+histogram <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) |
|
160 |
+{ |
|
161 |
+ .doVariant("HISTOGRAM",minAcc,maxAcc,groupBy,aggregates,input_data) |
|
162 |
+} |
|
163 |
+ |
|
164 |
+#' GMQL Operation: SUMMIT |
|
165 |
+#' |
|
166 |
+#' returns regions that start from a position |
|
167 |
+#' where the number of intersecting regions is not increasing afterwards and stops |
|
168 |
+#' at a position where either the number of intersecting regions decreases, |
|
169 |
+#' or it violates the max accumulation index. |
|
170 |
+#' |
|
171 |
+#' @importFrom methods is |
|
172 |
+#' @importFrom rJava J |
|
173 |
+#' @importFrom rJava .jnull |
|
174 |
+#' @importFrom rJava .jarray |
|
175 |
+#' |
|
176 |
+#' @param input_data returned object from any GMQL function |
|
177 |
+#' @param minAcc minimum number of overlapping regions to be considered during execution |
|
178 |
+#' normally is a single integer number, declared also as string. |
|
179 |
+#' minAcc accept ALL and string like (ALL+N)/K as special keyword |
|
180 |
+#' ALL sets the minimum to the number of samples in the input dataset |
|
181 |
+#' @param maxAcc maximum number of overlapping regions to be considered during execution |
|
182 |
+#' normally is a single integer number, declared also as string. |
|
183 |
+#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword |
|
184 |
+#' ALL sets the maximum to the number of samples in the input dataset |
|
185 |
+#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping |
|
186 |
+#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
187 |
+#' (i.e c("cell_type","attribute_tag","size")). |
|
188 |
+#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
189 |
+#' For details of CONDITION objects see: |
|
190 |
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
191 |
+#' |
|
192 |
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
193 |
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF |
|
194 |
+#' |
|
195 |
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
196 |
+#' The \emph{function_aggregate} is an object of class OPERATOR |
|
197 |
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
198 |
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
199 |
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, |
|
200 |
+#' \code{\link{Q3}}. |
|
201 |
+#' Every operator accepts a string value, except for COUNT, which cannot have a value. |
|
202 |
+#' Argument of 'function_aggregate' must exist in schema |
|
203 |
+#' Two style are allowed: |
|
204 |
+#' \itemize{ |
|
205 |
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
206 |
+#' \item list of values: e.g. SUM("pvalue") |
|
207 |
+#' } |
|
208 |
+#' "mixed style" is not allowed |
|
209 |
+#' |
|
210 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
211 |
+#' as input for the subsequent GMQL function |
|
212 |
+#' |
|
213 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
214 |
+#' @seealso \code{\link{flat}} \code{\link{cover}} \code{\link{histogram}} |
|
215 |
+#' |
|
216 |
+#' @examples |
|
217 |
+#' |
|
218 |
+#' ## This GMQL statement computes the result grouping the input \emph{exp} samples by the values |
|
219 |
+#' ## of their \emph{cell} metadata attribute, thus one output \emph{res} sample is generated |
|
220 |
+#' ## for each cell type. |
|
221 |
+#' ## Output regions are produced by extracting the highest accumulation overlapping |
|
222 |
+#' ## (sub)regions according to the methodologies described above; |
|
223 |
+#' |
|
224 |
+#' |
|
225 |
+#' initGMQL("gtf") |
|
226 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
227 |
+#' exp = readDataset(test_path) |
|
228 |
+#' res = summit(input_data = exp,2,4, c("cell")) |
|
229 |
+#' |
|
230 |
+#' @export |
|
231 |
+#' |
|
232 |
+summit <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) |
|
233 |
+{ |
|
234 |
+ .doVariant("SUMMIT",minAcc,maxAcc,groupBy,aggregates,input_data) |
|
235 |
+} |
|
236 |
+ |
|
237 |
+#' GMQL Operation: FLAT |
|
238 |
+#' |
|
239 |
+#' returns the contiguous region that starts from the first end and stops at |
|
240 |
+#' the last end of the regions which would contribute to each region of the COVER |
|
241 |
+#' |
|
242 |
+#' @importFrom methods is |
|
243 |
+#' @importFrom rJava J |
|
244 |
+#' @importFrom rJava .jnull |
|
245 |
+#' @importFrom rJava .jarray |
|
246 |
+#' |
|
247 |
+#' @param input_data returned object from any GMQL function |
|
248 |
+#' @param minAcc minimum number of overlapping regions to be considered during execution |
|
249 |
+#' normally is a single integer number, declared also as string. |
|
250 |
+#' minAcc accept ALL and string like (ALL+N)/K as special keyword |
|
251 |
+#' ALL sets the minimum to the number of samples in the input dataset |
|
252 |
+#' @param maxAcc maximum number of overlapping regions to be considered during execution |
|
253 |
+#' normally is a single integer number, declared also as string. |
|
254 |
+#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword |
|
255 |
+#' ALL sets the maximum to the number of samples in the input dataset |
|
256 |
+#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping |
|
257 |
+#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
258 |
+#' (i.e c("cell_type","attribute_tag","size")). |
|
259 |
+#' Every object contains the name of metadata to be used in \emph{groupBy}. |
|
260 |
+#' For details of CONDITION objects see: |
|
261 |
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
262 |
+#' |
|
263 |
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
264 |
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF |
|
265 |
+#' |
|
266 |
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
267 |
+#' The \emph{function_aggregate} is an object of class OPERATOR |
|
268 |
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
269 |
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
270 |
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, |
|
271 |
+#' \code{\link{Q3}}. |
|
272 |
+#' Every operator accepts a string value, except for COUNT, which cannot have a value. |
|
273 |
+#' Argument of 'function_aggregate' must exist in schema |
|
274 |
+#' Two style are allowed: |
|
275 |
+#' \itemize{ |
|
276 |
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
277 |
+#' \item list of values: e.g. SUM("pvalue") |
|
278 |
+#' } |
|
279 |
+#' "mixed style" is not allowed |
|
280 |
+#' |
|
281 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
282 |
+#' as input for the subsequent GMQL function |
|
283 |
+#' |
|
284 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
285 |
+#' @seealso \code{\link{summit}} \code{\link{cover}} \code{\link{histogram}} |
|
286 |
+#' |
|
287 |
+#' @examples |
|
288 |
+#' |
|
289 |
+#' ## This GMQL statement computes the result grouping the input \emph{exp} samples by |
|
290 |
+#' ## the values of their \emph{cell} metadata attribute, thus one output \emph{res} sample |
|
291 |
+#' ## is generated for each cell type. |
|
292 |
+#' ## Output regions are produced by concatenating all regions which would have been used |
|
293 |
+#' ## to construct a COVER(2,4) statement on the same dataset; |
|
294 |
+#' |
|
295 |
+#' initGMQL("gtf") |
|
296 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
297 |
+#' exp = readDataset(test_path) |
|
298 |
+#' res = flat(input_data = exp,2,4, c("cell")) |
|
299 |
+#' |
|
300 |
+#' @export |
|
301 |
+#' |
|
302 |
+flat <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) |
|
303 |
+{ |
|
304 |
+ .doVariant("FLAT",minAcc,maxAcc,groupBy,aggregates,input_data) |
|
305 |
+} |
|
306 |
+ |
|
307 |
+.doVariant <- function(flag,minAcc,maxAcc,groupBy,aggregates,input_data) |
|
308 |
+{ |
|
309 |
+ min <- .check_cover_param(minAcc,TRUE) |
|
310 |
+ max <- .check_cover_param(maxAcc,FALSE) |
|
311 |
+ |
|
312 |
+ if(!is.null(groupBy)) |
|
313 |
+ join_condition_matrix <- .jarray(.join_condition(groupBy),dispatch = TRUE) |
|
314 |
+ else |
|
315 |
+ join_condition_matrix <- .jnull("java/lang/String") |
|
316 |
+ |
|
317 |
+ if(!is.null(aggregates)) |
|
318 |
+ metadata_matrix <- .jarray(.aggregates(aggregates,"OPERATOR"),dispatch = TRUE) |
|
319 |
+ else |
|
320 |
+ metadata_matrix <- .jnull("java/lang/String") |
|
321 |
+ |
|
322 |
+ WrappeR <- J("it/polimi/genomics/r/Wrapper") |
|
323 |
+ response <- switch(flag, |
|
324 |
+ "COVER" = WrappeR$cover(min,max,join_condition_matrix,metadata_matrix,input_data$value), |
|
325 |
+ "FLAT" = WrappeR$flat(min,max,join_condition_matrix,metadata_matrix,input_data$value), |
|
326 |
+ "SUMMIT" = WrappeR$summit(min,max,join_condition_matrix,metadata_matrix,input_data$value), |
|
327 |
+ "HISTOGRAM" = WrappeR$histogram(min,max,join_condition_matrix,metadata_matrix,input_data$value)) |
|
328 |
+ |
|
329 |
+ error <- strtoi(response[1]) |
|
330 |
+ data <- response[2] |
|
331 |
+ |
|
332 |
+ if(error!=0) |
|
333 |
+ stop(data) |
|
334 |
+ else |
|
335 |
+ DAGgraph(data) |
|
336 |
+} |
|
337 |
+ |
|
338 |
+.check_cover_param <- function(param,is_min) |
|
339 |
+{ |
|
340 |
+ if(length(param)>1) |
|
341 |
+ stop("length > 1") |
|
342 |
+ |
|
343 |
+ if(is.numeric(param)) |
|
344 |
+ { |
|
345 |
+ if(param<=0) |
|
346 |
+ stop("No negative value") |
|
347 |
+ else |
|
348 |
+ return(as.character(param)) |
|
349 |
+ } |
|
350 |
+ else if(is.character(param)) |
|
351 |
+ { |
|
352 |
+ if(is_min && identical(param,"ANY")) |
|
353 |
+ stop("min cannot assume ANY as value") |
|
354 |
+ return(param) |
|
355 |
+ } |
|
356 |
+ else |
|
357 |
+ stop("invalid input data") |
|
358 |
+} |
|
359 |
+ |
|
360 |
+ |
0 | 361 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,83 @@ |
1 |
+#' GMQL Operation: DIFFERENCE |
|
2 |
+#' |
|
3 |
+#' It produces one sample in the result for each sample of the left operand, |
|
4 |
+#' by keeping the same metadata of the left input sample and only those regions |
|
5 |
+#' (with their schema and values) of the left input sample which do not intersect with any region |
|
6 |
+#' in the right operand sample. |
|
7 |
+#' The optional \emph{joinby} clause is used to extract a subset of couples |
|
8 |
+#' from the cartesian product of two dataset \emph{left_input_data} x \emph{right_input_data} |
|
9 |
+#' on which to apply the DIFFERENCE operator: |
|
10 |
+#' only those samples that have the same value for each attribute |
|
11 |
+#' are considered when performing the difference. |
|
12 |
+#' |
|
13 |
+#' @importFrom rJava J |
|
14 |
+#' @importFrom rJava .jnull |
|
15 |
+#' @importFrom rJava .jarray |
|
16 |
+#' |
|
17 |
+#' @param right_input_data returned object from any GMQL function |
|
18 |
+#' @param left_input_data returned object from any GMQL function |
|
19 |
+#' @param joinBy list of CONDITION objects, or simple string concatenation |
|
20 |
+#' (i.e c("cell_type","attribute_tag","size")). |
|
21 |
+#' Every object contains the name of metadata to be used in \emph{joinby}. |
|
22 |
+#' For details of CONDITION objects see: |
|
23 |
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
24 |
+#' |
|
25 |
+#' Every condition accepts only one string value (e.g. DEF("cell_type") ) |
|
26 |
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF |
|
27 |
+#' |
|
28 |
+#' @param is_exact single logical value: TRUE means that the region difference is executed only |
|
29 |
+#' on regions in left_input_data with exactly the same coordinates of at least one region present |
|
30 |
+#' in right_input_data; if is_exact = FALSE, the difference is executed on all regions in |
|
31 |
+#' left_input_data that overlap with at least one region in right_input_data (even just one base). |
|
32 |
+#' |
|
33 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
34 |
+#' as input for the subsequent GMQL function |
|
35 |
+#' |
|
36 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
37 |
+#' |
|
38 |
+#' @examples |
|
39 |
+#' |
|
40 |
+#' ## This GMQL statement returns all the regions in the first dataset that do not |
|
41 |
+#' ## overlap any region in the second dataset. |
|
42 |
+#' |
|
43 |
+#' initGMQL("gtf") |
|
44 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
45 |
+#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "RGMQL") |
|
46 |
+#' r_left = readDataset(test_path) |
|
47 |
+#' r_right = readDataset(test_path2) |
|
48 |
+#' out = difference(r_left,r_right) |
|
49 |
+#' |
|
50 |
+#' \dontrun{ |
|
51 |
+#' ## This GMQL statement extracts for every pair of samples s1 in EXP1 and s2 in EXP2 |
|
52 |
+#' ## having the same value of the metadata attribute 'antibody_target' |
|
53 |
+#' ## the regions that appear in s1 but do not overlap any region in s2; |
|
54 |
+#' ## metadata of the result are the same as the metadata of s1. |
|
55 |
+#' |
|
56 |
+#' initGMQL("gtf") |
|
57 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
58 |
+#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "RGMQL") |
|
59 |
+#' exp1 = readDataset(test_path) |
|
60 |
+#' exp2 = readDataset(test_path2) |
|
61 |
+#' out = difference(exp1,exp2, c("antibody_target")) |
|
62 |
+#' |
|
63 |
+#' } |
|
64 |
+#' |
|
65 |
+#' @export |
|
66 |
+#' |
|
67 |
+difference <- function(left_input_data, right_input_data, joinBy = NULL,is_exact = FALSE) |
|
68 |
+{ |
|
69 |
+ if(!is.null(joinBy)) |
|
70 |
+ join_condition_matrix <- .jarray(.join_condition(joinBy),dispatch = TRUE) |
|
71 |
+ else |
|
72 |
+ join_condition_matrix <- .jnull("java/lang/String") |
|
73 |
+ |
|
74 |
+ WrappeR <- J("it/polimi/genomics/r/Wrapper") |
|
75 |
+ response <- WrappeR$difference(join_condition_matrix,right_input_data$value,left_input_data$value,is_exact) |
|
76 |
+ error <- strtoi(response[1]) |
|
77 |
+ data <- response[2] |
|
78 |
+ if(error!=0) |
|
79 |
+ stop(data) |
|
80 |
+ else |
|
81 |
+ DAGgraph(data) |
|
82 |
+} |
|
83 |
+ |
... | ... |
@@ -6,6 +6,7 @@ |
6 | 6 |
#' |
7 | 7 |
#' @importFrom rJava .jnull |
8 | 8 |
#' @importFrom rJava J |
9 |
+#' @importFrom rJava .jarray |
|
9 | 10 |
#' |
10 | 11 |
#' @param input_data returned object from any GMQL function |
11 | 12 |
#' @param metadata list of element in the form \emph{key} = \emph{function_aggregate}. |
... | ... |
@@ -49,7 +50,7 @@ |
49 | 50 |
#' ## 2. MinP is the minimum pvalue of the sample regions. |
50 | 51 |
#' ## res sample regions are the same as the ones in exp. |
51 | 52 |
#' |
52 |
-#' res = extend(input_data = exp, list(RegionCount = COUNT(),MinP = MIN(pvalue))) |
|
53 |
+#' res = extend(input_data = exp, list(RegionCount = COUNT(),MinP = MIN("pvalue"))) |
|
53 | 54 |
#' |
54 | 55 |
#' } |
55 | 56 |
#' |
... | ... |
@@ -58,7 +59,7 @@ |
58 | 59 |
extend <-function(input_data, metadata = NULL) |
59 | 60 |
{ |
60 | 61 |
if(!is.null(metadata)) |
61 |
- metadata_matrix <- .aggregates(metadata,"META_OPERATOR") |
|
62 |
+ metadata_matrix <- .jarray(.aggregates(metadata,"META_OPERATOR"),dispatch = TRUE) |
|
62 | 63 |
else |
63 | 64 |
metadata_matrix <- .jnull("java/lang/String") |
64 | 65 |
|
65 | 66 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,124 @@ |
1 |
+#' GMQL Operation: JOIN |
|
2 |
+#' |
|
3 |
+#' It takes in input two datasets, respectively known as anchor (left) and experiment (right) and returns
|
4 |
+#' a dataset of samples consisting of regions extracted from the operands according to the specified condition |
|
5 |
+#' (a.k.a genometric_predicate). |
|
6 |
+#' The number of generated output samples is the Cartesian product of the number of samples |
|
7 |
+#' in the anchor and in the experiment dataset (if joinBy is not specified). |
|
8 |
+#' The output metadata are the union of the input metadata, with their attribute names prefixed with |
|
9 |
+#' left or right respectively. |
|
10 |
+#' |
|
11 |
+#' @importFrom rJava .jnull |
|
12 |
+#' @importFrom rJava J |
|
13 |
+#' @importFrom rJava .jarray |
|
14 |
+#' |
|
15 |
+#' @param left_input_data returned object from any GMQL function |
|
16 |
+#' @param right_input_data returned object from any GMQL function |
|
17 |
+#' @param genometric_predicate is a list of lists of DISTAL object by means of logical ANDs |
|
18 |
+#' For details of DISTAL objects see: |
|
19 |
+#' \code{\link{DLE}}, \code{\link{DGE}}, \code{\link{MD}}, \code{\link{UP}}, |
|
20 |
+#' \code{\link{DOWN}}, \code{\link{DL}}, \code{\link{DG}} |
|
21 |
+#' |
|
22 |
+#' @param joinBy list of CONDITION objects, or simple string concatenation |
|
23 |
+#' (i.e c("cell_type","attribute_tag","size")). |
|
24 |
+#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
25 |
+#' For details of CONDITION objects see: |
|
26 |
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
27 |
+#' |
|
28 |
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
29 |
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
|
30 |
+#' |
|
31 |
+#' @param region_output single string that declare which region is given in output for each input pair of left dataset |
|
32 |
+#' right dataset regions satisfying the genometric predicate: |
|
33 |
+#' \itemize{ |
|
34 |
+#' \item{left: outputs the anchor regions from left_input_data that satisfy the genometric predicate} |
|
35 |
+#' \item{right: outputs the experiment regions from right_input_data that satisfy the genometric predicate} |
|
36 |
+#' \item{int (intersection): outputs the overlapping part (intersection) of the left_input_data and right_input_data |
|
37 |
+#' regions that satisfy the genometric predicate; if the intersection is empty, no output is produced} |
|
38 |
+#' \item{contig: outputs the concatenation between the left_input_data and right_input_data regions that satisfy |
|
39 |
+#' the genometric predicate, (i.e. the output region is defined as having left (right) coordinates
|
40 |
+#' equal to the minimum (maximum) of the corresponding coordinate values in the left_input_data and right_input_data |
|
41 |
+#' regions satisfying the genometric predicate)} |
|
42 |
+#' } |
|
43 |
+#' |
|
44 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
45 |
+#' as input for the subsequent GMQL function |
|
46 |
+#' |
|
47 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
48 |
+#' |
|
49 |
+#' |
|
50 |
+#' @examples |
|
51 |
+#' |
|
52 |
+#' ## Given a dataset 'hm' and one called 'tss' with a sample including Transcription Start Site annotations, |
|
53 |
+#' ## it searches for those regions of hm that are at a minimal distance from a transcription start site (TSS) |
|
54 |
+#' ## and takes the first/closest one for each TSS, |
|
55 |
+#' ## provided that such distance is lesser than 120K bases and joined 'tss' and 'hm' samples are obtained |
|
56 |
+#' ## from the same provider (joinby clause). |
|
57 |
+#' |
|
58 |
+#' initGMQL("gtf") |
|
59 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
60 |
+#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "RGMQL") |
|
61 |
+#' TSS = readDataset(test_path) |
|
62 |
+#' HM = readDataset(test_path2) |
|
63 |
+#' join_data = join(TSS,HM,genometric_predicate=list(list(MD(1),DLE(120000))),c("provider"),region_output="RIGHT") |
|
64 |
+#' |
|
65 |
+#' @export |
|
66 |
+#' |
|
67 |
join <- function(right_input_data, left_input_data, genometric_predicate = NULL,
                 joinBy = NULL, region_output = "contig") {
  # Adds a JOIN node to the query graph held by the Java/Scala wrapper and
  # returns a DAGgraph built from the wrapper's answer.
  #
  # NOTE(review): the signature names the first argument right_input_data,
  # while the roxygen block documents left_input_data first -- confirm which
  # operand positional callers are expected to pass first.

  if (is.null(genometric_predicate)) {
    genomatrix <- .jnull("java/lang/String")
  } else {
    # The predicate has to be a list whose elements are themselves lists.
    if (!is.list(genometric_predicate) ||
        !all(sapply(genometric_predicate, is.list))) {
      stop("genometric_predicate must be list of lists")
    }

    # Per inner list: warn when it is longer than four entries and require
    # every entry to be a DISTAL object. The surplus entries are not removed
    # here; each row is clipped to eight cells when the matrix is built below.
    check_predicate <- function(list_pred) {
      if (length(list_pred) > 4) {
        warning("only 4 element per list, we cut the rest")
      }
      if (!all(sapply(list_pred, function(elem) is(elem, "DISTAL")))) {
        stop("All elements should be DISTAL object")
      }
    }
    lapply(genometric_predicate, check_predicate)

    # One row per inner list: the character form of every entry, padded with
    # the string "NA" and then cut/extended to exactly eight cells.
    # (presumably each DISTAL yields two strings, i.e. four per row -- TODO
    # confirm against the DISTAL as.character method)
    predicate_row <- function(list_pred) {
      cells <- sapply(list_pred, function(distal) as.character(distal))
      cells <- c(cells, rep("NA", 6))
      length(cells) <- 8
      cells
    }
    genomatrix <- t(sapply(genometric_predicate, predicate_row))
    genomatrix <- .jarray(genomatrix, dispatch = TRUE)
  }

  join_condition_matrix <- if (is.null(joinBy)) {
    .jnull("java/lang/String")
  } else {
    .jarray(.join_condition(joinBy), dispatch = TRUE)
  }

  # region_output is matched case-insensitively against the four known modes.
  region_mode <- toupper(region_output)
  known_modes <- c("CONTIG", "LEFT", "RIGHT", "INT")
  is_known <- vapply(
    known_modes,
    function(mode) identical(region_mode, mode),
    logical(1)
  )
  if (!any(is_known)) {
    stop("region_output must be contig,left,right or int (intersection)")
  }

  wrapper <- J("it/polimi/genomics/r/Wrapper")
  outcome <- wrapper$join(
    genomatrix,
    join_condition_matrix,
    region_mode,
    right_input_data$value,
    left_input_data$value
  )

  # The wrapper answers with a pair: error code (as a string), then either
  # the error text or the value of the new graph node.
  status <- strtoi(outcome[1])
  payload <- outcome[2]
  if (status != 0) {
    stop(payload)
  }
  DAGgraph(payload)
}
0 | 125 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,88 @@ |
1 |
+#' GMQL Operation: MAP |
|
2 |
+#' |
|
3 |
+#' It computes, for each sample in the right dataset, aggregates over the values of the right regions |
|
4 |
+#' that intersect with a region in a left sample, for each region of each sample in the left dataset; |
|
5 |
+#' The number of generated output samples is the Cartesian product of the samples in the two input datasets; |
|
6 |
+#' each output sample has the same regions as the related input left sample, with their attributes and values, |
|
7 |
+#' plus the attributes computed as aggregates over right region values. |
|
8 |
+#' Output sample metadata are the union of the related input sample metadata, |
|
9 |
+#' whose attribute names are prefixed with "left" or "right" respectively. |
|
10 |
+#' |
|
11 |
+#' When the joinby clause is present, only pairs of samples of left_input_data and of right_input_data with |
|
12 |
+#' metadata M1 and M2 respectively that satisfy the joinby condition are considered. |
|
13 |
+#' |
|
14 |
+#' The clause consists of a list of metadata attribute names that must be present with equal values |
|
15 |
+#' in both M1 and M2 |
|
16 |
+#' |
|
17 |
+#' |
|
18 |
+#' @param left_input_data returned object from any GMQL function |
|
19 |
+#' @param right_input_data returned object from any GMQL function |
|
20 |
+#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
21 |
+#' The \emph{function_aggregate} is an object of class OPERATOR |
|
22 |
+#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
23 |
+#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
24 |
+#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, \code{\link{Q3}}. |
|
25 |
+#' Every operator accepts a string value, except for COUNT, which cannot have a value.
|
26 |
+#' Argument of 'function_aggregate' must exist in schema |
|
27 |
+#' Two style are allowed: |
|
28 |
+#' \itemize{ |
|
29 |
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
30 |
+#' \item list of values: e.g. SUM("pvalue") |
|
31 |
+#' } |
|
32 |
+#' "mixed style" is not allowed |
|
33 |
+#' |
|
34 |
+#' @param joinBy list of CONDITION objects, or simple string concatenation |
|
35 |
+#' (i.e c("cell_type","attribute_tag","size")). |
|
36 |
+#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
37 |
+#' For details of CONDITION objects see: |
|
38 |
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
39 |
+#' |
|
40 |
+#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
41 |
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
|
42 |
+#' |
|
43 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
44 |
+#' as input for the subsequent GMQL function |
|
45 |
+#' |
|
46 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
47 |
+#' |
|
48 |
+#' @examples |
|
49 |
+#' |
|
50 |
+#' ## it counts the number of regions in each sample from exp that overlap with a ref region, |
|
51 |
+#' ## and for each ref region it computes the minimum score of all the regions in each exp sample |
|
52 |
+#' ## that overlap with it. |
|
53 |
+#' ## The MAP joinby option ensures that only the exp samples referring to the same 'cell_tissue' |
|
54 |
+#' ## of a ref sample are mapped on such ref sample; |
|
55 |
+#' ## exp samples with no cell_tissue metadata attribute, or with such metadata |
|
56 |
+#' ## but with a different value from the one(s) of ref sample(s), are disregarded. |
|
57 |
+#' |
|
58 |
+#' initGMQL("gtf") |
|
59 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
60 |
+#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "RGMQL") |
|
61 |
+#' exp = readDataset(test_path) |
|
62 |
+#' ref = readDataset(test_path2) |
|
63 |
+#' out = map(ref,exp, list(minScore = MIN("score")), joinBy = c("cell_tissue") ) |
|
64 |
+#' |
|
65 |
+#' |
|
66 |
+#' @export |
|
67 |
+#' |
|
68 |
map <- function(left_input_data, right_input_data, aggregates = NULL,
                joinBy = NULL) {
  # Adds a MAP node to the query graph held by the Java/Scala wrapper and
  # returns a DAGgraph built from the wrapper's answer.

  # Both optional arguments become either a Java array or a Java null.
  metadata_matrix <- if (is.null(aggregates)) {
    .jnull("java/lang/String")
  } else {
    .jarray(.aggregates(aggregates, "OPERATOR"), dispatch = TRUE)
  }

  join_condition_matrix <- if (is.null(joinBy)) {
    .jnull("java/lang/String")
  } else {
    .jarray(.join_condition(joinBy), dispatch = TRUE)
  }

  wrapper <- J("it/polimi/genomics/r/Wrapper")
  outcome <- wrapper$map(
    join_condition_matrix,
    metadata_matrix,
    left_input_data$value,
    right_input_data$value
  )

  # First element: error code as a string; second: error text or node value.
  status <- strtoi(outcome[1])
  payload <- outcome[2]
  if (status != 0) {
    stop(payload)
  }
  DAGgraph(payload)
}
0 | 89 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,197 @@ |
1 |
+#' GMQL Function: EXECUTE |
|
2 |
+#' |
|
3 |
+#' execute GMQL query. |
|
4 |
+#' The function works only after invoking at least one materialize |
|
5 |
+#' |
|
6 |
+#' @details |
|
7 |
+#' |
|
8 |
+#' After invoking the execute function, all variables associated with the DAG will be removed
|
9 |
+#' from the Scala environment, although the associated R variables will remain stored in the R environment
|
10 |
+#' |
|
11 |
+#' @importFrom rJava J |
|
12 |
+#' |
|
13 |
+#' @return None |
|
14 |
+#' |
|
15 |
+#' @examples |
|
16 |
+#' |
|
17 |
+#' initGMQL() |
|
18 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
19 |
+#' r = readDataset(test_path) |
|
20 |
+#' s = select(input_data = r) |
|
21 |
+#' m = merge(groupBy = c("antibody_target","cell_karyotype"),input_data = s)
|
22 |
+#' materialize(input_data = m, dir_out = test_path) |
|
23 |
+#' |
|
24 |
+#' \dontrun{ |
|
25 |
+#' execute() |
|
26 |
+#' } |
|
27 |
+#' @export |
|
28 |
+#' |
|
29 |
execute <- function() {
  # Runs the query through the Java/Scala wrapper.
  # When the wrapper reports local processing, .download_or_upload() runs
  # before the execution; when it reports remote processing, the transfer
  # runs after a successful execution and the query text returned by the
  # wrapper is sent with serializeQuery().
  wrapper <- J("it/polimi/genomics/r/Wrapper")
  is_remote <- wrapper$is_remote_processing()

  if (!is_remote) {
    .download_or_upload()
  }

  outcome <- wrapper$execute()
  # First element: error code as a string; second: error text or payload.
  status <- strtoi(outcome[1])
  payload <- outcome[2]
  if (status != 0) {
    stop(payload)
  }

  if (is_remote) {
    server_url <- wrapper$get_url()
    .download_or_upload()
    serializeQuery(server_url, FALSE, payload)
  }
}
|
51 |
+ |
|
52 |
.download_or_upload <- function() {
  # Walks the dataset list kept by the wrapper and, for every row, either
  # uploads it (remote processing) or downloads it (local processing).
  wrapper <- J("it/polimi/genomics/r/Wrapper")

  # One list per row of the matrix returned by the wrapper; the row fields
  # are used positionally below (their meaning is defined by the wrapper).
  dataset_matrix <- wrapper$get_dataset_list()
  dataset_rows <- apply(dataset_matrix, 1, as.list)
  server_url <- wrapper$get_url()

  transfer <- if (wrapper$is_remote_processing()) {
    function(entry) {
      uploadSamples(server_url, entry[[2]], entry[[1]], entry[[3]], FALSE)
    }
  } else {
    function(entry) {
      downloadDataset(server_url, entry[[2]], entry[[1]])
    }
  }

  sapply(dataset_rows, transfer)
}
|
72 |
+ |
|
73 |
+#' GMQL Operation: MATERIALIZE |
|
74 |
+#' |
|
75 |
+#' It saves the contents of a dataset that contains samples metadata and samples regions. |
|
76 |
+#' It is normally used to persist the contents of any dataset generated during a GMQL query. |
|
77 |
+#' Any dataset can be materialized, but the operation can be very time-consuming. |
|
78 |
+#' For best performance, materialize the relevant data only. |
|
79 |
+#' |
|
80 |
+#' @importFrom rJava J |
|
81 |
+#' |
|
82 |
+#' @param input_data returned object from any GMQL function |
|
83 |
+#' @param dir_out destination folder path. |
|
84 |
+#' by default is current working directory of the R process |
|
85 |
+#' |
|
86 |
+#' @return None |
|
87 |
+#' |
|
88 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
89 |
+#' |
|
90 |
+#' @examples |
|
91 |
+#' |
|
92 |
+#' initGMQL("gtf") |
|
93 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
94 |
+#' r = readDataset(test_path) |
|
95 |
+#' s = select(input_data = r) |
|
96 |
+#' m = merge(groupBy = c("antibody_target","cell_karyotype"),input_data = s)
|
97 |
+#' materialize(input_data = m, dir_out = test_path) |
|
98 |
+#' |
|
99 |
+#' @export |
|
100 |
+#' |
|
101 |
materialize <- function(input_data, dir_out = getwd()) {
  # Registers input_data for materialization into dir_out through the
  # Java/Scala wrapper. Raises an error carrying the wrapper's message when
  # the returned error code is non-zero; otherwise returns NULL invisibly.
  wrapper <- J("it/polimi/genomics/r/Wrapper")
  outcome <- wrapper$materialize(input_data$value, dir_out)

  # First element: error code as a string; second: the associated text.
  status <- strtoi(outcome[1])
  payload <- outcome[2]
  if (status != 0) {
    stop(payload)
  }
  invisible(NULL)
}
|
112 |
+ |
|
113 |
+ |
|
114 |
+#' GMQL Operation: TAKE |
|
115 |
+#' |
|
116 |
+#' It saves the contents of a dataset that contains samples metadata and samples regions. |
|
117 |
+#' It is normally used to store in memory the contents of any dataset generated during a GMQL query.
|
118 |
+#' the operation can be very time-consuming. |
|
119 |
+#' If you have invoked any materialization before take function, all those dataset will be materialized |
|
120 |
+#' as folder (like if execution was invoked) |
|
121 |
+#' |
|
122 |
+#' @import GenomicRanges |
|
123 |
+#' @importFrom stats setNames |
|
124 |
+#' @importFrom rJava J |
|
125 |
+#' @importFrom rJava .jevalArray |
|
126 |
+#' |
|
127 |
+#' @param input_data returned object from any GMQL function |
|
128 |
+#' @param rows number of rows for each sample regions that you want to retrieve and stored in memory |
|
129 |
+#' by default is 0 that means take all rows for each sample |
|
130 |
+#' |
|
131 |
+#' @return GrangesList with associated metadata |
|
132 |
+#' |
|
133 |
+#' @examples |
|
134 |
+#' |
|
135 |
+#' initGMQL() |
|
136 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
137 |
+#' r = readDataset(test_path) |
|
138 |
+#' m = merge(groupBy = c("antibody_target","cell_karyotype"),input_data = r) |
|
139 |
+#' g <- take(input_data = m, rows = 45) |
|
140 |
+#' |
|
141 |
+#' @export |
|
142 |
+#' |
|
143 |
take <- function(input_data, rows = 0L) {
  # Asks the wrapper for the regions, metadata and schema of input_data and
  # assembles them into a GRangesList carrying the metadata.

  # Only the first element of rows is used; negative values are rejected
  # before any call to the wrapper.
  rows <- as.integer(rows[1])
  if (rows < 0) {
    stop("rows cannot be negative")
  }

  wrapper <- J("it/polimi/genomics/r/Wrapper")
  outcome <- wrapper$take(input_data$value, rows)
  # First element: error code as a string; second: the associated text.
  status <- strtoi(outcome[1])
  payload <- outcome[2]
  if (status != 0) {
    stop(payload)
  }

  region_table <- .jevalArray(wrapper$get_reg(), simplify = TRUE)
  if (is.null(region_table)) {
    stop("no regions defined")
  }
  meta_table <- .jevalArray(wrapper$get_meta(), simplify = TRUE)
  if (is.null(meta_table)) {
    stop("no metadata defined")
  }
  schema_fields <- .jevalArray(wrapper$get_schema(), simplify = TRUE)
  if (is.null(schema_fields)) {
    stop("no schema defined")
  }

  # The first column is the grouping key: regions are split on it and the
  # key column is dropped from every group before naming the rest.
  region_frame <- as.data.frame(region_table)
  per_sample <- split(region_frame, region_frame[1])
  column_names <- c("seqname", "start", "end", "strand", schema_fields)

  to_granges <- function(sample_frame) {
    sample_frame <- sample_frame[-1]
    names(sample_frame) <- column_names
    GenomicRanges::makeGRangesFromDataFrame(
      sample_frame,
      keep.extra.columns = TRUE,
      start.field = "start",
      end.field = "end"
    )
  }
  granges <- GRangesList(lapply(per_sample, to_granges))

  S4Vectors::metadata(granges) <- .metadata_from_frame_to_list(meta_table)
  granges
}
|
183 |
+ |
|
184 |
.metadata_from_frame_to_list <- function(metadata_frame) {
  # Turns a three-column table (grouping key, attribute name, attribute
  # value) into a list with one entry per key; each entry is a list of the
  # values as character strings, named by the attribute names.
  frame <- as.data.frame(metadata_frame)
  groups <- split(frame, frame[1])

  to_named_list <- function(group) {
    # Drop the key column: what remains is (name, value).
    pairs <- group[-1]
    stats::setNames(as.list(as.character(pairs[[2]])), pairs[[1]])
  }

  # Returned invisibly, matching a function that ends on an assignment.
  invisible(lapply(groups, to_named_list))
}
|
195 |
+ |
|
196 |
+ |
|
197 |
+ |
0 | 198 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,60 @@ |
1 |
+#' GMQL Operation: MERGE |
|
2 |
+#' |
|
3 |
+#' It builds a dataset consisting of a single sample having as many regions |
|
4 |
+#' as the number of regions of the input data and as many metadata as the union of
|
5 |
+#' the 'attribute-value' tuples of the input samples. |
|
6 |
+#' A groupby clause can be specified on metadata: the samples are then partitioned in groups, |
|
7 |
+#' each with a distinct value of the grouping metadata attributes. |
|
8 |
+#' The operation is separately applied to each group, yielding one sample in the result for each group. |
|
9 |
+#' Samples whose names are not present in the grouping metadata parameter are disregarded. |
|
10 |
+#' |
|
11 |
+#' @importFrom rJava J |
|
12 |
+#' @importFrom rJava .jnull |
|
13 |
+#' @importFrom rJava .jarray |
|
14 |
+#' |
|
15 |
+#' @param input_data returned object from any GMQL function |
|
16 |
+#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
17 |
+#' (i.e c("cell_type","attribute_tag","size")). |
|
18 |
+#' Every object contains the name of metadata to be used in \emph{groupBy}. |
|
19 |
+#' For details of CONDITION objects see: |
|
20 |
+#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
21 |
+#' |
|
22 |
+#' Every condition accepts only one string value (e.g. DEF("cell_type") ) |
|
23 |
+#' In case of single concatenation with no CONDITION, all metadata are considered as DEF
|
24 |
+#' |
|
25 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
26 |
+#' as input for the subsequent GMQL function |
|
27 |
+#' |
|
28 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
29 |
+#' |
|
30 |
+#' @examples |
|
31 |
+#' |
|
32 |
+#' ## it creates a dataset called merged which contains one sample for each antibody_target value |
|
33 |
+#' ## found within the metadata of the exp dataset sample; |
|
34 |
+#' ## each created sample contains all regions from all 'exp' samples with a specific value for their |
|
35 |
+#' ## antibody_target metadata attribute. |
|
36 |
+#' |
|
37 |
+#' initGMQL("gtf") |
|
38 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
39 |
+#' exp = readDataset(test_path) |
|
40 |
+#' merged = merge(input_data = exp, groupBy = c("antibody_target")) |
|
41 |
+#' |
|
42 |
+#' @export |
|
43 |
+#' |
|
44 |
merge <- function(input_data, groupBy = NULL) {
  # Adds a MERGE node to the query graph held by the Java/Scala wrapper and
  # returns a DAGgraph built from the wrapper's answer.

  # groupBy is optional: a Java null is passed when it is absent.
  group_matrix <- if (is.null(groupBy)) {
    .jnull("java/lang/String")
  } else {
    .jarray(.join_condition(groupBy), dispatch = TRUE)
  }

  wrapper <- J("it/polimi/genomics/r/Wrapper")
  outcome <- wrapper$merge(group_matrix, input_data$value)

  # First element: error code as a string; second: error text or node value.
  status <- strtoi(outcome[1])
  payload <- outcome[2]
  if (status != 0) {
    stop(payload)
  }
  DAGgraph(payload)
}
|
60 |
+ |
0 | 61 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,166 @@ |
1 |
+#' GMQL operation: ORDER |
|
2 |
+#' |
|
3 |
+#' It is used to order either samples or sample regions or both, |
|
4 |
+#' according to a set of metadata and/or region attributes, and/or region coordinates. |
|
5 |
+#' Order can be specified as ascending / descending for every attribute |
|
6 |
+#' The number of samples and their regions remain the same (unless mtop/rtop parameters specified) |
|
7 |
+#' but a new ordering metadata and/or region attribute is added. |
|
8 |
+#' Sorted samples or regions have a new attribute "order", added to either metadata, or regions, |
|
9 |
+#' or both of them as specified in input |
|
10 |
+#' The input mtop = k and rtop = m extracts the first k samples and m regions respectively, |
|
11 |
+#' the clause mtopg = k and rtopg = m performs grouping operation, |
|
12 |
+#' grouping by identical values of ordering attributes |
|
13 |
+#' and then selects the first k samples or regions of each group |
|
14 |
+#' |
|
15 |
+#' @importFrom rJava J |
|
16 |
+#' @importFrom rJava .jnull |
|
17 |
+#' @importFrom rJava .jarray |
|
18 |
+#' |
|
19 |
+#' @param input_data "url-like" string taken from GMQL function |
|
20 |
+#' @param metadata_ordering list of ORDER objects where every object contains the name of metadata |
|
21 |
+#' The ORDER's available are: \code{\link{ASC}}, \code{\link{DESC}} |
|
22 |
+#' Every condition accepts only one string value. (e.g. ASC("cell_type") ) |
|
23 |
+#' @param mtop integer value specifying the first k samples. |
|
24 |
+#' default is 0 that means every sample must be considered |
|
25 |
+#' @param mtopg integer value specifying the first j samples in each group. |
|
26 |
+#' default is 0 that means every sample must be considered |
|
27 |
+#' @param mtopp integer value specifying the first j samples in each group. |
|
28 |
+#' default is 0 that means every sample must be considered |
|
29 |
+#' @param regions_ordering list of ORDER objects where every object contains the name of region schema value |
|
30 |
+#' The ORDER's available are: ASC, DESC. |
|
31 |
+#' Every condition accepts only one string value. (e.g. DESC("pvalue") ) |
|
32 |
+#' @param rtop integer value specifying the first m samples in each group. |
|
33 |
+#' default is 0 that means every sample must be considered |
|
34 |
+#' @param rtopg integer value specifying the first i samples in each group. |
|
35 |
+#' default is 0 that means every sample must be considered |
|
36 |
+#' @param rtopp integer value specifying the first i samples in each group. |
|
37 |
+#' default is 0 that means every sample must be considered |
|
38 |
+#' |
|
39 |
+#' |
|
40 |
+#' @return DAGgraph class object. It contains the value associated to the graph used |
|
41 |
+#' as input for the subsequent GMQL function |
|
42 |
+#' |
|
43 |
+#' @details |
|
44 |
+#' mtop, mtopg,mtopp, rtop, rtopg and rtopp are normally numbers: if you specify a vector, |
|
45 |
+#' only the first element will be used |
|
46 |
+#' mtop, mtopg and mtopp are mutually exclusive, and so are rtop, rtopg and rtopp
|
47 |
+#' |
|
48 |
+#' |
|
49 |
+#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
50 |
+#' |
|
51 |
+#' @examples |
|
52 |
+#' |
|
53 |
+#' ## it orders the samples according to the Region_count metadata attribute and takes the two samples |
|
54 |
+#' ## that have the highest count. |
|
55 |
+#' |
|
56 |
+#' initGMQL("gtf") |
|
57 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
58 |
+#' r = readDataset(test_path) |
|
59 |
+#' o = order(r,list(DESC("Region_Count")), mtop = 2) |
|
60 |
+#' |
|
61 |
+#' @export |
|
62 |
+#' |
|
63 |
order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0,
                  mtopp = 0, regions_ordering = NULL, rtop = 0, rtopg = 0,
                  rtopp = 0) {
  # Adds an ORDER node to the query graph held by the Java/Scala wrapper and
  # returns a DAGgraph built from the wrapper's answer.
  #
  # The six "top" limits must be numeric. Only their first element is used
  # (after conversion to integer). Within each family (mtop/mtopg/mtopp and
  # rtop/rtopg/rtopp) at most one limit is kept: conflicting ones are reset
  # to 0 with a warning.

  if (!is.numeric(mtop) || !is.numeric(mtopg) || !is.numeric(rtop) ||
      !is.numeric(rtopg) || !is.numeric(mtopp) || !is.numeric(rtopp)) {
    stop("mtop, rtop, rtopg and mtopg must be integer")
  }

  # Warn only when a limit really is a vector with more than one element.
  # (The previous test, length(x) > 0, was true for every scalar -- including
  # the defaults -- so the warning was raised on every single call.)
  if (length(mtop) > 1 || length(mtopg) > 1 || length(rtop) > 1 ||
      length(rtopg) > 1 || length(mtopp) > 1 || length(rtopp) > 1) {
    warning("only the first element is taken by rtop, mtop, mtopg, rtopg, rtopp, mtopp")
  }

  # Keep the first element of each limit and drop the rest.
  mtop <- as.integer(mtop[1])
  mtopg <- as.integer(mtopg[1])
  mtopp <- as.integer(mtopp[1])

  rtop <- as.integer(rtop[1])
  rtopg <- as.integer(rtopg[1])
  rtopp <- as.integer(rtopp[1])

  # Metadata limits: mtop wins over mtopg and mtopp, mtopg wins over mtopp.
  if (mtop > 0 && mtopg > 0) {
    warning("cannot be used together.\nWe set mtopg = 0")
    mtopg <- 0L
  }
  if (mtop > 0 && mtopp > 0) {
    warning("cannot be used together.\nWe set mtopp = 0")
    mtopp <- 0L
  }
  if (mtopg > 0 && mtopp > 0) {
    warning("cannot be used together.\nWe set mtopp = 0")
    mtopp <- 0L
  }

  # Region limits: rtop wins over rtopg and rtopp, rtopg wins over rtopp.
  if (rtop > 0 && rtopg > 0) {
    warning("cannot be used together.\nWe set rtopg = 0")
    rtopg <- 0L
  }
  if (rtop > 0 && rtopp > 0) {
    warning("cannot be used together.\nWe set rtopp = 0")
    rtopp <- 0L
  }
  if (rtopg > 0 && rtopp > 0) {
    warning("cannot be used together.\nWe set rtopp = 0")
    rtopp <- 0L
  }

  # Optional orderings become either a Java array or a Java null.
  if (!is.null(metadata_ordering)) {
    meta_matrix <- .jarray(.ordering_meta(metadata_ordering), dispatch = TRUE)
  } else {
    meta_matrix <- .jnull("java/lang/String")
  }

  if (!is.null(regions_ordering)) {
    region_matrix <- .jarray(.ordering_meta(regions_ordering), dispatch = TRUE)
  } else {
    region_matrix <- .jnull("java/lang/String")
  }

  WrappeR <- J("it/polimi/genomics/r/Wrapper")
  response <- WrappeR$order(meta_matrix, mtopg, mtop, mtopp, region_matrix,
                            rtopg, rtop, rtopp, input_data$value)

  # First element: error code as a string; second: error text or node value.
  error <- strtoi(response[1])
  data <- response[2]
  if (error != 0) {
    stop(data)
  }
  DAGgraph(data)
}
|
140 |
+ |
|
141 |
+ |
|
142 |
.ordering_meta <- function(ordering) {
  # Builds a character matrix with one row per ordering entry and the
  # columns (direction, attribute name).
  #   * list input: every element is turned into its character form; a
  #     single string gets "ASC" prepended, anything longer must start with
  #     "ASC" or "DESC".
  #   * character input: every name is paired with "ASC".
  # Any other input type is rejected.
  if (!is.list(ordering) && !is.character(ordering)) {
    stop("only list or character")
  }

  to_pair <- if (is.list(ordering)) {
    function(item) {
      pair <- as.character(item)
      if (length(pair) == 1) {
        return(c("ASC", pair))
      }
      if (!(identical("ASC", pair[1]) || identical("DESC", pair[1]))) {
        stop("no more than one value")
      }
      pair
    }
  } else {
    function(item) c("ASC", item)
  }

  # sapply() stacks the pairs column-wise, hence the transpose. The result
  # is returned invisibly, matching a function that ends on an assignment.
  invisible(t(sapply(ordering, to_pair)))
}
|
165 |
+ |
|
166 |
+ |
0 | 167 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,132 @@ |
1 |
#' GMQL Operation: PROJECT
#'
#' It creates, from an existing dataset, a new dataset with all the samples
#' from the input dataset, but keeping for each sample only those metadata
#' and/or region attributes expressed in the operator parameter list.
#' Region coordinates and values of the remaining metadata remain equal to
#' those in the input dataset.
#' It allows to:
#' \itemize{
#' \item{Remove existing metadata and/or region attributes from a dataset}
#' \item{Create new metadata and/or region attributes in the result}
#' }
#'
#' @importFrom rJava J
#' @importFrom rJava .jnull
#' @importFrom rJava .jarray
#'
#' @param input_data string pointer taken from GMQL function
#' @param metadata vector of string made up by metadata attribute.
#' Empty strings and duplicated names are discarded.
#' @param regions vector of string made up by schema field attribute.
#' Empty strings and duplicated names are discarded.
#' @param all_but_reg logical value indicating which schema field attribute
#' you want to exclude.
#' If FALSE only the regions you choose are kept in the output of the
#' project operation,
#' if TRUE the schema regions are all except the ones included in the
#' regions parameter.
#' If regions is not defined \emph{all_but_reg} is not considered.
#' Only the first value is used.
#' @param all_but_meta logical value indicating which metadata you want to
#' exclude.
#' If FALSE only the metadata you choose are kept in the output of the
#' project operation,
#' if TRUE the metadata are all except the ones included in the metadata
#' parameter.
#' If metadata is not defined \emph{all_but_meta} is not considered.
#' Only the first value is used.
#' @param regions_update single string predicate made up by operation on
#' schema field attribute
#' @param metadata_update single string predicate made up by operation on
#' metadata attribute
#'
#' @return DAGgraph class object. It contains the value associated to the
#' graph used as input for the subsequent GMQL function
#'
#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf}
#'
#'
#' @examples
#'
#' ## it creates a new dataset called CTCF_NORM_SCORE by preserving all
#' ## region attributes apart from score, and creating a new region
#' ## attribute called new_score by dividing the existing score value
#' ## of each region by 1000.0 and incrementing it by 100.
#' ## It also generates, for each sample of the new dataset,
#' ## a new metadata attribute called normalized with value 1,
#' ## which can be used in future selections.
#'
#' initGMQL("gtf")
#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
#' input = readDataset(test_path)
#' CTCF_NORM_SCORE = project(input, metadata_update="normalized AS 1",
#'     regions_update="new_score AS (score / 1000.0) + 100",
#'     regions=c("score"), all_but_reg=TRUE)
#'
#'
#' \dontrun{
#'
#' ## it produces an output dataset that contains the same samples as the
#' ## input dataset.
#' ## Each output sample only contains, as region attributes,
#' ## the four basic coordinates (chr, left, right, strand) and the
#' ## specified region attributes 'variant_classification' and
#' ## 'variant_type', and as metadata attributes only the specified ones,
#' ## i.e. manually_curated__tissue_status and manually_curated__tumor_tag.
#'
#' initGMQL("gtf")
#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL")
#' DS_in = readDataset(test_path)
#' DS_out = project(DS_in,regions=c("variant_classification", "variant_type"),
#' metadata=c("manually_curated__tissue_status","manually_curated__tumor_tag"))
#'
#' }
#'
#' @export
#'
#'
project <- function(input_data, metadata = NULL, metadata_update = NULL,
                    all_but_meta = FALSE, regions = NULL,
                    regions_update = NULL, all_but_reg = FALSE)
{
    if(!is.null(metadata))
    {
        if(!is.character(metadata))
            stop("metadata: no valid input")

        # drop empty strings and repeated attribute names
        metadata <- metadata[!metadata %in% ""]
        metadata <- metadata[!duplicated(metadata)]

        # Nothing left after cleaning: pass a Java null, exactly as when the
        # argument is omitted, rather than wrapping the null into an array.
        if(length(metadata) == 0)
            metadata <- .jnull("java/lang/String")
        else
            metadata <- .jarray(metadata)
    }
    else
        metadata <- .jnull("java/lang/String")

    if(!is.null(regions))
    {
        if(!is.character(regions))
            stop("regions: no valid input")

        # drop empty strings and repeated attribute names
        regions <- regions[!regions %in% ""]
        regions <- regions[!duplicated(regions)]

        # same rule as for metadata: an empty selection becomes a Java null
        if(length(regions) == 0)
            regions <- .jnull("java/lang/String")
        else
            regions <- .jarray(regions)
    }
    else
        regions <- .jnull("java/lang/String")

    # .check_predicate() is defined elsewhere in the package; it is called
    # here only for validation and its result is not used.
    if(!is.null(regions_update))
        .check_predicate(regions_update)
    else
        regions_update <- .jnull("java/lang/String")

    if(!is.null(metadata_update))
        .check_predicate(metadata_update)
    else
        metadata_update <- .jnull("java/lang/String")

    # the flags are scalars: warn on extra values and keep only the first
    if(length(all_but_meta) > 1)
        warning("all_but_meta: no multiple values")

    if(length(all_but_reg) > 1)
        warning("all_but_reg: no multiple values")
    all_but_reg <- all_but_reg[1]
    all_but_meta <- all_but_meta[1]

    WrappeR <- J("it/polimi/genomics/r/Wrapper")
    response <- WrappeR$project(metadata, metadata_update, all_but_meta,
                                regions, regions_update, all_but_reg,
                                input_data$value)

    # response[1] is the error code, response[2] the error text or payload
    error <- strtoi(response[1])
    data <- response[2]
    if(error != 0)
        stop(data)
    else
        DAGgraph(data)
}
127 | 127 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,120 @@ |
1 |
+#' GMQL Operation: SELECT |
|
2 |
+#' |
|
3 |
+#' It extracts a subset of samples from the input dataset. |
|
4 |
+#' It returns all the samples satisfying the predicate on metadata. |
|
5 |
+#' If regions are specified, returns regions satisfying the predicate on regions. |
|
6 |
+#' If semijoin clauses are specified they are applied, too. |
|
7 |
+#' When semijoin is defined, it extracts those samples containing all metadata attributes defined in the semijoin clause |
|
8 |
+#' with at least one metadata value in common with semi join dataset |
|