... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: RGMQL |
2 | 2 |
Type: Package |
3 |
-Title: GMQL function |
|
4 |
-Version: 0.99.0 |
|
3 |
+Title: GenoMetric Query Language for R/Bioconductor |
|
4 |
+Version: 0.99.1 |
|
5 | 5 |
Author: Simone Pallotta, Marco Masseroli |
6 | 6 |
Maintainer: Simone Pallotta <simonepallotta@hotmail.com> |
7 | 7 |
Description: This package brings GMQL functionalities into the R environment. |
... | ... |
@@ -21,7 +21,7 @@ License: Artistic-2.0 |
21 | 21 |
Encoding: UTF-8 |
22 | 22 |
LazyData: true |
23 | 23 |
RoxygenNote: 6.0.1 |
24 |
-Imports: rscala(>= 2.4.0), httr, GenomicRanges, rtracklayer, data.table, utils, plyr, xml2, methods, S4Vectors, dplyr, stats |
|
24 |
+Imports: httr, rJava, GenomicRanges, rtracklayer, data.table, utils, plyr, xml2, methods, S4Vectors, dplyr, stats |
|
25 | 25 |
Depends: R(>= 3.3.2) |
26 | 26 |
VignetteBuilder: knitr |
27 | 27 |
Suggests: BiocStyle, knitr, rmarkdown |
... | ... |
@@ -1,78 +1,34 @@ |
1 | 1 |
# Generated by roxygen2: do not edit by hand |
2 | 2 |
|
3 |
-export(ASC) |
|
4 |
-export(AVG) |
|
5 |
-export(BAG) |
|
6 |
-export(COUNT) |
|
7 |
-export(DEF) |
|
8 |
-export(DESC) |
|
9 |
-export(DGE) |
|
10 |
-export(DLE) |
|
11 |
-export(DOWN) |
|
12 |
-export(EXACT) |
|
13 |
-export(FULL) |
|
14 |
-export(MAX) |
|
15 |
-export(MD) |
|
16 |
-export(MEDIAN) |
|
17 |
-export(MIN) |
|
18 |
-export(Q1) |
|
19 |
-export(Q2) |
|
20 |
-export(Q3) |
|
21 |
-export(STD) |
|
22 |
-export(SUM) |
|
23 | 3 |
export(TFARMatrix) |
24 | 4 |
export(TFARMemAtrix) |
25 |
-export(UP) |
|
26 |
-export(compileQuery) |
|
27 |
-export(compileQuery.fromfile) |
|
28 |
-export(cover) |
|
29 | 5 |
export(deleteDataset) |
30 |
-export(difference) |
|
31 | 6 |
export(downloadDataset) |
32 | 7 |
export(downloadDatasetToGrangesList) |
33 |
-export(execute) |
|
34 | 8 |
export(exportGMQL.gdm) |
35 | 9 |
export(exportGMQL.gtf) |
36 | 10 |
export(extend) |
37 |
-export(flat) |
|
38 |
-export(histogram) |
|
39 | 11 |
export(importGMQL.gdm) |
40 | 12 |
export(importGMQL.gtf) |
41 | 13 |
export(initGMQL) |
42 |
-export(join) |
|
43 | 14 |
export(login.GMQL) |
44 | 15 |
export(logout.GMQL) |
45 |
-export(map) |
|
46 |
-export(materialize) |
|
47 |
-export(merge) |
|
48 | 16 |
export(metadataFromSample) |
49 |
-export(order) |
|
50 |
-export(project) |
|
51 | 17 |
export(read) |
52 | 18 |
export(readDataset) |
53 | 19 |
export(regionFromSample) |
54 | 20 |
export(register.GMQL) |
55 | 21 |
export(remote_processing) |
56 |
-export(runQuery) |
|
57 |
-export(runQuery.fromfile) |
|
58 | 22 |
export(saveQuery) |
59 | 23 |
export(saveQuery.fromfile) |
60 |
-export(select) |
|
61 | 24 |
export(showDatasets) |
62 |
-export(showJobLog) |
|
63 |
-export(showJobs) |
|
64 | 25 |
export(showQueries) |
65 | 26 |
export(showSamplesFromDataset) |
66 | 27 |
export(showSchemaFromDataset) |
67 |
-export(stopJob) |
|
68 |
-export(summit) |
|
69 |
-export(take) |
|
70 |
-export(traceJob) |
|
71 |
-export(union) |
|
72 | 28 |
export(uploadSamples) |
73 | 29 |
import(GenomicRanges) |
74 | 30 |
import(httr) |
75 |
-import(rscala) |
|
31 |
+import(rJava) |
|
76 | 32 |
import(xml2) |
77 | 33 |
importClassesFrom(GenomicRanges,GRangesList) |
78 | 34 |
importFrom(GenomicRanges,makeGRangesFromDataFrame) |
... | ... |
@@ -81,9 +37,11 @@ importFrom(data.table,fread) |
81 | 37 |
importFrom(dplyr,bind_cols) |
82 | 38 |
importFrom(methods,is) |
83 | 39 |
importFrom(plyr,revalue) |
40 |
+importFrom(rJava,.jinit) |
|
41 |
+importFrom(rJava,.jnew) |
|
42 |
+importFrom(rJava,.jpackage) |
|
84 | 43 |
importFrom(rtracklayer,export) |
85 | 44 |
importFrom(rtracklayer,import) |
86 |
-importFrom(stats,setNames) |
|
87 | 45 |
importFrom(utils,read.delim) |
88 | 46 |
importFrom(utils,unzip) |
89 | 47 |
importFrom(utils,write.table) |
90 | 48 |
deleted file mode 100644 |
... | ... |
@@ -1,348 +0,0 @@ |
1 |
-#' GMQL Operation: COVER |
|
2 |
-#' |
|
3 |
-#' it takes as input a dataset and returns another dataset (with a single sample, if no \emph{groupby} option is specified) |
|
4 |
-#' by “collapsing” the input dataset samples and their regions according to certain rules specified by the input parameters. |
|
5 |
-#' The attributes of the output genomic regions are only the region coordinates, and Jaccard indexes (JaccardIntersect and JaccardResult). |
|
6 |
-#' Jaccard Indexes are standard measures of similarity of the contributing regions, added as default region attributes. |
|
7 |
-#' The JaccardIntersect index is calculated as the ratio between the lengths of the intersection |
|
8 |
-#' and of the union of the contributing regions; the JaccardResult index is calculated as the ratio |
|
9 |
-#' between the lengths of region and the union of the contributing regions. |
|
10 |
-#' If aggregate functions are specified, new attributes are added. |
|
11 |
-#' Output metadata are the union of the input ones. |
|
12 |
-#' If \emph{groupby} clause is specified, the input samples are partitioned in groups, |
|
13 |
-#' each with distinct values of the grouping metadata attributes, and the COVER operation is separately |
|
14 |
-#' applied to each group, yielding to one sample in the result for each group. |
|
15 |
-#' Input samples that do not satisfy the \emph{groupby} condition are disregarded. |
|
16 |
-#' |
|
17 |
-#' @importFrom methods is |
|
18 |
-#' |
|
19 |
-#' @param input_data returned object from any GMQL function |
|
20 |
-#' @param minAcc minimum number of overlapping regions to be considered during executio.n |
|
21 |
-#' Is a single integer number, declared also as string. |
|
22 |
-#' minAcc accept ALL and string like (ALL+N)/K as special keyword |
|
23 |
-#' ALL sets the minimum to the number of samples in the input dataset |
|
24 |
-#' @param maxAcc maximum number of overlapping regions to be considered during execution. |
|
25 |
-#' Is a single integer number, declared also as string. |
|
26 |
-#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword |
|
27 |
-#' ALL sets the maximum to the number of samples in the input dataset |
|
28 |
-#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping |
|
29 |
-#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
30 |
-#' (i.e c("cell_type","attribute_tag","size")). |
|
31 |
-#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
32 |
-#' For details of CONDITION objects see: |
|
33 |
-#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
34 |
-#' |
|
35 |
-#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
36 |
-#' In case of single concatenation with no CONDITION, all metadata are considering as DEF |
|
37 |
-#' |
|
38 |
-#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
39 |
-#' The \emph{function_aggregate} is an object of class OPERATOR |
|
40 |
-#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
41 |
-#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
42 |
-#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, |
|
43 |
-#' \code{\link{Q3}}. |
|
44 |
-#' Every operator accepts a string value, execet for COUNT that cannot have a value. |
|
45 |
-#' Argument of 'function_aggregate' must exist in schema |
|
46 |
-#' Two style are allowed: |
|
47 |
-#' \itemize{ |
|
48 |
-#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
49 |
-#' \item list of values: e.g. SUM("pvalue") |
|
50 |
-#' } |
|
51 |
-#' "mixed style" is not allowed |
|
52 |
-#' |
|
53 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
54 |
-#' as input for the subsequent GMQL function |
|
55 |
-#' |
|
56 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
57 |
-#' |
|
58 |
-#' @seealso \code{\link{summit}} \code{\link{flat}} \code{\link{histogram}} |
|
59 |
-#' |
|
60 |
-#' @examples |
|
61 |
-#' |
|
62 |
-#' ## This GMQL statement produces an output dataset with a single output sample. |
|
63 |
-#' ## The COVER operation considers all areas defined by a minimum of two overlapping regions |
|
64 |
-#' ## in the input samples, up to any amount of overlapping regions. |
|
65 |
-#' |
|
66 |
-#' initGMQL("gtf") |
|
67 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
68 |
-#' exp = readDataset(test_path) |
|
69 |
-#' res = cover(input_data = exp,2,"ANY") |
|
70 |
-#' |
|
71 |
-#' \dontrun{ |
|
72 |
-#' ## This GMQL statement computes the result grouping the input exp samples by the values of |
|
73 |
-#' ## their cell metadata attribute, |
|
74 |
-#' ## thus one output res sample is generated for each cell type; |
|
75 |
-#' ## output regions are produced where at least 2 and at most 3 regions of grouped exp samples |
|
76 |
-#' ## overlap, setting as attributes of the resulting regions the minimum pvalue of the overlapping regions |
|
77 |
-#' ## (min_pvalue) and their Jaccard indexes (JaccardIntersect and JaccardResult). |
|
78 |
-#' |
|
79 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
80 |
-#' exp = read(test_path) |
|
81 |
-#' res = cover(input_data = exp,2,3, c("cell"), list(min_pValue = MIN("pvalue"))) |
|
82 |
-#' } |
|
83 |
-#' @export |
|
84 |
-#' |
|
85 |
-cover <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) |
|
86 |
-{ |
|
87 |
- .doVariant("COVER",minAcc,maxAcc,groupBy,aggregates,input_data) |
|
88 |
-} |
|
89 |
- |
|
90 |
-#' GMQL Operation: HISTOGRAM |
|
91 |
-#' |
|
92 |
-#' returns the non-overlapping regions contributing to the cover, |
|
93 |
-#' each with its accumulation index value, which is assigned to the AccIndex region attribute. |
|
94 |
-#' |
|
95 |
-#' @importFrom methods is |
|
96 |
-#' |
|
97 |
-#' @param input_data returned object from any GMQL function |
|
98 |
-#' @param minAcc minimum number of overlapping regions to be considered during execution |
|
99 |
-#' normally is a single integer number, declared also as string. |
|
100 |
-#' minAcc accept ALL and string like (ALL+N)/K as special keyword |
|
101 |
-#' ALL sets the minimum to the number of samples in the input dataset |
|
102 |
-#' @param maxAcc maximum number of overlapping regions to be considered during execution |
|
103 |
-#' normally is a single integer number, declared also as string. |
|
104 |
-#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword |
|
105 |
-#' ALL sets the maximum to the number of samples in the input dataset |
|
106 |
-#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping |
|
107 |
-#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
108 |
-#' (i.e c("cell_type","attribute_tag","size")). |
|
109 |
-#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
110 |
-#' For details of CONDITION objects see: |
|
111 |
-#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
112 |
-#' |
|
113 |
-#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
114 |
-#' In case of single concatenation with no CONDITION, all metadata are considering as DEF |
|
115 |
-#' |
|
116 |
-#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
117 |
-#' The \emph{function_aggregate} is an object of class OPERATOR |
|
118 |
-#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
119 |
-#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
120 |
-#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, |
|
121 |
-#' \code{\link{Q3}}. |
|
122 |
-#' Every operator accepts a string value, execet for COUNT that cannot have a value. |
|
123 |
-#' Argument of 'function_aggregate' must exist in schema |
|
124 |
-#' Two style are allowed: |
|
125 |
-#' \itemize{ |
|
126 |
-#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
127 |
-#' \item list of values: e.g. SUM("pvalue") |
|
128 |
-#' } |
|
129 |
-#' "mixed style" is not allowed |
|
130 |
-#' |
|
131 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
132 |
-#' as input for the subsequent GMQL function |
|
133 |
-#' |
|
134 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
135 |
-#' @seealso \code{\link{flat}} \code{\link{cover}} \code{\link{summit}} |
|
136 |
-#' |
|
137 |
-#' @examples |
|
138 |
-#' |
|
139 |
-#' ## This GMQL statement computes the result grouping the input \emph{exp} samples |
|
140 |
-#' ## by the values of their \emph{cell} metadata attribute, |
|
141 |
-#' ## thus one output \emph{res} sample is generated for each cell type. |
|
142 |
-#' ## Output regions are produced by dividing results from COVER in contiguous subregions |
|
143 |
-#' ## according to the varying accumulation values (from 2 to 4 in this case): |
|
144 |
-#' ## one region for each accumulation value; |
|
145 |
-#' |
|
146 |
-#' initGMQL("gtf") |
|
147 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
148 |
-#' exp = readDataset(test_path) |
|
149 |
-#' res = histogram(exp, 2,4,groupBy = c("cell")) |
|
150 |
-#' |
|
151 |
-#' @export |
|
152 |
-#' |
|
153 |
-histogram <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) |
|
154 |
-{ |
|
155 |
- .doVariant("HISTOGRAM",minAcc,maxAcc,groupBy,aggregates,input_data) |
|
156 |
-} |
|
157 |
- |
|
158 |
-#' GMQL Operation: SUMMIT |
|
159 |
-#' |
|
160 |
-#' returns regions that start from a position |
|
161 |
-#' where the number of intersecting regions is not increasing afterwards and stops |
|
162 |
-#' at a position where either the number of intersecting regions decreases, |
|
163 |
-#' or it violates the max accumulation index). |
|
164 |
-#' |
|
165 |
-#' @importFrom methods is |
|
166 |
-#' |
|
167 |
-#' @param input_data returned object from any GMQL function |
|
168 |
-#' @param minAcc minimum number of overlapping regions to be considered during execution |
|
169 |
-#' normally is a single integer number, declared also as string. |
|
170 |
-#' minAcc accept ALL and string like (ALL+N)/K as special keyword |
|
171 |
-#' ALL sets the minimum to the number of samples in the input dataset |
|
172 |
-#' @param maxAcc maximum number of overlapping regions to be considered during execution |
|
173 |
-#' normally is a single integer number, declared also as string. |
|
174 |
-#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword |
|
175 |
-#' ALL sets the maximum to the number of samples in the input dataset |
|
176 |
-#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping |
|
177 |
-#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
178 |
-#' (i.e c("cell_type","attribute_tag","size")). |
|
179 |
-#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
180 |
-#' For details of CONDITION objects see: |
|
181 |
-#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
182 |
-#' |
|
183 |
-#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
184 |
-#' In case of single concatenation with no CONDITION, all metadata are considering as DEF |
|
185 |
-#' |
|
186 |
-#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
187 |
-#' The \emph{function_aggregate} is an object of class OPERATOR |
|
188 |
-#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
189 |
-#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
190 |
-#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, |
|
191 |
-#' \code{\link{Q3}}. |
|
192 |
-#' Every operator accepts a string value, execet for COUNT that cannot have a value. |
|
193 |
-#' Argument of 'function_aggregate' must exist in schema |
|
194 |
-#' Two style are allowed: |
|
195 |
-#' \itemize{ |
|
196 |
-#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
197 |
-#' \item list of values: e.g. SUM("pvalue") |
|
198 |
-#' } |
|
199 |
-#' "mixed style" is not allowed |
|
200 |
-#' |
|
201 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
202 |
-#' as input for the subsequent GMQL function |
|
203 |
-#' |
|
204 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
205 |
-#' @seealso \code{\link{flat}} \code{\link{cover}} \code{\link{histogram}} |
|
206 |
-#' |
|
207 |
-#' @examples |
|
208 |
-#' |
|
209 |
-#' ## This GMQL statement computes the result grouping the input \emph{exp} samples by the values |
|
210 |
-#' ## of their \emph{cell} metadata attribute, thus one output \emph{res} sample is generated |
|
211 |
-#' ## for each cell type. |
|
212 |
-#' ## Output regions are produced by extracting the highest accumulation overlapping |
|
213 |
-#' ## (sub)regions according to the methodologies described above; |
|
214 |
-#' |
|
215 |
-#' |
|
216 |
-#' initGMQL("gtf") |
|
217 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
218 |
-#' exp = readDataset(test_path) |
|
219 |
-#' res = summit(input_data = exp,2,4, c("cell")) |
|
220 |
-#' |
|
221 |
-#' @export |
|
222 |
-#' |
|
223 |
-summit <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) |
|
224 |
-{ |
|
225 |
- .doVariant("SUMMIT",minAcc,maxAcc,groupBy,aggregates,input_data) |
|
226 |
-} |
|
227 |
- |
|
228 |
-#' GMQL Operation: FLAT |
|
229 |
-#' |
|
230 |
-#' returns the contiguous region that starts from the first end and stops at |
|
231 |
-#' the last end of the regions which would contribute to each region of the COVER |
|
232 |
-#' |
|
233 |
-#' @importFrom methods is |
|
234 |
-#' |
|
235 |
-#' @param input_data returned object from any GMQL function |
|
236 |
-#' @param minAcc minimum number of overlapping regions to be considered during execution |
|
237 |
-#' normally is a single integer number, declared also as string. |
|
238 |
-#' minAcc accept ALL and string like (ALL+N)/K as special keyword |
|
239 |
-#' ALL sets the minimum to the number of samples in the input dataset |
|
240 |
-#' @param maxAcc maximum number of overlapping regions to be considered during execution |
|
241 |
-#' normally is a single integer number, declared also as string. |
|
242 |
-#' maxAcc accept ALL, ANY and string like (ALL+N)/K as special keyword |
|
243 |
-#' ALL sets the maximum to the number of samples in the input dataset |
|
244 |
-#' ANY acts as a wildcard, consider all areas defined to any amount of overlapping |
|
245 |
-#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
246 |
-#' (i.e c("cell_type","attribute_tag","size")). |
|
247 |
-#' Every object contains the name of metadata to be used in \emph{groupBy}. |
|
248 |
-#' For details of CONDITION objects see: |
|
249 |
-#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
250 |
-#' |
|
251 |
-#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
252 |
-#' In case of single concatenation with no CONDITION, all metadata are considering as DEF |
|
253 |
-#' |
|
254 |
-#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
255 |
-#' The \emph{function_aggregate} is an object of class OPERATOR |
|
256 |
-#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
257 |
-#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
258 |
-#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, |
|
259 |
-#' \code{\link{Q3}}. |
|
260 |
-#' Every operator accepts a string value, execet for COUNT that cannot have a value. |
|
261 |
-#' Argument of 'function_aggregate' must exist in schema |
|
262 |
-#' Two style are allowed: |
|
263 |
-#' \itemize{ |
|
264 |
-#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
265 |
-#' \item list of values: e.g. SUM("pvalue") |
|
266 |
-#' } |
|
267 |
-#' "mixed style" is not allowed |
|
268 |
-#' |
|
269 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
270 |
-#' as input for the subsequent GMQL function |
|
271 |
-#' |
|
272 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
273 |
-#' @seealso \code{\link{summit}} \code{\link{cover}} \code{\link{histogram}} |
|
274 |
-#' |
|
275 |
-#' @examples |
|
276 |
-#' |
|
277 |
-#' ## This GMQL statement computes the result grouping the input \emph{exp} samples by |
|
278 |
-#' ## the values of their \emph{cell} metadata attribute, thus one output \emph{res} sample |
|
279 |
-#' ## is generated for each cell type. |
|
280 |
-#' ## Output regions are produced by concatenating all regions which would have been used |
|
281 |
-#' ## to construct a COVER(2,4) statement on the same dataset; |
|
282 |
-#' |
|
283 |
-#' initGMQL("gtf") |
|
284 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
285 |
-#' exp = readDataset(test_path) |
|
286 |
-#' res = flat(input_data = exp,2,4, c("cell")) |
|
287 |
-#' |
|
288 |
-#' @export |
|
289 |
-#' |
|
290 |
-flat <- function(input_data, minAcc, maxAcc, groupBy = NULL, aggregates = NULL) |
|
291 |
-{ |
|
292 |
- .doVariant("FLAT",minAcc,maxAcc,groupBy,aggregates,input_data) |
|
293 |
-} |
|
294 |
- |
|
295 |
-.doVariant <- function(flag,minAcc,maxAcc,groupBy,aggregates,input_data) |
|
296 |
-{ |
|
297 |
- min <- .check_cover_param(minAcc,TRUE) |
|
298 |
- max <- .check_cover_param(maxAcc,FALSE) |
|
299 |
- |
|
300 |
- if(!is.null(groupBy)) |
|
301 |
- join_condition_matrix <- .join_condition(groupBy) |
|
302 |
- else |
|
303 |
- join_condition_matrix <- scalaNull("Array[Array[String]]") |
|
304 |
- |
|
305 |
- if(!is.null(aggregates)) |
|
306 |
- metadata_matrix <- .aggregates(aggregates,"OPERATOR") |
|
307 |
- else |
|
308 |
- metadata_matrix <- scalaNull("Array[Array[String]]") |
|
309 |
- |
|
310 |
- |
|
311 |
- response <- switch(flag, |
|
312 |
- "COVER" = WrappeR$cover(min,max,join_condition_matrix,metadata_matrix,input_data$value), |
|
313 |
- "FLAT" = WrappeR$flat(min,max,join_condition_matrix,metadata_matrix,input_data$value), |
|
314 |
- "SUMMIT" = WrappeR$summit(min,max,join_condition_matrix,metadata_matrix,input_data$value), |
|
315 |
- "HISTOGRAM" = WrappeR$histogram(min,max,join_condition_matrix,metadata_matrix,input_data$value)) |
|
316 |
- |
|
317 |
- error <- strtoi(response[1]) |
|
318 |
- data <- response[2] |
|
319 |
- |
|
320 |
- if(error!=0) |
|
321 |
- stop(data) |
|
322 |
- else |
|
323 |
- DAGgraph(data) |
|
324 |
-} |
|
325 |
- |
|
326 |
-.check_cover_param <- function(param,is_min) |
|
327 |
-{ |
|
328 |
- if(length(param)>1) |
|
329 |
- stop("length > 1") |
|
330 |
- |
|
331 |
- if(is.numeric(param)) |
|
332 |
- { |
|
333 |
- if(param<=0) |
|
334 |
- stop("No negative value") |
|
335 |
- else |
|
336 |
- return(as.integer(param)) |
|
337 |
- } |
|
338 |
- else if(is.character(param)) |
|
339 |
- { |
|
340 |
- if(is_min && identical(param,"ANY")) |
|
341 |
- stop("min cannot assume ANY as value") |
|
342 |
- return(param) |
|
343 |
- } |
|
344 |
- else |
|
345 |
- stop("invalid input data") |
|
346 |
-} |
|
347 |
- |
|
348 |
- |
349 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,79 +0,0 @@ |
1 |
-#' GMQL Operation: DIFFERENCE |
|
2 |
-#' |
|
3 |
-#' It produces one sample in the result for each sample of the left operand, |
|
4 |
-#' by keeping the same metadata of the left input sample and only those regions |
|
5 |
-#' (with their schema and values) of the left input sample which do not intersect with any region |
|
6 |
-#' in the right operand sample. |
|
7 |
-#' The optional \emph{joinby} clause is used to extract a subset of couples |
|
8 |
-#' from the cartesian product of two dataset \emph{left_input_data} x \emph{right_input_data} |
|
9 |
-#' on which to apply the DIFFERENCE operator: |
|
10 |
-#' only those samples that have the same value for each attribute |
|
11 |
-#' are considered when performing the difference. |
|
12 |
-#' |
|
13 |
-#' |
|
14 |
-#' @param right_input_data returned object from any GMQL function |
|
15 |
-#' @param left_input_data returned object from any GMQL function |
|
16 |
-#' @param joinBy list of CONDITION objects, or simple string concatenation |
|
17 |
-#' (i.e c("cell_type","attribute_tag","size")). |
|
18 |
-#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
19 |
-#' For details of CONDITION objects see: |
|
20 |
-#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
21 |
-#' |
|
22 |
-#' Every condition accepts only one string value (e.g. DEF("cell_type") ) |
|
23 |
-#' In case of single concatenation with no CONDITION, all metadata are considering as DEF |
|
24 |
-#' |
|
25 |
-#' @param is_exact single logical value: TRUE means that the region difference is executed only |
|
26 |
-#' on regions in left_input_data with exactly the same coordinates of at least one region present |
|
27 |
-#' in right_input_data; if is_exact = FALSE, the difference is executed on all regions in |
|
28 |
-#' left_input_data that overlap with at least one region in right_input_data (even just one base). |
|
29 |
-#' |
|
30 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
31 |
-#' as input for the subsequent GMQL function |
|
32 |
-#' |
|
33 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
34 |
-#' |
|
35 |
-#' @examples |
|
36 |
-#' |
|
37 |
-#' ## This GMQL statement returns all the regions in the first dataset that do not |
|
38 |
-#' ## overlap any region in the second dataset. |
|
39 |
-#' |
|
40 |
-#' initGMQL("gtf") |
|
41 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
42 |
-#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "GMQL") |
|
43 |
-#' r_left = readDataset(test_path) |
|
44 |
-#' r_right = readDataset(test_path2) |
|
45 |
-#' out = difference(r_left,r_right) |
|
46 |
-#' |
|
47 |
-#' \dontrun{ |
|
48 |
-#' ## This GMQL statement extracts for every pair of samples s1 in EXP1 and s2 in EXP2 |
|
49 |
-#' ## having the same value of the metadata attribute 'antibody_target' |
|
50 |
-#' ## the regions that appear in s1 but do not overlap any region in s2; |
|
51 |
-#' ## metadata of the result are the same as the metadata of s1. |
|
52 |
-#' |
|
53 |
-#' initGMQL("gtf") |
|
54 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
55 |
-#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "GMQL") |
|
56 |
-#' exp1 = readDataset(test_path) |
|
57 |
-#' exp2 = readDataset(test_path2) |
|
58 |
-#' out = difference(exp1,exp2, c("antibody_target")) |
|
59 |
-#' |
|
60 |
-#' } |
|
61 |
-#' |
|
62 |
-#' @export |
|
63 |
-#' |
|
64 |
-difference <- function(left_input_data, right_input_data, joinBy = NULL,is_exact = FALSE) |
|
65 |
-{ |
|
66 |
- if(!is.null(joinBy)) |
|
67 |
- join_condition_matrix <- .join_condition(joinBy) |
|
68 |
- else |
|
69 |
- join_condition_matrix <- scalaNull("Array[Array[String]]") |
|
70 |
- |
|
71 |
- response <- WrappeR$difference(join_condition_matrix,right_input_data$value,left_input_data$value,is_exact) |
|
72 |
- error <- strtoi(response[1]) |
|
73 |
- data <- response[2] |
|
74 |
- if(error!=0) |
|
75 |
- stop(data) |
|
76 |
- else |
|
77 |
- DAGgraph(data) |
|
78 |
-} |
|
79 |
- |
... | ... |
@@ -27,7 +27,7 @@ |
27 | 27 |
#' @examples |
28 | 28 |
#' |
29 | 29 |
#' initGMQL("gtf") |
30 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
30 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
31 | 31 |
#' r = readDataset(test_path) |
32 | 32 |
#' |
33 | 33 |
#' ## it counts the regions in each sample and stores their number as value of the new metadata |
... | ... |
@@ -36,7 +36,7 @@ |
36 | 36 |
#' \dontrun{ |
37 | 37 |
#' |
38 | 38 |
#' initGMQL("gtf") |
39 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
39 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
40 | 40 |
#' exp = readDataset(test_path) |
41 | 41 |
#' |
42 | 42 |
#' ## it copies all samples of exp dataset into res dataset, and then calculates |
... | ... |
@@ -56,8 +56,9 @@ extend <-function(input_data, metadata = NULL) |
56 | 56 |
if(!is.null(metadata)) |
57 | 57 |
metadata_matrix <- .aggregates(metadata,"META_OPERATOR") |
58 | 58 |
else |
59 |
- metadata_matrix <- scalaNull("Array[Array[String]]") |
|
60 |
- |
|
59 |
+ metadata_matrix <- .jnull("java/lang/String") |
|
60 |
+ |
|
61 |
+ WrappeR <- J("it/polimi/genomics/r/Wrapper") |
|
61 | 62 |
response <- WrappeR$extend(metadata_matrix,input_data$value) |
62 | 63 |
error <- strtoi(response[1]) |
63 | 64 |
data <- response[2] |
... | ... |
@@ -24,7 +24,7 @@ |
24 | 24 |
#' |
25 | 25 |
#' @examples |
26 | 26 |
#' |
27 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
27 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
28 | 28 |
#' TFARMatrix(test_path,regions = c("pvalue","peak")) |
29 | 29 |
#' |
30 | 30 |
#' @export |
... | ... |
@@ -98,7 +98,7 @@ TFARMatrix <- function(GMQL_dataset_path, metadata = NULL,metadata_prefix = NULL |
98 | 98 |
#' @examples |
99 | 99 |
#' |
100 | 100 |
#' |
101 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
101 |
+#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "RGMQL") |
|
102 | 102 |
#' grl <- importGMQL.gtf(test_path) |
103 | 103 |
#' TFARMemAtrix(grl,regions = c("pvalue","peak")) |
104 | 104 |
#' |
... | ... |
@@ -39,7 +39,7 @@ |
39 | 39 |
#' gr2 <- GRanges(seqnames = c("chr1", "chr1"), |
40 | 40 |
#' ranges = IRanges(c(7,13), width = 3), strand = c("+", "-"), score = 3:4, GC = c(0.3, 0.5)) |
41 | 41 |
#' grl = GRangesList(gr1,gr2) |
42 |
-#' test_out_path <- system.file("example",package = "GMQL") |
|
42 |
+#' test_out_path <- system.file("example",package = "RGMQL") |
|
43 | 43 |
#' exportGMQL.gdm(grl,test_out_path) |
44 | 44 |
#' |
45 | 45 |
#' |
... | ... |
@@ -92,7 +92,7 @@ exportGMQL.gdm <- function(samples, dir_out) |
92 | 92 |
#' gr2 <- GRanges(seqnames = c("chr1", "chr1"), |
93 | 93 |
#' ranges = IRanges(c(7,13), width = 3), strand = c("+", "-"), score = 3:4, GC = c(0.3, 0.5)) |
94 | 94 |
#' grl = GRangesList(gr1,gr2) |
95 |
-#' test_out_path <- system.file("example",package = "GMQL") |
|
95 |
+#' test_out_path <- system.file("example",package = "RGMQL") |
|
96 | 96 |
#' exportGMQL.gtf(grl,test_out_path) |
97 | 97 |
#' |
98 | 98 |
#' @export |
99 | 99 |
deleted file mode 100644 |
... | ... |
@@ -1,117 +0,0 @@ |
1 |
-#' GMQL Operation: JOIN |
|
2 |
-#' |
|
3 |
-#' It takes as input two datasets, respectively known as anchor (left) and experiment (right) and returns |
|
4 |
-#' a dataset of samples consisting of regions extracted from the operands according to the specified condition |
|
5 |
-#' (a.k.a genometric_predicate). |
|
6 |
-#' The number of generated output samples is the Cartesian product of the number of samples |
|
7 |
-#' in the anchor and in the experiment dataset (if joinBy is not specified). |
|
8 |
-#' The output metadata are the union of the input metadata, with their attribute names prefixed with |
|
9 |
-#' left or right respectively. |
|
10 |
-#' |
|
11 |
-#' |
|
12 |
-#' @param left_input_data returned object from any GMQL function |
|
13 |
-#' @param right_input_data returned object from any GMQL function |
|
14 |
-#' @param genometric_predicate is a list of lists of DISTAL object by means of logical ANDs |
|
15 |
-#' For details of DISTAL objects see: |
|
16 |
-#' \code{\link{DLE}}, \code{\link{DGE}}, \code{\link{MD}}, \code{\link{UP}}, \code{\link{DOWN}} |
|
17 |
-#' |
|
18 |
-#' @param joinBy list of CONDITION objects, or simple string concatenation |
|
19 |
-#' (i.e c("cell_type","attribute_tag","size")). |
|
20 |
-#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
21 |
-#' For details of CONDITION objects see: |
|
22 |
-#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
23 |
-#' |
|
24 |
-#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
25 |
-#' In case of single concatenation with no CONDITION, all metadata are considered as DEF |
|
26 |
-#' |
|
27 |
-#' @param region_output single string that declares which region is given in output for each input pair of left dataset |
|
28 |
-#' right dataset regions satisfying the genometric predicate: |
|
29 |
-#' \itemize{ |
|
30 |
-#' \item{left: outputs the anchor regions from left_input_data that satisfy the genometric predicate} |
|
31 |
-#' \item{right: outputs the experiment regions from right_input_data that satisfy the genometric predicate} |
|
32 |
-#' \item{int (intersection): outputs the overlapping part (intersection) of the left_input_data and right_input_data |
|
33 |
-#' regions that satisfy the genometric predicate; if the intersection is empty, no output is produced} |
|
34 |
-#' \item{contig: outputs the concatenation between the left_input_data and right_input_data regions that satisfy |
|
35 |
-#' the genometric predicate, (i.e. the output region is defined as having left (right) coordinates |
|
36 |
-#' equal to the minimum (maximum) of the corresponding coordinate values in the left_input_data and right_input_data |
|
37 |
-#' regions satisfying the genometric predicate)} |
|
38 |
-#' } |
|
39 |
-#' |
|
40 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
41 |
-#' as input for the subsequent GMQL function |
|
42 |
-#' |
|
43 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
44 |
-#' |
|
45 |
-#' |
|
46 |
-#' @examples |
|
47 |
-#' |
|
48 |
-#' ## Given a dataset 'hm' and one called 'tss' with a sample including Transcription Start Site annotations, |
|
49 |
-#' ## it searches for those regions of hm that are at a minimal distance from a transcription start site (TSS) |
|
50 |
-#' ## and takes the first/closest one for each TSS, |
|
51 |
-#' ## provided that such distance is lesser than 120K bases and joined 'tss' and 'hm' samples are obtained |
|
52 |
-#' ## from the same provider (joinby clause). |
|
53 |
-#' |
|
54 |
-#' initGMQL("gtf") |
|
55 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
56 |
-#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "GMQL") |
|
57 |
-#' TSS = readDataset(test_path) |
|
58 |
-#' HM = readDataset(test_path2) |
|
59 |
-#' join_data = join(TSS,HM,genometric_predicate=list(list(MD(1),DLE(120000))),c("provider"),region_output="RIGHT") |
|
60 |
-#' |
|
61 |
-#' @export |
|
62 |
-#' |
|
63 |
-join <- function(right_input_data, left_input_data, genometric_predicate = NULL, |
|
64 |
- joinBy = NULL, region_output="contig") |
|
65 |
-{ |
|
66 |
- |
|
67 |
- if(!is.null(genometric_predicate)) |
|
68 |
- { |
|
69 |
- if(!is.list(genometric_predicate)) |
|
70 |
- stop("genometric_predicate must be list of lists") |
|
71 |
- |
|
72 |
- if(!all(sapply(genometric_predicate, function(x) is.list(x) ))) |
|
73 |
- stop("genometric_predicate must be list of lists") |
|
74 |
- |
|
75 |
- lapply(genometric_predicate, function(list_pred) { |
|
76 |
- if(length(list_pred)>4) |
|
77 |
- { |
|
78 |
- warning("only 4 element per list, we cut the rest") |
|
79 |
- length(list_pred)=4 |
|
80 |
- } |
|
81 |
- |
|
82 |
- if(!all(sapply(list_pred, function(x) {is(x,"DISTAL")} ))) |
|
83 |
- stop("All elements should be DISTAL object") |
|
84 |
- |
|
85 |
- }) |
|
86 |
- |
|
87 |
- genomatrix <- t(sapply(genometric_predicate, function(list_pred) { |
|
88 |
- dist_array <- sapply(list_pred, function(x) { |
|
89 |
- new_value = as.character(x) |
|
90 |
- array <- c(new_value) |
|
91 |
- }) |
|
92 |
- dist_array = c(dist_array,c("NA","NA"),c("NA","NA"),c("NA","NA")) |
|
93 |
- length(dist_array) = 8 |
|
94 |
- dist_array |
|
95 |
- })) |
|
96 |
- } |
|
97 |
- else |
|
98 |
- genomatrix <- scalaNull("Array[Array[String]]") |
|
99 |
- |
|
100 |
- if(!is.null(joinBy)) |
|
101 |
- join_condition_matrix <- .join_condition(joinBy) |
|
102 |
- else |
|
103 |
- join_condition_matrix <- scalaNull("Array[Array[String]]") |
|
104 |
- |
|
105 |
- ouput <- toupper(region_output) |
|
106 |
- if(!identical(ouput,"CONTIG") && !identical(ouput,"LEFT") && !identical(ouput,"RIGHT") |
|
107 |
- && !identical(ouput,"INT")) |
|
108 |
- stop("region_output must be contig,left,right or int (intersection)") |
|
109 |
- |
|
110 |
- response <- WrappeR$join(genomatrix,join_condition_matrix, ouput,right_input_data$value, left_input_data$value) |
|
111 |
- error <- strtoi(response[1]) |
|
112 |
- data <- response[2] |
|
113 |
- if(error!=0) |
|
114 |
- stop(data) |
|
115 |
- else |
|
116 |
- DAGgraph(data) |
|
117 |
-} |
118 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,87 +0,0 @@ |
1 |
-#' GMQL Operation: MAP |
|
2 |
-#' |
|
3 |
-#' It computes, for each sample in the right dataset, aggregates over the values of the right regions |
|
4 |
-#' that intersect with a region in a left sample, for each region of each sample in the left dataset; |
|
5 |
-#' The number of generated output samples is the Cartesian product of the samples in the two input datasets; |
|
6 |
-#' each output sample has the same regions as the related input left sample, with their attributes and values, |
|
7 |
-#' plus the attributes computed as aggregates over right region values. |
|
8 |
-#' Output sample metadata are the union of the related input sample metadata, |
|
9 |
-#' whose attribute names are prefixed with "left" or "right" respectively. |
|
10 |
-#' |
|
11 |
-#' When the joinby clause is present, only pairs of samples of left_input_data and of right_input_data with |
|
12 |
-#' metadata M1 and M2 respectively that satisfy the joinby condition are considered. |
|
13 |
-#' |
|
14 |
-#' The clause consists of a list of metadata attribute names that must be present with equal values |
|
15 |
-#' in both M1 and M2 |
|
16 |
-#' |
|
17 |
-#' |
|
18 |
-#' @param left_input_data returned object from any GMQL function |
|
19 |
-#' @param right_input_data returned object from any GMQL function |
|
20 |
-#' @param aggregates list of element in the form \emph{key} = \emph{function_aggregate}. |
|
21 |
-#' The \emph{function_aggregate} is an object of class OPERATOR |
|
22 |
-#' The aggregate functions available are: \code{\link{MIN}}, \code{\link{MAX}}, |
|
23 |
-#' \code{\link{SUM}}, \code{\link{BAG}}, \code{\link{AVG}}, \code{\link{COUNT}}, |
|
24 |
-#' \code{\link{STD}}, \code{\link{MEDIAN}}, \code{\link{Q1}}, \code{\link{Q2}}, \code{\link{Q3}}. |
|
25 |
-#' Every operator accepts a string value, except for COUNT that cannot have a value. |
|
26 |
-#' Argument of 'function_aggregate' must exist in schema |
|
27 |
-#' Two style are allowed: |
|
28 |
-#' \itemize{ |
|
29 |
-#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
30 |
-#' \item list of values: e.g. SUM("pvalue") |
|
31 |
-#' } |
|
32 |
-#' "mixed style" is not allowed |
|
33 |
-#' |
|
34 |
-#' @param joinBy list of CONDITION objects, or simple string concatenation |
|
35 |
-#' (i.e c("cell_type","attribute_tag","size")). |
|
36 |
-#' Every object contains the name of metadata to be used in \emph{groupby}. |
|
37 |
-#' For details of CONDITION objects see: |
|
38 |
-#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
39 |
-#' |
|
40 |
-#' Every condition accepts only one string value. (e.g. DEF("cell_type") ) |
|
41 |
-#' In case of single concatenation with no CONDITION, all metadata are considered as DEF |
|
42 |
-#' |
|
43 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
44 |
-#' as input for the subsequent GMQL function |
|
45 |
-#' |
|
46 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
47 |
-#' |
|
48 |
-#' @examples |
|
49 |
-#' |
|
50 |
-#' ## it counts the number of regions in each sample from exp that overlap with a ref region, |
|
51 |
-#' ## and for each ref region it computes the minimum score of all the regions in each exp sample |
|
52 |
-#' ## that overlap with it. |
|
53 |
-#' ## The MAP joinby option ensures that only the exp samples referring to the same 'cell_tissue' |
|
54 |
-#' ## of a ref sample are mapped on such ref sample; |
|
55 |
-#' ## exp samples with no cell_tissue metadata attribute, or with such metadata |
|
56 |
-#' ## but with a different value from the one(s) of ref sample(s), are disregarded. |
|
57 |
-#' |
|
58 |
-#' initGMQL("gtf") |
|
59 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
60 |
-#' test_path2 <- system.file("example","DATA_SET_VAR_GDM",package = "GMQL") |
|
61 |
-#' exp = readDataset(test_path) |
|
62 |
-#' ref = readDataset(test_path2) |
|
63 |
-#' out = map(ref,exp, list(minScore = MIN("score")), joinBy = c("cell_tissue") ) |
|
64 |
-#' |
|
65 |
-#' |
|
66 |
-#' @export |
|
67 |
-#' |
|
68 |
-map <- function(left_input_data, right_input_data, aggregates = NULL, joinBy = NULL) |
|
69 |
-{ |
|
70 |
- if(!is.null(aggregates)) |
|
71 |
- metadata_matrix <- .aggregates(aggregates,"OPERATOR") |
|
72 |
- else |
|
73 |
- metadata_matrix = scalaNull("Array[Array[String]]") |
|
74 |
- |
|
75 |
- if(!is.null(joinBy)) |
|
76 |
- join_condition_matrix <- .join_condition(joinBy) |
|
77 |
- else |
|
78 |
- join_condition_matrix <- scalaNull("Array[Array[String]]") |
|
79 |
- |
|
80 |
- response<-WrappeR$map(join_condition_matrix,metadata_matrix,left_input_data$value,right_input_data$value) |
|
81 |
- error <- strtoi(response[1]) |
|
82 |
- data <- response[2] |
|
83 |
- if(error!=0) |
|
84 |
- stop(data) |
|
85 |
- else |
|
86 |
- DAGgraph(data) |
|
87 |
-} |
88 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,189 +0,0 @@ |
1 |
-#' GMQL Function: EXECUTE |
|
2 |
-#' |
|
3 |
-#' execute GMQL query. |
|
4 |
-#' The function works only after invoking at least one materialize |
|
5 |
-#' |
|
6 |
-#' @details |
|
7 |
-#' |
|
8 |
-#' After invoking execution function, all variables associated to DAG will be removed |
|
9 |
-#' from scala environment, although the associated R variable will remain stored in R environment |
|
10 |
-#' |
|
11 |
-#' @return None |
|
12 |
-#' |
|
13 |
-#' @examples |
|
14 |
-#' |
|
15 |
-#' initGMQL() |
|
16 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
17 |
-#' r = readDataset(test_path) |
|
18 |
-#' s = select(input_data = r) |
|
19 |
-#' m = merge(groupBy = c("antibody_targer","cell_karyotype"),input_data = s) |
|
20 |
-#' materialize(input_data = m, dir_out = test_path) |
|
21 |
-#' |
|
22 |
-#' \dontrun{ |
|
23 |
-#' execute() |
|
24 |
-#' } |
|
25 |
-#' @export |
|
26 |
-#' |
|
27 |
-execute <- function() |
|
28 |
-{ |
|
29 |
- remote_proc <- WrappeR$is_remote_processing() |
|
30 |
- if(!remote_proc) |
|
31 |
- .download_or_upload() |
|
32 |
- |
|
33 |
- response <- WrappeR$execute() |
|
34 |
- error <- strtoi(response[1]) |
|
35 |
- data <- response[2] |
|
36 |
- if(error!=0) |
|
37 |
- stop(data) |
|
38 |
- else |
|
39 |
- { |
|
40 |
- if(remote_proc) |
|
41 |
- { |
|
42 |
- url <- WrappeR$get_url() |
|
43 |
- .download_or_upload() |
|
44 |
- serializeQuery(url,FALSE,data) |
|
45 |
- } |
|
46 |
- } |
|
47 |
-} |
|
48 |
- |
|
49 |
-.download_or_upload <- function() |
|
50 |
-{ |
|
51 |
- data <- WrappeR$get_dataset_list() |
|
52 |
- data_list <- apply(data, 1, as.list) |
|
53 |
- url <- WrappeR$get_url() |
|
54 |
- remote <- WrappeR$is_remote_processing() |
|
55 |
- if(remote) |
|
56 |
- { |
|
57 |
- sapply(data_list,function(x){ |
|
58 |
- uploadSamples(url,x[[2]],x[[1]],x[[3]],FALSE) |
|
59 |
- }) |
|
60 |
- } |
|
61 |
- else |
|
62 |
- { |
|
63 |
- sapply(data_list,function(x){ |
|
64 |
- downloadDataset(url,x[[2]],x[[1]]) |
|
65 |
- }) |
|
66 |
- } |
|
67 |
-} |
|
68 |
- |
|
69 |
-#' GMQL Operation: MATERIALIZE |
|
70 |
-#' |
|
71 |
-#' It saves the contents of a dataset that contains samples metadata and samples regions. |
|
72 |
-#' It is normally used to persist the contents of any dataset generated during a GMQL query. |
|
73 |
-#' Any dataset can be materialized, but the operation can be very time-consuming. |
|
74 |
-#' For best performance, materialize the relevant data only. |
|
75 |
-#' |
|
76 |
-#' |
|
77 |
-#' @param input_data returned object from any GMQL function |
|
78 |
-#' @param dir_out destination folder path. |
|
79 |
-#' by default is current working directory of the R process |
|
80 |
-#' |
|
81 |
-#' @return None |
|
82 |
-#' |
|
83 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
84 |
-#' |
|
85 |
-#' @examples |
|
86 |
-#' |
|
87 |
-#' initGMQL("gtf") |
|
88 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
89 |
-#' r = readDataset(test_path) |
|
90 |
-#' s = select(input_data = r) |
|
91 |
-#' m = merge(groupBy = c("antibody_targer","cell_karyotype"),input_data = s) |
|
92 |
-#' materialize(input_data = m, dir_out = test_path) |
|
93 |
-#' |
|
94 |
-#' @export |
|
95 |
-#' |
|
96 |
-materialize <- function(input_data, dir_out = getwd()) |
|
97 |
-{ |
|
98 |
- response <- WrappeR$materialize(input_data$value,dir_out) |
|
99 |
- error <- strtoi(response[1]) |
|
100 |
- data <- response[2] |
|
101 |
- if(error!=0) |
|
102 |
- stop(data) |
|
103 |
- else |
|
104 |
- invisible(NULL) |
|
105 |
-} |
|
106 |
- |
|
107 |
- |
|
108 |
-#' GMQL Operation: TAKE |
|
109 |
-#' |
|
110 |
-#' It saves the contents of a dataset that contains samples metadata and samples regions. |
|
111 |
-#' It is normally used to store in memory the contents of any dataset generated during a GMQL query. |
|
112 |
-#' the operation can be very time-consuming. |
|
113 |
-#' If you have invoked any materialization before take function, all those dataset will be materialized |
|
114 |
-#' as folder (like if execution was invoked) |
|
115 |
-#' |
|
116 |
-#' @import GenomicRanges |
|
117 |
-#' @importFrom stats setNames |
|
118 |
-#' |
|
119 |
-#' @param input_data returned object from any GMQL function |
|
120 |
-#' @param rows number of rows for each sample regions that you want to retrieve and stored in memory |
|
121 |
-#' by default is 0 that means take all rows for each sample |
|
122 |
-#' |
|
123 |
-#' @return GrangesList with associated metadata |
|
124 |
-#' |
|
125 |
-#' @examples |
|
126 |
-#' |
|
127 |
-#' initGMQL("gtf") |
|
128 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
129 |
-#' r = readDataset(test_path) |
|
130 |
-#' s = select(input_data = r) |
|
131 |
-#' m = merge(groupBy = c("antibody_targer","cell_karyotype"),input_data = s) |
|
132 |
-#' g <- take(input_data = m, rows = 45) |
|
133 |
-#' |
|
134 |
-#' @export |
|
135 |
-#' |
|
136 |
-take <- function(input_data, rows=0L) |
|
137 |
-{ |
|
138 |
- rows <- as.integer(rows[1]) |
|
139 |
- if(rows<0) |
|
140 |
- stop("rows cannot be negative") |
|
141 |
- |
|
142 |
- response <- WrappeR$take(input_data$value,rows) |
|
143 |
- error <- strtoi(response[1]) |
|
144 |
- data <- response[2] |
|
145 |
- if(error!=0) |
|
146 |
- stop(data) |
|
147 |
- |
|
148 |
- reg <- WrappeR$get_reg() |
|
149 |
- if(is.null(reg)) |
|
150 |
- stop("no regions defined") |
|
151 |
- meta <- WrappeR$get_meta() |
|
152 |
- if(is.null(meta)) |
|
153 |
- stop("no metadata defined") |
|
154 |
- schema <- WrappeR$get_schema() |
|
155 |
- if(is.null(schema)) |
|
156 |
- stop("no schema defined") |
|
157 |
- |
|
158 |
- reg_data_frame <- as.data.frame(reg) |
|
159 |
- list <- split(reg_data_frame, reg_data_frame[1]) |
|
160 |
- names <- c("seqname","start","end","strand",schema) |
|
161 |
- |
|
162 |
- sampleList <- lapply(list, function(x){ |
|
163 |
- x <- x[-1] |
|
164 |
- names(x) <- names |
|
165 |
- g <- GenomicRanges::makeGRangesFromDataFrame(x,keep.extra.columns = TRUE, |
|
166 |
- start.field = "start",end.field = "end") |
|
167 |
- }) |
|
168 |
- gRange_list <- GRangesList(sampleList) |
|
169 |
- |
|
170 |
- meta_list <- .metadata_from_frame_to_list(meta) |
|
171 |
- |
|
172 |
- S4Vectors::metadata(gRange_list) <- meta_list |
|
173 |
- return(gRange_list) |
|
174 |
-} |
|
175 |
- |
|
176 |
-.metadata_from_frame_to_list <- function(metadata_frame) |
|
177 |
-{ |
|
178 |
- meta_frame <- as.data.frame(metadata_frame) |
|
179 |
- list <- split(meta_frame, meta_frame[1]) |
|
180 |
- name_value_list <- lapply(list, function(x){ |
|
181 |
- x <- x[-1] |
|
182 |
- }) |
|
183 |
- meta_list <- lapply(name_value_list, function(x){ |
|
184 |
- stats::setNames(as.list(as.character(x[[2]])), x[[1]]) |
|
185 |
- }) |
|
186 |
-} |
|
187 |
- |
|
188 |
- |
|
189 |
- |
190 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,56 +0,0 @@ |
1 |
-#' GMQL Operation: MERGE |
|
2 |
-#' |
|
3 |
-#' It builds a dataset consisting of a single sample having as many regions |
|
4 |
-#' as the number of regions of the input data and as many metadata as the union of |
|
5 |
-#' the 'attribute-value' tuples of the input samples. |
|
6 |
-#' A groupby clause can be specified on metadata: the samples are then partitioned in groups, |
|
7 |
-#' each with a distinct value of the grouping metadata attributes. |
|
8 |
-#' The operation is separately applied to each group, yielding one sample in the result for each group. |
|
9 |
-#' Samples whose names are not present in the grouping metadata parameter are disregarded. |
|
10 |
-#' |
|
11 |
-#' |
|
12 |
-#' @param input_data returned object from any GMQL function |
|
13 |
-#' @param groupBy list of CONDITION objects, or simple string concatenation |
|
14 |
-#' (i.e c("cell_type","attribute_tag","size")). |
|
15 |
-#' Every object contains the name of metadata to be used in \emph{groupBy}. |
|
16 |
-#' For details of CONDITION objects see: |
|
17 |
-#' \code{\link{DEF}}, \code{\link{FULL}}, \code{\link{EXACT}} |
|
18 |
-#' |
|
19 |
-#' Every condition accepts only one string value (e.g. DEF("cell_type") ) |
|
20 |
-#' In case of single concatenation with no CONDITION, all metadata are considered as DEF |
|
21 |
-#' |
|
22 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
23 |
-#' as input for the subsequent GMQL function |
|
24 |
-#' |
|
25 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
26 |
-#' |
|
27 |
-#' @examples |
|
28 |
-#' |
|
29 |
-#' ## it creates a dataset called merged which contains one sample for each antibody_target value |
|
30 |
-#' ## found within the metadata of the exp dataset sample; |
|
31 |
-#' ## each created sample contains all regions from all 'exp' samples with a specific value for their |
|
32 |
-#' ## antibody_target metadata attribute. |
|
33 |
-#' |
|
34 |
-#' initGMQL("gtf") |
|
35 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
36 |
-#' exp = readDataset(test_path) |
|
37 |
-#' merged = merge(input_data = exp, groupBy = c("antibody_target")) |
|
38 |
-#' |
|
39 |
-#' @export |
|
40 |
-#' |
|
41 |
-merge <- function(input_data, groupBy = NULL) |
|
42 |
-{ |
|
43 |
- if(!is.null(groupBy)) |
|
44 |
- join_condition_matrix <- .join_condition(groupBy) |
|
45 |
- else |
|
46 |
- join_condition_matrix <- scalaNull("Array[Array[String]]") |
|
47 |
- |
|
48 |
- response <- WrappeR$merge(join_condition_matrix,input_data$value) |
|
49 |
- error <- strtoi(response[1]) |
|
50 |
- data <- response[2] |
|
51 |
- if(error!=0) |
|
52 |
- stop(data) |
|
53 |
- else |
|
54 |
- DAGgraph(data) |
|
55 |
-} |
|
56 |
- |
57 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,162 +0,0 @@ |
1 |
-#' GMQL operation: ORDER |
|
2 |
-#' |
|
3 |
-#' It is used to order either samples or sample regions or both, |
|
4 |
-#' according to a set of metadata and/or region attributes, and/or region coordinates. |
|
5 |
-#' Order can be specified as ascending / descending for every attribute |
|
6 |
-#' The number of samples and their regions remain the same (unless mtop/rtop parameters specified) |
|
7 |
-#' but a new ordering metadata and/or region attribute is added. |
|
8 |
-#' Sorted samples or regions have a new attribute "order", added to either metadata, or regions, |
|
9 |
-#' or both of them as specified in input |
|
10 |
-#' The input mtop = k and rtop = m extracts the first k samples and m regions respectively, |
|
11 |
-#' the clause mtopg = k and rtopg = m performs grouping operation, |
|
12 |
-#' grouping by identical values of ordering attributes |
|
13 |
-#' and then selects the first k samples or regions of each group |
|
14 |
-#' |
|
15 |
-#' |
|
16 |
-#' @param input_data "url-like" string taken from GMQL function |
|
17 |
-#' @param metadata_ordering list of ORDER objects where every object contains the name of metadata |
|
18 |
-#' The ORDER's available are: \code{\link{ASC}}, \code{\link{DESC}} |
|
19 |
-#' Every condition accepts only one string value. (e.g. ASC("cell_type") ) |
|
20 |
-#' @param mtop integer value specifying the first k samples. |
|
21 |
-#' default is 0 that means every sample must be considered |
|
22 |
-#' @param mtopg integer value specifying the first j samples in each group. |
|
23 |
-#' default is 0 that means every sample must be considered |
|
24 |
-#' @param mtopp integer value specifying the first j samples in each group. |
|
25 |
-#' default is 0 that means every sample must be considered |
|
26 |
-#' @param regions_ordering list of ORDER objects where every object contains the name of region schema value |
|
27 |
-#' The ORDER's available are: ASC, DESC. |
|
28 |
-#' Every condition accepts only one string value. (e.g. DESC("pvalue") ) |
|
29 |
-#' @param rtop integer value specifying the first m samples in each group. |
|
30 |
-#' default is 0 that means every sample must be considered |
|
31 |
-#' @param rtopg integer value specifying the first i samples in each group. |
|
32 |
-#' default is 0 that means every sample must be considered |
|
33 |
-#' @param rtopp integer value specifying the first i samples in each group. |
|
34 |
-#' default is 0 that means every sample must be considered |
|
35 |
-#' |
|
36 |
-#' |
|
37 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
38 |
-#' as input for the subsequent GMQL function |
|
39 |
-#' |
|
40 |
-#' @details |
|
41 |
-#' mtop, mtopg,mtopp, rtop, rtopg and rtopp are normally numbers: if you specify a vector, |
|
42 |
-#' only the first element will be used |
|
43 |
-#' mtop, mtopg and mtopp are mutually exclusive, and so are rtop, rtopg and rtopp |
|
44 |
-#' |
|
45 |
-#' |
|
46 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
47 |
-#' |
|
48 |
-#' @examples |
|
49 |
-#' |
|
50 |
-#' ## it orders the samples according to the Region_count metadata attribute and takes the two samples |
|
51 |
-#' ## that have the highest count. |
|
52 |
-#' |
|
53 |
-#' initGMQL("gtf") |
|
54 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
55 |
-#' r = readDataset(test_path) |
|
56 |
-#' o = order(r,list(DESC("Region_Count")), mtop = 2) |
|
57 |
-#' |
|
58 |
-#' @export |
|
59 |
-#' |
|
60 |
-order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0,mtopp = 0, |
|
61 |
- regions_ordering = NULL,rtop = 0,rtopg = 0,rtopp = 0) |
|
62 |
-{ |
|
63 |
- if(!is.numeric(mtop) || !is.numeric(mtopg) || !is.numeric(rtop) || !is.numeric(rtopg) |
|
64 |
- || !is.numeric(mtopp)|| !is.numeric(rtopp)) |
|
65 |
- stop("mtop, rtop, rtopg and mtopg must be integer") |
|
66 |
- |
|
67 |
- if(length(mtop)>0 || length(mtopg)>0 || length(rtop)>0 || length(rtopg)>0 |
|
68 |
- || length(mtopp)>0 || length(rtopp)>0) |
|
69 |
- warning("only the first element is taken by rtop, mtop, mtopg, rtopg, rtopp, mtopp") |
|
70 |
- |
|
71 |
- # we consider only the first element even if input is a vector of Int |
|
72 |
- # we cut the other arguments |
|
73 |
- |
|
74 |
- mtop = as.integer(mtop[1]) |
|
75 |
- mtopg = as.integer(mtopg[1]) |
|
76 |
- mtopp = as.integer(mtopp[1]) |
|
77 |
- |
|
78 |
- rtop = as.integer(rtop[1]) |
|
79 |
- rtopg = as.integer(rtopg[1]) |
|
80 |
- rtopp = as.integer(rtopp[1]) |
|
81 |
- |
|
82 |
- if(mtop > 0 && mtopg >0) |
|
83 |
- { |
|
84 |
- warning("cannot be used together.\nWe set mtopg = 0") |
|
85 |
- mtopg = 0L |
|
86 |
- } |
|
87 |
- |
|
88 |
- if(mtop >0 && mtopp>0) |
|
89 |
- { |
|
90 |
- warning("cannot be used together.\nWe set mtopp = 0") |
|
91 |
- mtopp = 0L |
|
92 |
- } |
|
93 |
- |
|
94 |
- if(mtopg >0 && mtopp>0) |
|
95 |
- { |
|
96 |
- warning("cannot be used together.\nWe set mtopp = 0") |
|
97 |
- mtopp = 0L |
|
98 |
- } |
|
99 |
- |
|
100 |
- if(rtop > 0 && rtopg >0) |
|
101 |
- { |
|
102 |
- warning("cannot be used together.\nWe set rtopg = 0") |
|
103 |
- rtopg = 0L |
|
104 |
- } |
|
105 |
- |
|
106 |
- if(rtop >0 && rtopp>0) |
|
107 |
- { |
|
108 |
- warning("cannot be used together.\nWe set rtopp = 0") |
|
109 |
- rtopp = 0L |
|
110 |
- } |
|
111 |
- |
|
112 |
- if(rtopg >0 && rtopp>0) |
|
113 |
- { |
|
114 |
- warning("cannot be used together.\nWe set rtopp = 0") |
|
115 |
- rtopp = 0L |
|
116 |
- } |
|
117 |
- |
|
118 |
- if(!is.null(metadata_ordering)) |
|
119 |
- meta_matrix <- .ordering_meta(metadata_ordering) |
|
120 |
- else |
|
121 |
- meta_matrix <- scalaNull("Array[Array[String]]") |
|
122 |
- |
|
123 |
- if(!is.null(regions_ordering)) |
|
124 |
- region_matrix <- .ordering_meta(regions_ordering) |
|
125 |
- else |
|
126 |
- region_matrix <- scalaNull("Array[Array[String]]") |
|
127 |
- |
|
128 |
- response <- WrappeR$order(meta_matrix,mtopg,mtop,mtopp,region_matrix,rtopg,rtop,rtopp,input_data$value) |
|
129 |
- error <- strtoi(response[1]) |
|
130 |
- data <- response[2] |
|
131 |
- if(error!=0) |
|
132 |
- stop(data) |
|
133 |
- else |
|
134 |
- DAGgraph(data) |
|
135 |
-} |
|
136 |
- |
|
137 |
- |
|
138 |
-.ordering_meta <- function(ordering) |
|
139 |
-{ |
|
140 |
- if(is.list(ordering)) |
|
141 |
- { |
|
142 |
- order_matrix <- t(sapply(ordering,function(x){ |
|
143 |
- new_value <- as.character(x) |
|
144 |
- if(length(new_value)==1) |
|
145 |
- new_value = c("ASC",new_value) |
|
146 |
- else if(!identical("ASC",new_value[1]) && !identical("DESC",new_value[1])) |
|
147 |
- stop("no more than one value") |
|
148 |
- matrix <- matrix(new_value) |
|
149 |
- })) |
|
150 |
- } |
|
151 |
- else if(is.character(ordering)) |
|
152 |
- { |
|
153 |
- order_matrix <- t(sapply(ordering, function(x) { |
|
154 |
- new_value = c("ASC",x) |
|
155 |
- matrix <- matrix(new_value) |
|
156 |
- })) |
|
157 |
- } |
|
158 |
- else |
|
159 |
- stop("only list or character") |
|
160 |
-} |
|
161 |
- |
|
162 |
- |
163 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,128 +0,0 @@ |
1 |
-#' GMQL Operation: PROJECT |
|
2 |
-#' |
|
3 |
-#' It creates, from an existing dataset, a new dataset with all the samples from input dataset |
|
4 |
-#' but keeping for each sample in the input dataset only those metadata and/or region attributes |
|
5 |
-#' expressed in the operator parameter list. |
|
6 |
-#' Region coordinates and values of the remaining metadata remain equal to those in the input dataset. |
|
7 |
-#' It allows to: |
|
8 |
-#' \itemize{ |
|
9 |
-#' \item{Remove existing metadata and/or region attributes from a dataset} |
|
10 |
-#' \item{Create new metadata and/or region attributes in the result} |
|
11 |
-#' } |
|
12 |
-#' |
|
13 |
-#' @param input_data string pointer taken from GMQL function |
|
14 |
-#' @param metadata vector of string made up by metadata attribute |
|
15 |
-#' @param regions vector of string made up by schema field attribute |
|
16 |
-#' @param all_but_reg logical value indicating which schema field attributes you want to exclude. |
|
17 |
-#' If FALSE only the regions you choose is kept in the output of the project operation, |
|
18 |
-#' if TRUE all the schema region attributes are kept except the ones included in the regions parameter. |
|
19 |
-#' If regions is not defined, \emph{all_but_reg} is not considered. |
|
20 |
-#' @param all_but_meta logical value indicating which metadata you want to exclude. |
|
21 |
-#' If FALSE only the metadata you choose is kept in the output of the project operation, |
|
22 |
-#' if TRUE all the metadata are kept except the ones included in the metadata parameter. |
|
23 |
-#' If metadata is not defined, \emph{all_but_meta} is not considered. |
|
24 |
-#' @param regions_update single string predicate made up by operation on schema field attribute |
|
25 |
-#' @param metadata_update single string predicate made up by operation on metadata attribute |
|
26 |
-#' |
|
27 |
-#' @return DAGgraph class object. It contains the value associated to the graph used |
|
28 |
-#' as input for the subsequent GMQL function |
|
29 |
-#' |
|
30 |
-#' @references \url{http://www.bioinformatics.deib.polimi.it/genomic_computing/GMQL/doc/GMQLUserTutorial.pdf} |
|
31 |
-#' |
|
32 |
-#' |
|
33 |
-#' @examples |
|
34 |
-#' |
|
35 |
-#' ## it creates a new dataset called CTCF_NORM_SCORE by preserving all region attributes apart from score, |
|
36 |
-#' ## and creating a new region attribute called new_score by dividing the existing score value |
|
37 |
-#' ## of each region by 1000.0 and incrementing it by 100. |
|
38 |
-#' ## It also generates, for each sample of the new dataset, |
|
39 |
-#' ## a new metadata attribute called normalized with value 1, which can be used in future selections. |
|
40 |
-#' |
|
41 |
-#' initGMQL("gtf") |
|
42 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
43 |
-#' input = readDataset(test_path) |
|
44 |
-#' CTCF_NORM_SCORE = project(input,metadata_update="normalized AS 1", regions_update="new_score AS (score / 1000.0) + 100" , regions=c("score"), all_but_reg=TRUE) |
|
45 |
-#' |
|
46 |
-#' |
|
47 |
-#' \dontrun{ |
|
48 |
-#' |
|
49 |
-#' ## it produces an output dataset that contains the same samples as the input dataset. |
|
50 |
-#' ## Each output sample only contains, as region attributes, |
|
51 |
-#' ## the four basic coordinates (chr, left, right, strand) and the specified region attributes |
|
52 |
-#' ## 'variant_classification' and 'variant_type', and as metadata attributes only the specified ones, |
|
53 |
-#' ## i.e. manually_curated__tissue_status and manually_curated__tumor_tag. |
|
54 |
-#' |
|
55 |
-#' initGMQL("gtf") |
|
56 |
-#' test_path <- system.file("example","DATA_SET_VAR_GTF",package = "GMQL") |
|
57 |
-#' DS_in = readDataset(test_path) |
|
58 |
-#' DS_out = project(DS_in,regions=c("variant_classification", "variant_type"), |
|
59 |
-#' metadata=c("manually_curated__tissue_status","manually_curated__tumor_tag")) |
|
60 |
-#' |
|
61 |
-#' } |
|
62 |
-#' |
|
63 |
-#' @export |
|
64 |
-#' |
|
65 |
-#' |
|
66 |
-project <-function(input_data, metadata = NULL,metadata_update=NULL,all_but_meta = FALSE, |
|
67 |
- regions = NULL, regions_update = NULL,all_but_reg=FALSE) |
|
68 |
-{ |
|
69 |
- if(!is.null(metadata)) |
|
70 |
- { |
|
71 |
- if(!is.character(metadata)) |
|
72 |
- stop("metadata: no valid input") |
|
73 |
- |
|
74 |
- metadata <- metadata[!metadata %in% ""] |
|
75 |
- metadata <- metadata[!duplicated(metadata)] |
|
76 |
- |
|
77 |
- if(length(metadata)==0) |
|
78 |
- metadata <- scalaNull("Array[String]") |
|
79 |
- |
|
80 |
- metadata <- (I(as.character(metadata))) |
|
81 |
- } |
|
82 |
- else |
|
83 |
- metadata <- scalaNull("Array[String]") |
|
84 |
- |
|
85 |
- if(!is.null(regions)) |
|
86 |
- { |
|
87 |
- if(!is.character(regions)) |
|
88 |
- stop("regions: no valid input") |
|
89 |
- |
|
90 |
- regions = regions[!regions %in% ""] |
|
91 |
- regions = regions[!duplicated(regions)] |
|
92 |
- |
|
93 |
- if(length(regions)==0) |
|
94 |
- regions <- scalaNull("Array[String]") |
|
95 |
- |
|
96 |
- regions <- (I(as.character(regions))) |
|
97 |
- |
|
98 |
- } |
|
99 |
- else |
|
100 |
- regions <- scalaNull("Array[String]") |
|
101 |
- |
|
102 |
- if(!is.null(regions_update)) |
|
103 |
- .check_predicate(regions_update) |
|
104 |
- else |
|
105 |
- regions_update <- scalaNull("String") |
|
106 |
- |
|
107 |
- if(!is.null(metadata_update)) |
|
108 |
- .check_predicate(metadata_update) |
|
109 |
- else |
|
110 |
- metadata_update <- scalaNull("String") |
|
111 |
- |
|
112 |
- if(length(all_but_meta)>1) |
|
113 |
- warning("all_but_meta: no multiple values") |
|
114 |
- |
|
115 |
- if(length(all_but_reg)>1) |
|
116 |
- warning("all_but_reg: no multiple values") |
|
117 |
- all_but_reg <- all_but_reg[1] |
|
118 |
- all_but_meta <- all_but_meta[1] |
|
119 |
- |
|
120 |
- response <- WrappeR$project(metadata,metadata_update,all_but_meta, |
|
121 |
- regions,regions_update,all_but_reg,input_data$value) |
|
122 |
- error <- strtoi(response[1]) |
|
123 |
- data <- response[2] |
|