... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
Package: RGMQL |
2 | 2 |
Type: Package |
3 | 3 |
Title: GenoMetric Query Language for R/Bioconductor |
4 |
-Version: 0.99.41 |
|
4 |
+Version: 0.99.42 |
|
5 | 5 |
Author: Simone Pallotta, Marco Masseroli |
6 | 6 |
Maintainer: Simone Pallotta <simonepallotta@hotmail.com> |
7 | 7 |
Description: This package brings the GenoMetric Query Language (GMQL) |
... | ... |
@@ -11,7 +11,7 @@ Description: This package brings the GenoMetric Query Language (GMQL) |
11 | 11 |
GMQL adopts algorithms efficiently designed for big data using cloud-computing |
12 | 12 |
technologies (like Apache Hadoop and Spark) allowing GMQL to run on modern |
13 | 13 |
infrastructures, in order to achieve scalability and high performance. |
14 |
- It allows to to create, manipulate and extract genomic data from different |
|
14 |
+ It allows to create, manipulate and extract genomic data from different |
|
15 | 15 |
data sources both locally and remotely. Our RGMQL functions allow complex |
16 | 16 |
queries and processing leveraging on the R idiomatic paradigm. |
17 | 17 |
The RGMQL package also provides a rich set of ancillary classes that allow |
... | ... |
@@ -20,7 +20,7 @@ |
20 | 20 |
#' @examples |
21 | 21 |
#' |
22 | 22 |
#' ## This statement defines the path to the subdirectory "example" of the |
23 |
-#' ## package "RGMQL" and import as GRangesList the GMQL dataset |
|
23 |
+#' ## package "RGMQL" and imports as GRangesList the contained GMQL dataset |
|
24 | 24 |
#' |
25 | 25 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
26 | 26 |
#' grl = import_gmql(test_path, TRUE) |
... | ... |
@@ -25,24 +25,24 @@ |
25 | 25 |
#' The GMQL dataset is made up by two different file types: |
26 | 26 |
#' |
27 | 27 |
#' \itemize{ |
28 |
-#' \item{metadata files: they contain metadata associated to corrisponding |
|
28 |
+#' \item{metadata files: they contain metadata associated with corrisponding |
|
29 | 29 |
#' sample.} |
30 |
-#' \item{region files: they contain many genomic regions data.} |
|
31 |
-#' \item{region schema file: XML file that contains region attribute name |
|
30 |
+#' \item{region files: they contain genomic regions data.} |
|
31 |
+#' \item{region schema file: XML file that contains region attribute names |
|
32 | 32 |
#' (e.g. chr, start, end, pvalue)} |
33 | 33 |
#' } |
34 |
-#' Region sample files and metadata files are associated through file name: |
|
34 |
+#' Sample region files and metadata files are associated through file name: |
|
35 | 35 |
#' for example S_0001.gdm for region file and S_0001.gdm.meta for |
36 | 36 |
#' its metadata file |
37 | 37 |
#' |
38 | 38 |
#' |
39 | 39 |
#' @examples |
40 | 40 |
#' |
41 |
-#' ## load and attach add-on GenomicRanges package |
|
41 |
+#' ## Load and attach add-on GenomicRanges package |
|
42 | 42 |
#' library(GenomicRanges) |
43 | 43 |
#' |
44 |
-#' ## These statemens create two GRanges with the region attribute: seqnames, |
|
45 |
-#' ## ranges (region coordinates) and strand, plus two column element: |
|
44 |
+#' ## These statemens create two GRanges with the region attributes: seqnames, |
|
45 |
+#' ## ranges (region coordinates) and strand, plus two column elements: |
|
46 | 46 |
#' ## score and GC |
47 | 47 |
#' |
48 | 48 |
#' gr1 <- GRanges(seqnames = "chr2", ranges = IRanges(3, 6), strand = "+", |
... | ... |
@@ -56,11 +56,11 @@ |
56 | 56 |
#' grl = GRangesList(gr1, gr2) |
57 | 57 |
#' |
58 | 58 |
#' ## This statement defines the path to the subdirectory "example" of the |
59 |
-#' ## package "RGMQL" and export the GRangesList as GMQL dataset using the |
|
59 |
+#' ## package "RGMQL" and exports the GRangesList as GMQL datasets using the |
|
60 | 60 |
#' ## last name of 'dir_out' path as dataset name |
61 | 61 |
#' |
62 | 62 |
#' test_out_path <- system.file("example", package = "RGMQL") |
63 |
-#' export_gmql(grl, test_out_path,TRUE) |
|
63 |
+#' export_gmql(grl, test_out_path, TRUE) |
|
64 | 64 |
#' |
65 | 65 |
#' |
66 | 66 |
#' @export |
... | ... |
@@ -73,7 +73,8 @@ take_value.META_AGGREGATES <- function(obj){ |
73 | 73 |
#' \item{COUNT: It prepares input parameter to be passed to the library |
74 | 74 |
#' function count, performing all the type conversions needed } |
75 | 75 |
#' \item{COUNTSAMP: It prepares input parameter to be passed to the library |
76 |
-#' function third quartile, performing all the type conversions needed } |
|
76 |
+#' function countsamp, performing all the type conversions needed. |
|
77 |
+#' It is used only with group_by functions} |
|
77 | 78 |
#' \item{MIN: It prepares input parameter to be passed to the library |
78 | 79 |
#' function minimum, performing all the type conversions needed } |
79 | 80 |
#' \item{MAX: It prepares input parameter to be passed to the library |
... | ... |
@@ -88,7 +89,7 @@ take_value.META_AGGREGATES <- function(obj){ |
88 | 89 |
#' function bag; this function creates comma-separated strings of |
89 | 90 |
#' attribute values, performing all the type conversions needed} |
90 | 91 |
#' \item{BAGD: It prepares input parameter to be passed to the library |
91 |
-#' function bag; this function creates comma-separated strings of distinct |
|
92 |
+#' function bagd; this function creates comma-separated strings of distinct |
|
92 | 93 |
#' attribute values, performing all the type conversions needed} |
93 | 94 |
#' \item{Q1: It prepares input parameter to be passed to the library |
94 | 95 |
#' function fist quartile, performing all the type conversions needed} |
... | ... |
@@ -112,22 +113,22 @@ take_value.META_AGGREGATES <- function(obj){ |
112 | 113 |
#' |
113 | 114 |
#' init_gmql() |
114 | 115 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
115 |
-#' exp = read_GMQL(test_path) |
|
116 |
+#' exp = read_gmql(test_path) |
|
116 | 117 |
#' |
117 | 118 |
#' ## This statement copies all samples of exp dataset into res dataset, and |
118 |
-#' ## then calculates new metadata attribute for each of them: |
|
119 |
-#' ## sum_score is the sum of score of the sample regions. |
|
119 |
+#' ## then calculates new metadata attribute 'sum_score' for each of them: |
|
120 |
+#' ## sum_score is the sum of score values of the sample regions. |
|
120 | 121 |
#' |
121 | 122 |
#' res = extend(exp, sum_score = SUM("score")) |
122 | 123 |
#' |
123 | 124 |
#' ## This statement copies all samples of exp dataset into res dataset, |
124 |
-#' ## and then calculates new metadata attribute for each of them: |
|
125 |
+#' ## and then calculates new metadata attribute 'min_pvalue' for each of them: |
|
125 | 126 |
#' ## min_pvalue is the minimum pvalue of the sample regions. |
126 | 127 |
#' |
127 | 128 |
#' res = extend(exp, min_pvalue = MIN("pvalue")) |
128 | 129 |
#' |
129 | 130 |
#' ## This statement copies all samples of exp dataset into res dataset, |
130 |
-#' ## and then calculates new metadata attribute for each of them: |
|
131 |
+#' ## and then calculates new metadata attribute 'max_score' for each of them: |
|
131 | 132 |
#' ## max_score is the maximum score of the sample regions. |
132 | 133 |
#' |
133 | 134 |
#' res = extend(exp, max_score = MAX("score")) |
... | ... |
@@ -135,30 +136,31 @@ take_value.META_AGGREGATES <- function(obj){ |
135 | 136 |
#' ## The following cover operation produces output regions where at least 2 |
136 | 137 |
#' ## and at most 3 regions of exp dataset overlap, having as resulting region |
137 | 138 |
#' ## attribute the average signal of the overlapping regions; |
138 |
-#' ## the result has one sample for each input cell. |
|
139 |
+#' ## the result has one sample for each input cell value. |
|
139 | 140 |
#' |
140 | 141 |
#' res = cover(exp, 2, 3, groupBy = conds("cell"), avg_signal = AVG("signal")) |
141 | 142 |
#' |
142 |
-#' ## This statement copies all samples of DATA dataset into OUT dataset, |
|
143 |
-#' ## and then for each of them it adds another metadata attribute, allScores, |
|
143 |
+#' ## This statement copies all samples of 'exp' dataset into 'out' dataset, |
|
144 |
+#' ## and then for each of them it adds another metadata attribute, allScore, |
|
144 | 145 |
#' ## which is the aggregation comma-separated list of all the values |
145 | 146 |
#' ## that the region attribute score takes in the sample. |
146 | 147 |
#' |
147 | 148 |
#' out = extend(exp, allScore = BAG("score")) |
148 | 149 |
#' |
149 | 150 |
#' ## This statement counts the regions in each sample and stores their number |
150 |
-#' ## as value of the new metadata RegionCount attribute of the sample. |
|
151 |
+#' ## as value of the new metadata 'RegionCount' attribute of the sample. |
|
151 | 152 |
#' |
152 | 153 |
#' out = extend(exp, RegionCount = COUNT()) |
153 | 154 |
#' |
154 | 155 |
#' ## This statement copies all samples of exp dataset into res dataset, |
155 |
-#' ## and then calculates new metadata attribute for each of them: |
|
156 |
-#' ## std_score is the standard deviation score of the sample regions. |
|
156 |
+#' ## and then calculates new metadata attribute 'std_score' for each of them: |
|
157 |
+#' ## std_score is the standard deviation of the score values of the sample |
|
158 |
+#' ## regions. |
|
157 | 159 |
#' |
158 | 160 |
#' res = extend(exp, std_score = STD("score")) |
159 | 161 |
#' |
160 | 162 |
#' ## This statement copies all samples of exp dataset into res dataset, |
161 |
-#' ## and then calculates new metadata attribute for each of them: |
|
163 |
+#' ## and then calculates new metadata attribute 'm_score' for each of them: |
|
162 | 164 |
#' ## m_score is the median score of the sample regions. |
163 | 165 |
#' |
164 | 166 |
#' res = extend(exp, m_score = MEDIAN("score")) |
... | ... |
@@ -42,7 +42,7 @@ print.PARAMETER <- function(obj){ |
42 | 42 |
#' |
43 | 43 |
#' init_gmql() |
44 | 44 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
45 |
-#' exp = read_GMQL(test_path) |
|
45 |
+#' exp = read_gmql(test_path) |
|
46 | 46 |
#' |
47 | 47 |
#' ## The following statement produces an output dataset with a single |
48 | 48 |
#' ## output sample. The COVER operation considers all areas defined by |
... | ... |
@@ -61,8 +61,8 @@ print.PARAMETER <- function(obj){ |
61 | 61 |
#' |
62 | 62 |
#' ## The following statement produces an output dataset with a single |
63 | 63 |
#' ## output sample. The COVER operation considers all areas defined by |
64 |
-#' ## a half of maximum amount of overlapping regions in the input samples, |
|
65 |
-#' ## up to any amount of overlapping regions. |
|
64 |
+#' ## minimum of overlapping regions in the input samples equal to half of |
|
65 |
+#' ## the number of input samples, up to any amount of overlapping regions. |
|
66 | 66 |
#' |
67 | 67 |
#' res = cover(exp, ALL()/2, ANY()) |
68 | 68 |
#' |
... | ... |
@@ -37,74 +37,74 @@ check.DISTAL <- function(value) |
37 | 37 |
#' \itemize{ |
38 | 38 |
#' \item{DL: It denotes the less distance clause, |
39 | 39 |
#' which selects all the regions of a joined experiment dataset sample such |
40 |
-#' that their distance from the anchor region of a joined reference dataset |
|
40 |
+#' that their distance from the anchor region of the joined reference dataset |
|
41 | 41 |
#' sample is less than 'value' bases.} |
42 |
-#' \item{DLE: It denotes the less distance clause, |
|
42 |
+#' \item{DLE: It denotes the less equal distance clause, |
|
43 | 43 |
#' which selects all the regions of a joined experiment dataset sample such |
44 |
-#' that their distance from the anchor region of a joined reference dataset |
|
44 |
+#' that their distance from the anchor region of the joined reference dataset |
|
45 | 45 |
#' sample is less than, or equal to, 'value' bases.} |
46 |
-#' \item{DG: It denotes the less distance clause, |
|
46 |
+#' \item{DG: It denotes the great distance clause, |
|
47 | 47 |
#' which selects all the regions of a joined experiment dataset sample such |
48 |
-#' that their distance from the anchor region of a joined reference dataset |
|
48 |
+#' that their distance from the anchor region of the joined reference dataset |
|
49 | 49 |
#' sample is greater than 'value' bases. } |
50 |
-#' \item{DGE: It denotes the less distance clause, |
|
50 |
+#' \item{DGE: It denotes the great equal distance clause, |
|
51 | 51 |
#' which selects all the regions of a joined experiment dataset sample such |
52 |
-#' that their distance from the anchor region of a joined reference dataset |
|
52 |
+#' that their distance from the anchor region of the joined reference dataset |
|
53 | 53 |
#' sample is greater than, or equal to, 'value' bases.} |
54 | 54 |
#' \item{MD: It denotes the minimum distance clause, which selects |
55 |
-#' the first 'value' regions of a joined experiment at minimial distance |
|
56 |
-#' from the anchor region of a joined reference dataset sample.} |
|
55 |
+#' the first 'value' regions of the joined experiment at minimial distance |
|
56 |
+#' from the anchor region of the joined reference dataset sample.} |
|
57 | 57 |
#' \item{UP: It denotes the upstream direction of the genome. |
58 |
-#' It makes predicates to be hold on the upstream of the regions of a joined |
|
59 |
-#' experiment dataset sample. |
|
60 |
-#' UP is true when region of a joined experiment dataset sample is in the |
|
61 |
-#' upstream genome of the anchor region of a joined reference dataset sample. |
|
58 |
+#' It makes predicates to be hold on the upstream of the regions of the joined |
|
59 |
+#' reference dataset sample. |
|
60 |
+#' UP is true when region of the joined experiment dataset sample is in the |
|
61 |
+#' upstream genome of the anchor region of the joined reference dataset sample. |
|
62 | 62 |
#' When this clause is not present, distal conditions apply to both |
63 | 63 |
#' directions of the genome.} |
64 | 64 |
#' \item{DOWN: It denotes the downstream direction of the genome. |
65 |
-#' It makes predicates to be hold on the downstream of the regions of a joined |
|
66 |
-#' experiment dataset sample. |
|
67 |
-#' UP is true when region of a joined experiment dataset sample is in the |
|
68 |
-#' downstream genome of the anchor region of a joined reference dataset sample. |
|
69 |
-#' When this clause is not present, distal conditions apply to both |
|
65 |
+#' It makes predicates to be hold on the downstream of the regions of the |
|
66 |
+#' joined reference dataset sample. |
|
67 |
+#' DOWN is true when region of the joined experiment dataset sample is in the |
|
68 |
+#' downstream genome of the anchor region of the joined reference dataset |
|
69 |
+#' sample. When this clause is not present, distal conditions apply to both |
|
70 | 70 |
#' directions of the genome.} |
71 | 71 |
#' } |
72 | 72 |
#' |
73 | 73 |
#' @param value string identifying distance between genomic regions |
74 |
-#' in base pair, |
|
74 |
+#' in base pair |
|
75 | 75 |
#' |
76 | 76 |
#' @return Distal object |
77 | 77 |
#' |
78 | 78 |
#' @examples |
79 |
-#' ## Thi statement initializes and runs the GMQL server for local execution |
|
79 |
+#' ## This statement initializes and runs the GMQL server for local execution |
|
80 | 80 |
#' ## and creation of results on disk. Then, with system.file() it defines |
81 | 81 |
#' ## the path to the folders "DATASET" and "DATASET_GDM" in the subdirectory |
82 |
-#' ## "example" of the package "RGMQL" and opens such folder as a GMQL |
|
83 |
-#' ## dataset named "TSS" and "HM" respectively using customParser |
|
82 |
+#' ## "example" of the package "RGMQL", and opens such folders as a GMQL |
|
83 |
+#' ## datasets named "TSS" and "HM", respectively, using customParser |
|
84 | 84 |
#' |
85 | 85 |
#' init_gmql() |
86 | 86 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
87 | 87 |
#' test_path2 <- system.file("example", "DATASET_GDM", package = "RGMQL") |
88 |
-#' TSS = read_GMQL(test_path) |
|
89 |
-#' HM = read_GMQL(test_path2) |
|
88 |
+#' TSS = read_gmql(test_path) |
|
89 |
+#' HM = read_gmql(test_path2) |
|
90 | 90 |
#' |
91 | 91 |
#' ## Given a dataset HM and one called TSS with a sample including |
92 |
-#' ## Transcription Start Site annotations, it searches for those regions of HM |
|
93 |
-#' ## that are at a minimal distance from a transcription start site (TSS) |
|
94 |
-#' ## and takes the first/closest one for each TSS, provided that such |
|
95 |
-#' ## distance is lesser than 1200 bases and joined TSS and HM samples are |
|
96 |
-#' ## obtained from the same provider (joinby clause). |
|
92 |
+#' ## Transcription Start Site annotations, this statement searches for those |
|
93 |
+#' ## regions of HM that are at a minimal distance from a transcription |
|
94 |
+#' ## start site (TSS) and takes the first/closest one for each TSS, provided |
|
95 |
+#' ## that such distance is lesser than 1200 bases and joined TSS and HM |
|
96 |
+#' ## samples are obtained from the same provider (joinby clause). |
|
97 | 97 |
#' |
98 | 98 |
#' join_data = merge(TSS, HM, |
99 | 99 |
#' genometric_predicate = list(MD(1), DL(1200)), conds("provider"), |
100 | 100 |
#' region_output = "RIGHT") |
101 | 101 |
#' |
102 |
-#' ## Given a dataset 'HM' and one called 'TSS' with a sample including |
|
103 |
-#' ## Transcription Start Site annotations, it searches for those regions of HM |
|
104 |
-#' ## that are at a minimal distance from a transcription start site (TSS) |
|
105 |
-#' ## and takes the first/closest one for each TSS, provided that such distance |
|
106 |
-#' ## is greater than 12K bases and joined 'tss' and 'hm' samples are obtained |
|
107 |
-#' ## from the same provider (joinby clause). |
|
102 |
+#' ## Given a dataset HM and one called TSS with a sample including |
|
103 |
+#' ## Transcription Start Site annotations, this statement searches for those |
|
104 |
+#' ## regions of HM that are downstream and at a minimal distance from a |
|
105 |
+#' ## transcription start site (TSS) and takes the first/closest one for each |
|
106 |
+#' ## TSS, provided that such distance is greater than 12K bases and joined |
|
107 |
+#' ## TSS and HM samples are obtained from the same provider (joinby clause). |
|
108 | 108 |
#' |
109 | 109 |
#' join_data = merge(TSS, HM, |
110 | 110 |
#' genometric_predicate = list(MD(1), DGE(12000), DOWN()), conds("provider"), |
... | ... |
@@ -50,41 +50,42 @@ as.character.OPERATOR <- function(obj) { |
50 | 50 |
#' @param value string identifying name of metadata attribute |
51 | 51 |
#' @param type string identifying the type of the attribute value; |
52 | 52 |
#' it must be: INTEGER, DOUBLE or STRING |
53 |
+#' For NIL() function, only INTEGER and DOUBLE are allowed |
|
53 | 54 |
#' |
54 | 55 |
#' @return Operator object |
55 | 56 |
#' |
56 | 57 |
#' |
57 | 58 |
#' @examples |
58 |
-#' ## Thi statement initializes and runs the GMQL server for local execution |
|
59 |
+#' ## This statement initializes and runs the GMQL server for local execution |
|
59 | 60 |
#' ## and creation of results on disk. Then, with system.file() it defines |
60 |
-#' ## the path to the folders "DATASET" in the subdirectory "example" |
|
61 |
+#' ## the path to the folder "DATASET" in the subdirectory "example" |
|
61 | 62 |
#' ## of the package "RGMQL" and opens such folder as a GMQL dataset |
62 | 63 |
#' ## named "exp" |
63 | 64 |
#' |
64 | 65 |
#' init_gmql() |
65 | 66 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
66 |
-#' exp = read_GMQL(test_path) |
|
67 |
+#' exp = read_gmql(test_path) |
|
67 | 68 |
#' |
68 |
-#' ## This statement allows to select, in all input sample, all those regions |
|
69 |
+#' ## This statement allows to select, in all input samples, all those regions |
|
69 | 70 |
#' ## for which the region attribute score has a value which is greater |
70 | 71 |
#' ## than the metadata attribute value "avg_score" in their sample. |
71 | 72 |
#' |
72 | 73 |
#' data = filter(exp, r_predicate = score > META("avg_score")) |
73 | 74 |
#' |
74 |
-#' ## This statement defines a new numeric region attribute with "null" value. |
|
75 |
+#' ## This statement defines new numeric region attributes with "null" value. |
|
75 | 76 |
#' ## The syntax for creating a new attribute with null value is |
76 |
-#' ## attribute_name = NULL(TYPE), where type may be INTEGER, DOUBLE or STRING. |
|
77 |
+#' ## attribute_name = NULL(TYPE), where type may be INTEGER or DOUBLE. |
|
77 | 78 |
#' |
78 | 79 |
#' out = select(exp, regions_update = list(signal = NIL("INTEGER"), |
79 | 80 |
#' pvalue = NIL("DOUBLE"))) |
80 | 81 |
#' |
81 |
-#' ## This statement allows to build an output dataset out such that all |
|
82 |
-#' ## the samples from the input dataset exp are conserved, |
|
82 |
+#' ## This statement allows to build an output dataset named 'out' such that |
|
83 |
+#' ## all the samples from the input dataset 'exp' are conserved, |
|
83 | 84 |
#' ## as well as their region attributes (and their values) |
84 | 85 |
#' ## and their metadata attributes (and their values). |
85 |
-#' ## The new metadata attribute concSq is added to all output samples |
|
86 |
+#' ## The new metadata attribute 'concSq' is added to all output samples |
|
86 | 87 |
#' ## with value correspondent to the mathematical squared root |
87 |
-#' ## of the pre-existing metadata attribute concentration. |
|
88 |
+#' ## of the pre-existing metadata attribute 'concentration'. |
|
88 | 89 |
#' |
89 | 90 |
#' out = select(exp, metadata_update = list(concSq = SQRT("concentration"))) |
90 | 91 |
#' |
... | ... |
@@ -96,3 +96,25 @@ |
96 | 96 |
if(length(value)>1) |
97 | 97 |
stop("no multiple string") |
98 | 98 |
} |
99 |
+ |
|
100 |
+.is_login_expired <- function(url) |
|
101 |
+{ |
|
102 |
+ if(exists("GMQL_credentials", envir = .GlobalEnv)) |
|
103 |
+ { |
|
104 |
+ if(exists("authToken", where = GMQL_credentials)) |
|
105 |
+ { |
|
106 |
+ authToken <- GMQL_credentials$authToken |
|
107 |
+ url <- sub("/*[/]$","",url) |
|
108 |
+ h <- c('Accept' = 'Application/json', 'X-Auth-Token' = authToken) |
|
109 |
+ URL <- paste0(url,"/user") |
|
110 |
+ req <- httr::GET(URL,httr::add_headers(h)) |
|
111 |
+ if(req$status_code !=200) |
|
112 |
+ return(TRUE) |
|
113 |
+ else |
|
114 |
+ return(FALSE) |
|
115 |
+ } |
|
116 |
+ } |
|
117 |
+ return(TRUE) |
|
118 |
+} |
|
119 |
+ |
|
120 |
+ |
... | ... |
@@ -21,54 +21,55 @@ |
21 | 21 |
#' if NULL no filtering action occures |
22 | 22 |
#' (i.e every sample is taken for region filtering) |
23 | 23 |
#' @param metadata_prefix vector of strings that will support the metadata |
24 |
-#' filtering. If defined, each 'metadata' are concatenated with the |
|
24 |
+#' filtering. If defined, each 'metadata' is concatenated with the |
|
25 | 25 |
#' corresponding prefix. |
26 |
-#' @param regions vector of strings that extracts only region attribute |
|
27 |
-#' specified; if NULL no regions attribute is taken and the output is only |
|
28 |
-#' GRanges made up by the region coordinate attributes |
|
26 |
+#' @param region_attributes vector of strings that extracts only region |
|
27 |
+#' attributes specified; if NULL no regions attribute is taken and the output |
|
28 |
+#' is only GRanges made up by the region coordinate attributes |
|
29 | 29 |
#' (seqnames, start, end, strand) |
30 |
-#' @param suffix name for each metadata column of GRanges. by default is the |
|
31 |
-#' "antibody_target". This string is taken from sample metadata file or from |
|
32 |
-#' metadata() associated. If not present, the column name is the name of |
|
33 |
-#' selected regions |
|
30 |
+#' @param suffix name for each metadata column of GRanges. By default it is the |
|
31 |
+#' value of the metadata attribute named "antibody_target". This string is |
|
32 |
+#' taken from sample metadata file or from metadata() associated. |
|
33 |
+#' If not present, the column name is the name of selected regions specified |
|
34 |
+#' by 'regions' input parameter |
|
34 | 35 |
#' |
35 | 36 |
#' @details |
36 |
-#' This function works only with datatset or GRangesList which samples or |
|
37 |
-#' Granges have the same regions coordinates (chr, ranges, strand) |
|
37 |
+#' This function works only with datatset or GRangesList all whose samples or |
|
38 |
+#' Granges have the same region coordinates (chr, ranges, strand) |
|
38 | 39 |
#' |
39 |
-#' In case of Grangeslist data input the function will search for metadata |
|
40 |
-#' into metadata() function associated to Grangeslist. |
|
40 |
+#' In case of GRangesList data input, the function searches for metadata |
|
41 |
+#' into metadata() function associated to GRangesList. |
|
41 | 42 |
#' |
42 | 43 |
#' @return GRanges with selected regions |
43 | 44 |
#' |
44 | 45 |
#' @examples |
45 | 46 |
#' |
46 |
-#' ## This statement defines the path to the folders "DATASET" in the |
|
47 |
-#' ## subdirectory "example" of the package "RGMQL" and filter such folder |
|
48 |
-#' ## dataset including at output only "pvalue" and "peak" regions |
|
47 |
+#' ## This statement defines the path to the folder "DATASET" in the |
|
48 |
+#' ## subdirectory "example" of the package "RGMQL" and filters such folder |
|
49 |
+#' ## dataset including at output only "pvalue" and "peak" region attributes |
|
49 | 50 |
#' |
50 | 51 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
51 |
-#' filter_and_extract(test_path, regions = c("pvalue", "peak")) |
|
52 |
+#' filter_and_extract(test_path, region_attributes = c("pvalue", "peak")) |
|
52 | 53 |
#' |
53 |
-#' ## This statement import a GMQL dataset as GRangesList and filter it |
|
54 |
-#' ## including at output only "pvalue" and "peak" regions |
|
54 |
+#' ## This statement imports a GMQL dataset as GRangesList and filters it |
|
55 |
+#' ## including at output only "pvalue" and "peak" region attributes |
|
55 | 56 |
#' |
56 | 57 |
#' grl = import_gmql(test_path, TRUE) |
57 |
-#' filter_and_extract(grl, regions = c("pvalue", "peak")) |
|
58 |
+#' filter_and_extract(grl, region_attributes = c("pvalue", "peak")) |
|
58 | 59 |
#' |
59 | 60 |
#' |
60 | 61 |
#' @export |
61 | 62 |
#' |
62 | 63 |
filter_and_extract <- function(data, metadata = NULL, |
63 |
- metadata_prefix = NULL, regions = NULL, |
|
64 |
- suffix = "antibody_target") |
|
64 |
+ metadata_prefix = NULL, region_attributes = NULL, |
|
65 |
+ suffix = "antibody_target") |
|
65 | 66 |
{ |
66 | 67 |
if(is(data,"GRangesList")) |
67 |
- .extract_from_GRangesList(data, metadata, metadata_prefix, regions, |
|
68 |
- suffix) |
|
68 |
+ .extract_from_GRangesList(data, metadata, metadata_prefix, |
|
69 |
+ region_attributes, suffix) |
|
69 | 70 |
else |
70 |
- .extract_from_dataset(data, metadata, metadata_prefix, regions, |
|
71 |
- suffix) |
|
71 |
+ .extract_from_dataset(data, metadata, metadata_prefix, |
|
72 |
+ region_attributes, suffix) |
|
72 | 73 |
} |
73 | 74 |
|
74 | 75 |
.extract_from_dataset <- function(datasetName, metadata, metadata_prefix, |
... | ... |
@@ -94,40 +95,43 @@ filter_and_extract <- function(data, metadata = NULL, |
94 | 95 |
|
95 | 96 |
vector_field <- .schema_header(datasetName) |
96 | 97 |
|
98 |
+ |
|
97 | 99 |
if(length(gdm_meta_files)) |
98 | 100 |
{ |
99 |
- samples_with_suffix <- .check_metadata_files(metadata,metadata_prefix, |
|
100 |
- gdm_meta_files, suffix) |
|
101 |
+ samples_file <- .check_metadata_files(metadata,metadata_prefix, |
|
102 |
+ gdm_meta_files) |
|
101 | 103 |
|
102 |
- samples_file <- lapply(samples_with_suffix, function(x) x$sample) |
|
103 |
- suffix_vec <- lapply(samples_with_suffix, function(x) x$suffix) |
|
104 |
- suffixes <- unlist(suffix_vec) |
|
105 |
- samples_to_read <- unlist(samples_file) |
|
104 |
+ samples_meta_to_read <- unlist(samples_file) |
|
106 | 105 |
|
107 |
- if(length(samples_to_read)) |
|
108 |
- samples_to_read <- gsub(".meta$", "", samples_to_read) |
|
106 |
+ if(length(samples_meta_to_read)) |
|
107 |
+ samples_to_read <- gsub(".meta$", "", samples_meta_to_read) |
|
109 | 108 |
else |
109 |
+ { |
|
110 | 110 |
samples_to_read <- gsub(".meta$", "", gdm_meta_files) |
111 |
+ samples_meta_to_read <- gtf_meta_files |
|
112 |
+ |
|
113 |
+ } |
|
111 | 114 |
|
115 |
+ suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read) |
|
112 | 116 |
granges <- .parse_gdm_files(vector_field,samples_to_read,regions, |
113 |
- suffixes) |
|
117 |
+ suffix_vec) |
|
114 | 118 |
} |
115 | 119 |
else |
116 | 120 |
{ |
117 |
- samples_with_suffix <- .check_metadata_files(metadata,metadata_prefix, |
|
118 |
- gtf_meta_files, suffix) |
|
119 |
- |
|
120 |
- samples_file <- lapply(samples_with_suffix, function(x) x$sample) |
|
121 |
- suffix_vec <- lapply(samples_with_suffix, function(x) x$suffix) |
|
122 |
- suffixes <- unlist(suffix_vec) |
|
123 |
- samples_to_read <- unlist(samples_file) |
|
121 |
+ samples_file <- .check_metadata_files(metadata,metadata_prefix, |
|
122 |
+ gtf_meta_files) |
|
123 |
+ samples_meta_to_read <- unlist(samples_file) |
|
124 | 124 |
|
125 |
- if(length(samples_to_read)) |
|
126 |
- samples_to_read <- gsub(".meta$", "", samples_to_read) |
|
125 |
+ if(length(samples_meta_to_read)) |
|
126 |
+ samples_to_read <- gsub(".meta$", "", samples_meta_to_read) |
|
127 | 127 |
else |
128 |
+ { |
|
128 | 129 |
samples_to_read <- gsub(".meta$", "", gtf_meta_files) |
130 |
+ samples_meta_to_read <- gtf_meta_files |
|
131 |
+ } |
|
129 | 132 |
|
130 |
- granges <- .parse_gtf_files(samples_to_read, regions, suffixes) |
|
133 |
+ suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read) |
|
134 |
+ granges <- .parse_gtf_files(samples_to_read, regions,suffix_vec) |
|
131 | 135 |
} |
132 | 136 |
} |
133 | 137 |
|
... | ... |
@@ -141,8 +145,7 @@ filter_and_extract <- function(data, metadata = NULL, |
141 | 145 |
stop("rangesList empty") |
142 | 146 |
|
143 | 147 |
meta_list <- metadata(rangesList) |
144 |
- samples <- .check_metadata_list(metadata, metadata_prefix, meta_list, |
|
145 |
- suffix) |
|
148 |
+ samples <- .check_metadata_list(metadata, metadata_prefix, meta_list) |
|
146 | 149 |
if(!length(unlist(samples))) |
147 | 150 |
samples <- rangesList |
148 | 151 |
else |
... | ... |
@@ -150,26 +153,73 @@ filter_and_extract <- function(data, metadata = NULL, |
150 | 153 |
index <- unlist(samples) |
151 | 154 |
samples <- rangesList[c(index)] |
152 | 155 |
} |
153 |
- granges <- .parse_Granges(samples,regions) |
|
156 |
+ new_meta_list <- metadata(samples) |
|
157 |
+ suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list) |
|
158 |
+ granges <- .parse_Granges(samples,regions,suffix_vec) |
|
154 | 159 |
} |
155 | 160 |
|
156 |
-.parse_Granges <- function(region_list,regions) |
|
161 |
+.parse_Granges <- function(region_list,regions,suffixes) |
|
157 | 162 |
{ |
163 |
+ if(is.null(suffixes)) |
|
164 |
+ suffixes = "" |
|
165 |
+ |
|
158 | 166 |
g1 <- region_list[[1]] |
159 | 167 |
elementMetadata(g1) <- NULL |
160 | 168 |
if(!is.null(regions)) |
161 | 169 |
{ |
162 |
- DF_list <- lapply(region_list, function(g_x){ |
|
170 |
+ DF_list <- mapply(function(g_x,h){ |
|
163 | 171 |
meta <- elementMetadata(g_x)[regions] |
172 |
+ if(h!="") |
|
173 |
+ names(meta) <- paste(regions,h,sep = ".") |
|
164 | 174 |
data.frame(meta) |
165 |
- }) |
|
175 |
+ },region_list, suffixes, SIMPLIFY = FALSE) |
|
166 | 176 |
DF_only_regions <- dplyr::bind_cols(DF_list) |
167 | 177 |
elementMetadata(g1) <- DF_only_regions |
168 | 178 |
} |
169 | 179 |
g1 |
170 | 180 |
} |
171 | 181 |
|
172 |
-.check_metadata_list <- function(metadata,metadata_prefix,meta_list,col_name) |
|
182 |
+.get_suffix <- function(col_name, from_list, meta_fl) |
|
183 |
+{ |
|
184 |
+ suffix <- paste0(col_name,"$") |
|
185 |
+ |
|
186 |
+ if(from_list) |
|
187 |
+ { |
|
188 |
+ meta_list <- mapply(function(x,index){ |
|
189 |
+ vec_names <- names(x) |
|
190 |
+ s_index <- grep(suffix,vec_names) |
|
191 |
+ first_index <- s_index[1] |
|
192 |
+ suffix <- unlist(x[first_index]) # ne prendo solo uno |
|
193 |
+ names(suffix) <- NULL |
|
194 |
+ |
|
195 |
+ #if found retrieve samples that has at least one choosen metadata |
|
196 |
+ if(first_index && !is.na(first_index)) |
|
197 |
+ suffix |
|
198 |
+ else |
|
199 |
+ "" |
|
200 |
+ }, meta_fl, seq_along(meta_fl)) |
|
201 |
+ } |
|
202 |
+ else |
|
203 |
+ { |
|
204 |
+ meta_list <- vapply(meta_fl, function(x){ |
|
205 |
+ list <- .add_metadata(x) |
|
206 |
+ vec_names <- names(list) |
|
207 |
+ index <- grep(suffix,vec_names) |
|
208 |
+ first_index <- index[1] |
|
209 |
+ suffix <- unlist(list[first_index]) # ne prendo solo uno |
|
210 |
+ names(suffix) <- NULL |
|
211 |
+ #if found retrieve samples that has at least one choosen metadata |
|
212 |
+ if(first_index && !is.na(first_index)) |
|
213 |
+ suffix |
|
214 |
+ else |
|
215 |
+ "" |
|
216 |
+ },character(1)) |
|
217 |
+ } |
|
218 |
+ names(meta_list) <- NULL |
|
219 |
+ meta_list |
|
220 |
+} |
|
221 |
+ |
|
222 |
+.check_metadata_list <- function(metadata,metadata_prefix,meta_list) |
|
173 | 223 |
{ |
174 | 224 |
vec_meta <- paste0(metadata_prefix,metadata) |
175 | 225 |
list <- mapply(function(x,index){ |
... | ... |
@@ -184,9 +234,8 @@ filter_and_extract <- function(data, metadata = NULL, |
184 | 234 |
}, meta_list, seq_along(meta_list)) |
185 | 235 |
} |
186 | 236 |
|
187 |
-.check_metadata_files <- function(metadata,metadata_prefix,meta_files,col_name) |
|
237 |
+.check_metadata_files <- function(metadata,metadata_prefix,meta_files) |
|
188 | 238 |
{ |
189 |
- suffix <- paste0(col_name,"$") |
|
190 | 239 |
vec_meta <- paste0(metadata_prefix,metadata) |
191 | 240 |
meta_list <- lapply(meta_files, function(x){ |
192 | 241 |
list <- .add_metadata(x) |
... | ... |
@@ -194,19 +243,11 @@ filter_and_extract <- function(data, metadata = NULL, |
194 | 243 |
a <- lapply(vec_meta, function(y)grep(y,vec_names)) |
195 | 244 |
## we would like that manage more index from grep |
196 | 245 |
found <- as.logical(length(unlist(a))) |
197 |
- index <- grep(suffix,vec_names) |
|
198 |
- suffix <- unlist(list[index])[1] # ne prendo solo uno |
|
199 |
- names(suffix) <- NULL |
|
200 | 246 |
#if found retrieve samples that has at least one choosen metadata |
201 |
- if(found) |
|
202 |
- list("sample" = x, "suffix" = suffix ) |
|
203 |
- else |
|
204 |
- list("sample" = NULL, "suffix" = suffix ) |
|
205 |
- |
|
247 |
+ if(found){x} |
|
206 | 248 |
}) |
207 | 249 |
} |
208 | 250 |
|
209 |
- |
|
210 | 251 |
.parse_gtf_files <- function(gtf_region_files, regions, suffixes) |
211 | 252 |
{ |
212 | 253 |
g1 <- rtracklayer::import(con = gtf_region_files[1], format = "gtf") |
... | ... |
@@ -229,7 +270,8 @@ filter_and_extract <- function(data, metadata = NULL, |
229 | 270 |
g1 |
230 | 271 |
} |
231 | 272 |
|
232 |
-.parse_gdm_files <- function(vector_field,gdm_region_files,regions,suffixes) |
|
273 |
+.parse_gdm_files <- function(vector_field,gdm_region_files,regions, |
|
274 |
+ suffixes) |
|
233 | 275 |
{ |
234 | 276 |
#read first sample cause chromosome regions are the same for all samples |
235 | 277 |
df <- data.table::fread(gdm_region_files[1],col.names = vector_field, |
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
#' Method cover |
2 | 2 |
#' |
3 | 3 |
#' It takes as input a dataset containing one or more samples and returns |
4 |
-#' another dataset (with a single sample, if no \emph{groupby} option is |
|
4 |
+#' another dataset (with a single sample, if no \emph{groupBy} option is |
|
5 | 5 |
#' specified) by “collapsing” the input dataset samples and their regions |
6 | 6 |
#' according to certain rules specified by the input parameters. |
7 | 7 |
#' The attributes of the output genomic regions are only the region |
... | ... |
@@ -13,14 +13,14 @@ |
13 | 13 |
#' of the intersection and of the union of the contributing regions; |
14 | 14 |
#' the JaccardResult index is calculated as the ratio between the lengths |
15 | 15 |
#' of the result and the union of the contributing regions. |
16 |
-#' If aggregate functions are specified, a new attribute is added for |
|
16 |
+#' If aggregate functions are specified, a new region attribute is added for |
|
17 | 17 |
#' each aggregate function specified. |
18 | 18 |
#' Output metadata are the union of the input ones. |
19 |
-#' If \emph{groupby} clause is specified, the input samples are partitioned |
|
19 |
+#' If \emph{groupBy} clause is specified, the input samples are partitioned |
|
20 | 20 |
#' in groups, each with distinct values of the grouping metadata attributes, |
21 | 21 |
#' and the \emph{cover} operation is separately applied to each group, |
22 | 22 |
#' yielding to one sample in the result for each group. |
23 |
-#' Input samples that do not satisfy the \emph{groupby} condition |
|
23 |
+#' Input samples that do not satisfy the \emph{groupBy} condition |
|
24 | 24 |
#' are disregarded. |
25 | 25 |
#' |
26 | 26 |
#' @include AllClasses.R |
... | ... |
@@ -61,7 +61,7 @@ |
61 | 61 |
#' Every aggregate accepts a string value, except for COUNT, which does not |
62 | 62 |
#' have any value. |
63 | 63 |
#' Argument of 'aggregate function' must exist in schema, i.e. among region |
64 |
-#' attributes. Two style are allowed: |
|
64 |
+#' attributes. Two styles are allowed: |
|
65 | 65 |
#' \itemize{ |
66 | 66 |
#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
67 | 67 |
#' \item list of values: e.g. SUM("pvalue") |
... | ... |
@@ -83,7 +83,7 @@ |
83 | 83 |
#' to the \emph{AccIndex} region attribute.} |
84 | 84 |
#' \item{COVER: default value.} |
85 | 85 |
#' } |
86 |
-#' Can be all caps or lowercase |
|
86 |
+#' It can be all caps or lowercase |
|
87 | 87 |
#' |
88 | 88 |
#' @return GMQLDataset object. It contains the value to use as input |
89 | 89 |
#' for the subsequent GMQLDataset method |
... | ... |
@@ -97,10 +97,10 @@ |
97 | 97 |
#' ## using customParser |
98 | 98 |
#' |
99 | 99 |
#' init_gmql() |
100 |
-#' test_path <- system.file("example","DATASET",package = "RGMQL") |
|
101 |
-#' exp = read_GMQL(test_path) |
|
100 |
+#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
|
101 |
+#' exp = read_gmql(test_path) |
|
102 | 102 |
#' |
103 |
-#' ## the following statement produces an output dataset with a single output |
|
103 |
+#' ## The following statement produces an output dataset with a single output |
|
104 | 104 |
#' ## sample. The COVER operation considers all areas defined by a minimum |
105 | 105 |
#' ## of two overlapping regions in the input samples, up to any amount of |
106 | 106 |
#' ## overlapping regions. |
... | ... |
@@ -109,7 +109,7 @@ |
109 | 109 |
#' |
110 | 110 |
#' ## The following GMQL statement computes the result grouping the input |
111 | 111 |
#' ## exp samples by the values of their cell metadata attribute, |
112 |
-#' ## thus one output res sample is generated for each cell type; |
|
112 |
+#' ## thus one output res sample is generated for each cell value; |
|
113 | 113 |
#' ## output regions are produced where at least 2 and at most 3 regions |
114 | 114 |
#' ## of grouped exp samples overlap, setting as attributes of the resulting |
115 | 115 |
#' ## regions the minimum pvalue of the overlapping regions (min_pvalue) |
... | ... |
@@ -4,13 +4,13 @@ |
4 | 4 |
#' |
5 | 5 |
#' @description It produces one sample in the result for each sample of the |
6 | 6 |
#' left operand, by keeping the same metadata of the left input sample |
7 |
-#' and only those regions (with their schema and values) of the left input |
|
8 |
-#' sample which do not intersect with any region in the right operand sample. |
|
9 |
-#' The optional \emph{joinby} clause is used to extract a subset of couples |
|
10 |
-#' from the Cartesian product of two dataset \emph{x} and \emph{y} |
|
7 |
+#' and only those regions (with their attributes and values) of the left input |
|
8 |
+#' sample which do not intersect with any region in any right operand sample. |
|
9 |
+#' The optional \emph{joinBy} clause is used to extract a subset of pairs |
|
10 |
+#' from the Cartesian product of the two input datasets \emph{x} and \emph{y} |
|
11 | 11 |
#' on which to apply the DIFFERENCE operator: |
12 |
-#' only those samples that have the same value for each attribute |
|
13 |
-#' are considered when performing the difference. |
|
12 |
+#' only those samples that have the same value for each specified metadata |
|
13 |
+#' attribute are considered when performing the difference. |
|
14 | 14 |
#' |
15 | 15 |
#' @importFrom rJava J .jnull .jarray |
16 | 16 |
#' @importFrom BiocGenerics setdiff |
... | ... |
@@ -18,14 +18,15 @@ |
18 | 18 |
#' @param x GMQLDataset class object |
19 | 19 |
#' @param y GMQLDataset class object |
20 | 20 |
#' @param joinBy \code{\link{conds}} function to support methods with |
21 |
-#' groupBy or JoinBy input paramter |
|
21 |
+#' groupBy or JoinBy input parameter |
|
22 | 22 |
#' |
23 | 23 |
#' @param is_exact single logical value: TRUE means that the region difference |
24 |
-#' is executed only on regions in left_input_data with exactly the same |
|
25 |
-#' coordinates of at least one region present in right_input_data; |
|
24 |
+#' is executed only on regions in 'x' dataset with exactly the same |
|
25 |
+#' coordinates of at least one region present in 'y' dataset; |
|
26 | 26 |
#' if is_exact = FALSE, the difference is executed on all regions in |
27 |
-#' left_input_data that overlap with at least one region in right_input_data |
|
28 |
-#' (even just one base). |
|
27 |
+#' left_input_data that overlap (even just one base) with at least one |
|
28 |
+#' region in 'y' |
|
29 |
+#' |
|
29 | 30 |
#' |
30 | 31 |
#' @return GMQLDataset object. It contains the value to use as input |
31 | 32 |
#' for the subsequent GMQLDataset method |
... | ... |
@@ -40,19 +41,19 @@ |
40 | 41 |
#' init_gmql() |
41 | 42 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
42 | 43 |
#' test_path2 <- system.file("example", "DATASET_GDM", package = "RGMQL") |
43 |
-#' data1 = read_GMQL(test_path) |
|
44 |
-#' data2 = read_GMQL(test_path2) |
|
44 |
+#' data1 = read_gmql(test_path) |
|
45 |
+#' data2 = read_gmql(test_path2) |
|
45 | 46 |
#' |
46 |
-#' ## This GMQL statement returns all the regions in the first dataset |
|
47 |
+#' ## This statement returns all the regions in the first dataset |
|
47 | 48 |
#' ## that do not overlap any region in the second dataset. |
48 | 49 |
#' |
49 | 50 |
#' out = setdiff(data1, data2) |
50 | 51 |
#' |
51 |
-#' ## This GMQL statement extracts for every pair of samples s1 in EXP1 |
|
52 |
-#' ## and s2 in EXP2 having the same value of the metadata |
|
53 |
-#' ## attribute 'antibody_target' the regions that appear in s1 but |
|
54 |
-#' ## do not overlap any region in s2; |
|
55 |
-#' ## metadata of the result are the same as the metadata of s1. |
|
52 |
+#' ## This statement extracts for every pair of samples s1 in data1 |
|
53 |
+#' ## and s2 in data2 having the same value of the metadata |
|
54 |
+#' ## attribute 'cell' the regions that appear in s1 but |
|
55 |
+#' ## do not overlap any region in s2. |
|
56 |
+#' ## Metadata of the result are the same as the metadata of s1. |
|
56 | 57 |
#' |
57 | 58 |
#' out_t = setdiff(data1, data2, conds("cell")) |
58 | 59 |
#' |
... | ... |
@@ -38,7 +38,7 @@ |
38 | 38 |
#' |
39 | 39 |
#' init_gmql() |
40 | 40 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
41 |
-#' data <- read_GMQL(test_path) |
|
41 |
+#' data <- read_gmql(test_path) |
|
42 | 42 |
#' |
43 | 43 |
#' ## This statement counts the regions in each sample and stores their number |
44 | 44 |
#' ## as value of the new metadata attribute RegionCount of the sample. |
... | ... |
@@ -9,6 +9,45 @@ group_by.GMQLDateset <- function(.data, groupBy_meta = conds(), |
9 | 9 |
#' Method group_by |
10 | 10 |
#' |
11 | 11 |
#' @description Wrapper to GMQL GROUP operator |
12 |
+#' @description It performs the grouping of samples of the input dataset |
|
13 |
+#' based on one specified metadata and/or region attribute. If the metadata |
|
14 |
+#' attribute is multi-value, i.e., it assumes multiple values for sample |
|
15 |
+#' (e.g., both <disease, cancer> and <disease, diabetes>), the grouping |
|
16 |
+#' identifies different groups of samples for each attribute value combination |
|
17 |
+#' (e.g., group1 for samples that feature the combination <disease, cancer>, |
|
18 |
+#' group2 for samples that feature the combination <disease, diabetes>, |
|
19 |
+#' and group3 for samples that feature both combinations <disease, cancer> |
|
20 |
+#' and <disease, diabetes>). For each obtained group, it is possible to |
|
21 |
+#' request the evaluation of aggregate functions on metadata attributes; |
|
22 |
+#' these functions consider the metadata contained in all samples of the group. |
|
23 |
+#' The regions, their attributes and their values in output are the same |
|
24 |
+#' as the ones in input for each sample, and the total number of samples |
|
25 |
+#' does not change. All metadata in the input samples are conserved with |
|
26 |
+#' their values in the output samples, with the addition of the "_group" |
|
27 |
+#' attribute, whose value is the identifier of the group to which the specific |
|
28 |
+#' sample is assigned; other metadata attributes can be added as aggregate |
|
29 |
+#' functions computed on specified metadata. When used on region attributes, |
|
30 |
+#' group_by can group regions of each sample individually, based on their |
|
31 |
+#' coordinates (chr, start, stop, strand) and possibly also on other |
|
32 |
+#' specified grouping region attributes (when these are present in the schema |
|
33 |
+#' of the input dataset). In each sample, regions found in the same group |
|
34 |
+#' (i.e., regions with same coordinates and grouping attribute values), |
|
35 |
+#' are combined into a single region; this allows to merge regions that are |
|
36 |
+#' duplicated inside the same sample (based on the values of their coordinates |
|
37 |
+#' and of other possible specified region attributes). For each grouped |
|
38 |
+#' region, it is possible to request the evaluation of aggregate functions |
|
39 |
+#' on other region attributes (i.e., which are not coordinates, or grouping |
|
40 |
+#' region attributes). This use is independent on the possible grouping |
|
41 |
+#' realised based on metadata. The generated output schema only contains the |
|
42 |
+#' original region attributes on which the grouping has been based, and |
|
43 |
+#' additionally the attributes in case calculated as aggregated functions. |
|
44 |
+#' If the group_by is applied only on regions, the output metadata and their |
|
45 |
+#' values are equal to the ones in input. Both when applied on metadata and |
|
46 |
+#' on regions, the group_by operation returns a number of output samples |
|
47 |
+#' equal to the number of input ones. Note that the two possible uses of |
|
48 |
+#' group_by, on metadata and on regions, are perfectly orthogonal, |
|
49 |
+#' therefore they can be used in combination or independently. |
|
50 |
+#' |
|
12 | 51 |
#' |
13 | 52 |
#' @importFrom rJava J .jarray .jnull |
14 | 53 |
#' @importFrom dplyr group_by |
... | ... |
@@ -17,45 +56,45 @@ group_by.GMQLDateset <- function(.data, groupBy_meta = conds(), |
17 | 56 |
#' @param groupBy_meta \code{\link{conds}} function to support methods with |
18 | 57 |
#' groupBy or JoinBy input parameter |
19 | 58 |
#' |
20 |
-#' @param groupBy_regions vector of string made up by schema field attribute |
|
21 |
-#' @param region_aggregates It accept a list of aggregate function on |
|
22 |
-#' region attribute. |
|
23 |
-#' All the element in the form \emph{key} = \emph{aggregate}. |
|
24 |
-#' The \emph{aggregate} is an object of class AGGREGATES |
|
59 |
+#' @param groupBy_regions vector of strings made up by region attribute names |
|
60 |
+#' @param meta_aggregates It accepts a list of aggregate functions on |
|
61 |
+#' metadata attribute. |
|
62 |
+#' All the elements in the form \emph{key} = \emph{aggregate}. |
|
63 |
+#' The \emph{aggregate} is an object of class AGGREGATES. |
|
25 | 64 |
#' The aggregate functions available are: \code{\link{SUM}}, |
26 | 65 |
#' \code{\link{COUNTSAMP}}, \code{\link{MIN}}, \code{\link{MAX}}, |
27 | 66 |
#' \code{\link{AVG}}, \code{\link{MEDIAN}}, \code{\link{STD}}, |
28 | 67 |
#' \code{\link{BAG}}, \code{\link{BAGD}}, \code{\link{Q1}}, |
29 | 68 |
#' \code{\link{Q2}}, \code{\link{Q3}}. |
30 |
-#' Every aggregate accepts a string value, execet for COUNTSAMP, which does |
|
69 |
+#' Every aggregate accepts a string value, except for COUNTSAMP, which does |
|
31 | 70 |
#' not have any value. |
32 | 71 |
#' Argument of 'aggregate function' must exist in schema, i.e. among region |
33 |
-#' attributes. Two style are allowed: |
|
72 |
+#' attributes. Two styles are allowed: |
|
34 | 73 |
#' \itemize{ |
35 |
-#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
36 |
-#' \item list of values: e.g. SUM("pvalue") |
|
74 |
+#' \item list of key-value pairs: e.g. sum = SUM("cell") |
|
75 |
+#' \item list of values: e.g. SUM("cell") |
|
37 | 76 |
#' } |
38 | 77 |
#' "mixed style" is not allowed |
39 |
-#' @param meta_aggregates It accept a list of aggregate function on |
|
40 |
-#' metadata attribute. |
|
41 |
-#' All the element in the form \emph{key} = \emph{aggregate}. |
|
42 |
-#' The \emph{aggregate} is an object of class AGGREGATES |
|
78 |
+#' |
|
79 |
+#' @param region_aggregates It acceptss a list of aggregate function on |
|
80 |
+#' region attribute. |
|
81 |
+#' All the elements in the form \emph{key} = \emph{aggregate}. |
|
82 |
+#' The \emph{aggregate} is an object of class AGGREGATES. |
|
43 | 83 |
#' The aggregate functions available are: \code{\link{SUM}}, |
44 |
-#' \code{\link{COUNTSAMP}}, \code{\link{MIN}}, \code{\link{MAX}}, |
|
84 |
+#' \code{\link{COUNT}}, \code{\link{MIN}}, \code{\link{MAX}}, |
|
45 | 85 |
#' \code{\link{AVG}}, \code{\link{MEDIAN}}, \code{\link{STD}}, |
46 | 86 |
#' \code{\link{BAG}}, \code{\link{BAGD}}, \code{\link{Q1}}, |
47 | 87 |
#' \code{\link{Q2}}, \code{\link{Q3}}. |
48 |
-#' Every aggregate accepts a string value, execet for COUNTSAMP, which does |
|
88 |
+#' Every aggregate accepts a string value, except for COUNT, which does |
|
49 | 89 |
#' not have any value. |
50 | 90 |
#' Argument of 'aggregate function' must exist in schema, i.e. among region |
51 |
-#' attributes. Two style are allowed: |
|
91 |
+#' attributes. Two styles are allowed: |
|
52 | 92 |
#' \itemize{ |
53 |
-#' \item list of key-value pairs: e.g. sum = SUM("cell") |
|
54 |
-#' \item list of values: e.g. SUM("cell") |
|
93 |
+#' \item list of key-value pairs: e.g. sum = SUM("pvalue") |
|
94 |
+#' \item list of values: e.g. SUM("pvalue") |
|
55 | 95 |
#' } |
56 | 96 |
#' "mixed style" is not allowed |
57 | 97 |
#' |
58 |
-#' |
|
59 | 98 |
#' @return GMQLDataset object. It contains the value to use as input |
60 | 99 |
#' for the subsequent GMQLDataset method |
61 | 100 |
#' |
... | ... |
@@ -68,18 +107,18 @@ group_by.GMQLDateset <- function(.data, groupBy_meta = conds(), |
68 | 107 |
#' ## using customParser |
69 | 108 |
#' |
70 | 109 |
#' init_gmql() |
71 |
-#' test_path <- system.file("example","DATASET",package = "RGMQL") |
|
72 |
-#' exp = read_GMQL(test_path) |
|
110 |
+#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
|
111 |
+#' exp = read_gmql(test_path) |
|
73 | 112 |
#' |
74 | 113 |
#' ## This GMQL statement groups samples of the input 'exp' dataset according |
75 | 114 |
#' ## to their value of the metadata attribute 'tumor_type' and computes the |
76 |
-#' ## maximum value that the metadata attribute size takes inside the samples |
|
115 |
+#' ## maximum value that the metadata attribute 'size' takes inside the samples |
|
77 | 116 |
#' ## belonging to each group. The samples in the output GROUPS_T dataset |
78 | 117 |
#' ## have a new _group metadata attribute which indicates which group they |
79 | 118 |
#' ## belong to, based on the grouping on the metadata attribute tumor_type. |
80 |
-#' ## In addition, they present the new metadata aggregate attribute MaxSize. |
|
81 |
-#' ## Note that the samples without metadata attribute tumor_type are assigned |
|
82 |
-#' ## to a single group with _group value equal 0 |
|
119 |
+#' ## In addition, they present the new metadata aggregate attribute 'MaxSize'. |
|
120 |
+#' ## Note that the samples without metadata attribute 'tumor_type' are |
|
121 |
+#' ## assigned to a single group with _group value equal 0 |
|
83 | 122 |
#' |
84 | 123 |
#' GROUPS_T = group_by(exp, conds("tumor_type"), |
85 | 124 |
#' meta_aggregates = list(max_size = MAX("size"))) |
... | ... |
@@ -5,12 +5,12 @@ |
5 | 5 |
#' |
6 | 6 |
#' @importFrom rJava J |
7 | 7 |
#' |
8 |
-#' @param output_format string identifies the output format of sample files. |
|
9 |
-#' It can be TAB, GTF or COLLECT: |
|
8 |
+#' @param output_format string that identifies the output format of allsample |
|
9 |
+#' files. It can be TAB, GTF or COLLECT: |
|
10 | 10 |
#' \itemize{ |
11 |
-#' \item{TAB: tab delimited file format} |
|
12 |
-#' \item{GTF: tab-delimited text fstandard ormat based on the general |
|
13 |
-#' feature format} |
|
11 |
+#' \item{TAB: tab-delimited file format} |
|
12 |
+#' \item{GTF: tab-delimited text standard format based on the General |
|
13 |
+#' Feature Format} |
|
14 | 14 |
#' \item{COLLECT: used for storing output in memory} |
15 | 15 |
#' } |
16 | 16 |
#' @param remote_processing logical value specifying the processing mode. |
... | ... |
@@ -30,11 +30,11 @@ |
30 | 30 |
#' @examples |
31 | 31 |
#' |
32 | 32 |
#' ## This statement initializes GMQL with local processing with sample files |
33 |
-#' ## output format as tab delimited |
|
33 |
+#' ## output format as tab-delimited |
|
34 | 34 |
#' |
35 | 35 |
#' init_gmql("tab", FALSE) |
36 | 36 |
#' |
37 |
-#' ## initializes GMQL with remote processing |
|
37 |
+#' ## This statement initializes GMQL with remote processing |
|
38 | 38 |
#' |
39 | 39 |
#' remote_url = "http://genomic.deib.polimi.it/gmql-rest-r/" |
40 | 40 |
#' init_gmql(remote_processing = TRUE, url = remote_url) |
... | ... |
@@ -52,7 +52,7 @@ init_gmql <- function(output_format = "GTF", remote_processing = FALSE, |
52 | 52 |
|
53 | 53 |
# mettere attesa da input keyboard, controllare se token già esiste |
54 | 54 |
# da sessione precedente |
55 |
- if(!is.null(url) && !exists("authToken",envir = .GlobalEnv)) |
|
55 |
+ if(!is.null(url) && !exists("GMQL_credentials", envir = .GlobalEnv)) |
|
56 | 56 |
login_gmql(url,username,password) |
57 | 57 |
|
58 | 58 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
... | ... |
@@ -61,7 +61,7 @@ init_gmql <- function(output_format = "GTF", remote_processing = FALSE, |
61 | 61 |
|
62 | 62 |
#' Stop GMQL server |
63 | 63 |
#' |
64 |
-#' Stop GMQL server |
|
64 |
+#' it stops GMQL server processing |
|
65 | 65 |
#' |
66 | 66 |
#' @importFrom rJava J |
67 | 67 |
#' |
... | ... |
@@ -69,8 +69,8 @@ init_gmql <- function(output_format = "GTF", remote_processing = FALSE, |
69 | 69 |
#' |
70 | 70 |
#' @examples |
71 | 71 |
#' |
72 |
-#' ## These statements initializes GMQL with local processing with sample |
|
73 |
-#' ## files output format as tab delimited and then stop it |
|
72 |
+#' ## These statements first initializes GMQL with local processing and with |
|
73 |
+#' ## sample files output format as tab-delimited, and then stops it |
|
74 | 74 |
#' |
75 | 75 |
#' init_gmql("tab", FALSE) |
76 | 76 |
#' |
... | ... |
@@ -91,20 +91,21 @@ stop_gmql <- function() |
91 | 91 |
#' |
92 | 92 |
#' @details |
93 | 93 |
#' The invocation of this function allows to change mode of processing. |
94 |
-#' After invoking collect() it is not possbile to switch the processing mode. |
|
94 |
+#' After invoking collect() function, it is not possbile to switch the |
|
95 |
+#' processing mode. |
|
95 | 96 |
#' |
96 | 97 |
#' @importFrom rJava J |
97 | 98 |
#' |
98 | 99 |
#' @param is_remote logical value used in order to set the processing mode. |
99 |
-#' TRUE you set a remote query processing mode, otherwise it will be local, |
|
100 |
+#' TRUE: you set a remote query processing mode, otherwise it will be local |
|
100 | 101 |
#' |
101 | 102 |
#' @return None |
102 | 103 |
#' |
103 | 104 |
#' @examples |
104 | 105 |
#' |
105 |
-#' ## These statements initializes GMQL with local processing with sample |
|
106 |
-#' ## files output format as tab delimited and then change processing mode |
|
107 |
-#' ## to remote |
|
106 |
+#' ## This statement initializes GMQL with local processing with sample |
|
107 |
+#' ## files output format as tab-delimited, and then it changes processing |
|
108 |
+#' ## mode to remote |
|
108 | 109 |
#' |
109 | 110 |
#' init_gmql("tab", remote_processing = FALSE) |
110 | 111 |
#' |
... | ... |
@@ -4,8 +4,8 @@ |
4 | 4 |
#' |
5 | 5 |
#' @description It takes in input two datasets, respectively known as anchor |
6 | 6 |
#' (left) and experiment (right) and returns a dataset of samples consisting |
7 |
-#' of regions extracted from the operands according to the specified condition |
|
8 |
-#' (a.k.a \emph{genometric_predicate}). |
|
7 |
+#' of regions extracted from the operands according to the specified conditions |
|
8 |
+#' (a.k.a \emph{genometric_predicate} and region_attribute_predicate). |
|
9 | 9 |
#' The number of generated output samples is the Cartesian product |
10 | 10 |
#' of the number of samples in the anchor and in the experiment dataset |
11 | 11 |
#' (if \emph{joinBy} is not specified). |
... | ... |
@@ -19,44 +19,47 @@ |
19 | 19 |
#' @param x GMQLDataset class object |
20 | 20 |
#' @param y GMQLDataset class object |
21 | 21 |
#' |
22 |
-#' @param genometric_predicate it is a list of DISTAL objects |
|
22 |
+#' @param genometric_predicate it is a list of DISTAL objects. |
|
23 | 23 |
#' For details of DISTAL objects see: |
24 | 24 |
#' \code{\link{DLE}}, \code{\link{DGE}}, \code{\link{DL}}, \code{\link{DG}}, |
25 | 25 |
#' \code{\link{MD}}, \code{\link{UP}}, \code{\link{DOWN}} |
26 | 26 |
#' |
27 | 27 |
#' @param joinBy \code{\link{condition_evaluation}} function to support |
28 | 28 |
#' methods with groupBy or JoinBy input paramter |
29 |
-#' @param reg_attr vector of string made up by schema field attribute |
|
29 |
+#' @param reg_attr vector of strings made up by region field attribute names, |
|
30 |
+#' whose values in the paired left and right dataset regions must be equal in |
|
31 |
+#' order to consider the two paired regions |
|
30 | 32 |
#' @param region_output single string that declares which region is given in |
31 | 33 |
#' output for each input pair of left dataset and right dataset regions |
32 |
-#' satisfying the genometric predicate: |
|
34 |
+#' satisfying the genometric predicate and/or the region attribute predicate: |
|
33 | 35 |
#' \itemize{ |
34 | 36 |
#' \item{LEFT: It outputs the anchor regions from 'x' that satisfy the |
35 |
-#' genometric predicate} |
|
37 |
+#' genometric and/or region attribute predicate} |
|
36 | 38 |
#' \item{RIGHT: It outputs the experiment regions from 'y' that satisfy the |
37 |
-#' genometric predicate} |
|
39 |
+#' genometric and/or region attribute predicate} |
|
38 | 40 |
#' \item{INT (intersection): It outputs the overlapping part (intersection) |
39 |
-#' of the 'x' and 'y' regions that satisfy the genometric predicate; if the |
|
40 |
-#' intersection is empty, no output is produced} |
|
41 |
+#' of the 'x' and 'y' regions that satisfy the genometric and/or region |
|
42 |
+#' attribute predicate; if the intersection is empty, no output is produced} |
|
41 | 43 |
#' \item{CAT: It outputs the concatenation between the 'x' and 'y' regions |
42 |
-#' that satisfy the genometric predicate, (i.e. the output regionis defined as |
|
43 |
-#' having left (right) coordinates equal to the minimum (maximum) of the |
|
44 |
-#' corresponding coordinate values in the 'x' and 'y' regions satisfying |
|
45 |
-#' the genometric predicate)} |
|
46 |
-#' \item{LEFT_DIST: It outputs the duplicate elimination of "x" output |
|
47 |
-#' regions with the same values, regardless the "y" paired region and its |
|
48 |
-#' values. In this case, the output regions attributes and their values are |
|
49 |
-#' all those of "x", and the output metadata are equal to the "x" metadata, |
|
50 |
-#' without additional prefixes} |
|
51 |
-#' \item{RIGHT_DIST: It outputs the duplicate elimination of "y" output |
|
52 |
-#' regions with the same values, regardless the "x" paired region and its |
|
53 |
-#' values. In this case, the output regions attributes and their values are |
|
54 |
-#' all those of "y", and the output metadata are equal to the "y" metadata, |
|
55 |
-#' without additional prefixes} |
|
44 |
+#' that satisfy the genometric and/or region attribute predicate, |
|
45 |
+#' (i.e. the output regions defined as having left (right) coordinates equal |
|
46 |
+#' to the minimum (maximum) of the corresponding coordinate values in the |
|
47 |
+#' 'x' and 'y' regions satisfying the genometric and/or region attribute |
|
48 |
+#' predicate)} |
|
49 |
+#' \item{LEFT_DIST: It outputs the duplicate elimination of 'x' output |
|
50 |
+#' regions with the same coordinates and values, regardless the 'y' paired |
|
51 |
+#' region and its values. In this case, the output region attributes and their |
|
52 |
+#' values are all and only those of 'x', and the output metadata are equal |
|
53 |
+#' to the 'x' metadata, without additional prefixes} |
|
54 |
+#' \item{RIGHT_DIST: It outputs the duplicate elimination of 'y' output |
|
55 |
+#' regions with the same coordinates and values, regardless the 'x' paired |
|
56 |
+#' region and its values. In this case, the output regions attributes and their |
|
57 |
+#' values are all and only those of 'y', and the output metadata are equal |
|
58 |
+#' to the 'y' metadata, without additional prefixes} |
|
56 | 59 |
#' \item{BOTH: It outputs the same regions as LEFT, but it adds in the output |
57 |
-#' region attributes the coordinates of the "y" dataset region that, |
|
58 |
-#' together with the output "x" dataset region, satisfies the equi predicate |
|
59 |
-#' and the genometric predicate} |
|
60 |
+#' region attributes the coordinates of the 'y' paired region that, |
|
61 |
+#' together with the 'x' output region, satisfies the genometric and/or |
|
62 |
+#' region attribute predicate} |
|
60 | 63 |
#' } |
61 | 64 |
#' |
62 | 65 |
#' @return GMQLDataset object. It contains the value to use as input |
... | ... |
@@ -64,24 +67,24 @@ |
64 | 67 |
#' |
65 | 68 |
#' @examples |
66 | 69 |
#' |
67 |
-#' ## Thi statement initializes and runs the GMQL server for local execution |
|
70 |
+#' ## This statement initializes and runs the GMQL server for local execution |
|
68 | 71 |
#' ## and creation of results on disk. Then, with system.file() it defines |
69 | 72 |
#' ## the path to the folders "DATASET" and "DATASET_GDM" in the subdirectory |
70 |
-#' ## "example" of the package "RGMQL" and opens such folder as a GMQL |
|
71 |
-#' ## dataset named "exp" and "ref" respectively using customParser |
|
73 |
+#' ## "example" of the package "RGMQL" and opens such folders as a GMQL |
|
74 |
+#' ## datasets named TSS and HM, respectively, using customParser |
|
72 | 75 |
#' |
73 | 76 |
#' init_gmql() |
74 | 77 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
75 | 78 |
#' test_path2 <- system.file("example", "DATASET_GDM", package = "RGMQL") |
76 |
-#' TSS = read_GMQL(test_path) |
|
77 |
-#' HM = read_GMQL(test_path2) |
|
79 |
+#' TSS = read_gmql(test_path) |
|
80 |
+#' HM = read_gmql(test_path2) |
|
78 | 81 |
#' |
79 |
-#' ## Given a dataset 'HM' and one called 'TSS' with a sample including |
|
80 |
-#' ## Transcription Start Site annotations, it searches for those regions of HM |
|
81 |
-#' ## that are at a minimal distance from a transcription start site (TSS) |
|
82 |
-#' ## and takes the first/closest one for each TSS, provided that such distance |
|
83 |
-#' ## is lesser than 120K bases and joined 'tss' and 'hm' samples are obtained |
|
84 |
-#' ## from the same provider (joinby clause). |
|
82 |
+#' ## Given a dataset HM and one called TSS with a sample including |
|
83 |
+#' ## Transcription Start Site annotations, this statement searches for those |
|
84 |
+#' ## regions of HM that are at a minimal distance from a transcription start |
|
85 |
+#' ## site (TSS) and takes the first/closest one for each TSS, provided that |
|
86 |
+#' ## such distance is lesser than 120K bases and joined TSS and HM |
|
87 |
+#' ## samples are obtained from the same provider (joinby clause). |
|
85 | 88 |
#' |
86 | 89 |
#' join_data = merge(TSS, HM, genometric_predicate = list(MD(1), DLE(120000)), |
87 | 90 |
#' conds("provider"), region_output = "RIGHT") |
... | ... |
@@ -2,17 +2,17 @@ |
2 | 2 |
#' |
3 | 3 |
#' It computes, for each sample in the right dataset, aggregates over the |
4 | 4 |
#' values of the right dataset regions that intersect with a region in a left |
5 |
-#' dataset sample, for each region of each sample in the left dataset; |
|
5 |
+#' dataset sample, for each region of each sample in the left dataset. |
|
6 | 6 |
#' The number of generated output samples is the Cartesian product |
7 | 7 |
#' of the samples in the two input datasets; |
8 | 8 |
#' each output sample has the same regions as the related input left dataset |
9 | 9 |
#' sample, with their attributes and values, plus the attributes computed as |
10 | 10 |
#' aggregates over right region values. |
11 | 11 |
#' Output sample metadata are the union of the related input sample metadata, |
12 |
-#' whose attribute names are prefixed with "left" or "right" respectively. |
|
12 |
+#' whose attribute names are prefixed with left or right respectively. |
|
13 | 13 |
#' |
14 |
-#' When the joinby clause is present, only pairs of samples of left_input_data |
|
15 |
-#' and of right_input_data with metadata M1 and M2 respectively that satisfy |
|
14 |
+#' When the joinby clause is present, only pairs of samples of x dataset |
|
15 |
+#' and of y dataset with metadata M1 and M2 respectively that satisfy |
|
16 | 16 |
#' the joinby condition are considered. |
17 | 17 |
#' |
18 | 18 |
#' The clause consists of a list of metadata attribute names that must be |
... | ... |
@@ -42,33 +42,33 @@ |
42 | 42 |
#' @param joinBy \code{\link{conds}} function to support methods with |
43 | 43 |
#' groupBy or JoinBy input parameter |
44 | 44 |
#' @param count_name string defining the metadata count name; if it is |
45 |
-#' not specifying the name is "count_left_right" |
|
45 |
+#' not specified the name is "count_left_right" |
|
46 | 46 |
#' |
47 | 47 |
#' @return GMQLDataset object. It contains the value to use as input |
48 | 48 |
#' for the subsequent GMQLDataset method |
49 | 49 |
#' |
50 | 50 |
#' @examples |
51 | 51 |
#' |
52 |
-#' ## Thi statement initializes and runs the GMQL server for local execution |
|
52 |
+#' ## This statement initializes and runs the GMQL server for local execution |
|
53 | 53 |
#' ## and creation of results on disk. Then, with system.file() it defines |
54 | 54 |
#' ## the path to the folders "DATASET" and "DATASET_GDM" in the subdirectory |
55 |
-#' ## "example" of the package "RGMQL" and opens such folder as a GMQL |
|
56 |
-#' ## dataset named "exp" and "ref" respectively using customParser |
|
55 |
+#' ## "example" of the package "RGMQL", and opens such folders as a GMQL |
|
56 |
+#' ## dataset named "exp" and "ref", respectively, using customParser |
|
57 | 57 |
#' |
58 | 58 |
#' init_gmql() |
59 | 59 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
60 | 60 |
#' test_path2 <- system.file("example", "DATASET_GDM", package = "RGMQL") |
61 |
-#' exp = read_GMQL(test_path) |
|
62 |
-#' ref = read_GMQL(test_path2) |
|
61 |
+#' exp = read_gmql(test_path) |
|
62 |
+#' ref = read_gmql(test_path2) |
|
63 | 63 |
#' |
64 |
-#' # It counts the number of regions in each sample from exp that overlap with |
|
65 |
-#' # a ref region, and for each ref region it computes the minimum score |
|
66 |
-#' # of all the regions in each exp sample that overlap with it. |
|
67 |
-#' # The MAP joinby option ensures that only the exp samples referring to |
|
68 |
-#' # the same 'cell_tissue' of a ref sample are mapped on such ref sample; |
|
69 |
-#' # exp samples with no cell_tissue metadata attribute, or with such metadata |
|
70 |
-#' # but with a different value from the one(s) of ref sample(s), |
|
71 |
-#' # are disregarded. |
|
64 |
+#' ## This statement it counts the number of regions in each sample from exp |
|
65 |
+#' ## dataset that overlap with a ref dataset region, and for each ref region |
|
66 |
+#' ## it computes the minimum score of all the regions in each exp sample that |
|
67 |
+#' ## overlap with it. The MAP joinby option ensures that only the exp samples |
|
68 |
+#' ## referring to the same 'cell_tissue' of a ref sample are mapped on such |
|
69 |
+#' ## ref sample; exp samples with no cell_tissue metadata attribute, or with |
|
70 |
+#' ## such metadata attribute, but with a different value from the one(s) |
|
71 |
+#' ## of ref sample(s), are disregarded. |
|
72 | 72 |
#' |
73 | 73 |
#' out = map(ref, exp, minScore = MIN("score"), joinBy = conds("cell_tissue")) |
74 | 74 |
#' |
... | ... |
@@ -8,18 +8,19 @@ |
8 | 8 |
#' @return None |
9 | 9 |
#' |
10 | 10 |
#' @examples |
11 |
-#' ## Thi statement initializes and runs the GMQL server for local execution |
|
11 |
+#' ## This statement initializes and runs the GMQL server for local execution |
|
12 | 12 |
#' ## and creation of results on disk. Then, with system.file() it defines |
13 |
-#' ## the path to the folders "DATASET" in the subdirectory "example" |
|
13 |
+#' ## the path to the folder "DATASET" in the subdirectory "example" |
|
14 | 14 |
#' ## of the package "RGMQL" and opens such folder as a GMQL dataset |
15 | 15 |
#' ## named "data" |
16 | 16 |
#' |
17 | 17 |
#' init_gmql() |
18 |
-#' test_path <- system.file("example","DATASET",package = "RGMQL") |
|
19 |
-#' data = read_GMQL(test_path) |
|
18 |
+#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
|
19 |
+#' data = read_gmql(test_path) |
|
20 | 20 |
#' |
21 |
-#' ## The following statement materialize the dataset, previoulsy read, at |
|
22 |
-#' ## th specific destination path into local folder "ds1" opportunely created |
|
21 |
+#' ## The following statement materializes the dataset "data", previoulsy read, |
|
22 |
+#' ## at the specific destination test_path into local folder "ds1" opportunely |
|
23 |
+#' ## created |
|
23 | 24 |
#' |
24 | 25 |
#' collect(data, dir_out = test_path) |
25 | 26 |
#' |
... | ... |
@@ -95,9 +96,9 @@ collect.GMQLDataset <- function(x, dir_out = getwd(), name = "ds1") |
95 | 96 |
#' @importFrom dplyr collect |
96 | 97 |
#' |
97 | 98 |
#' @param x GMQLDataset class object |
98 |
-#' @param dir_out destination folder path. |
|
99 |
-#' by default it is the current working directory of the R process |
|
100 |
-#' @param name name of the result dataset. by default it is the string "ds1" |
|
99 |
+#' @param dir_out destination folder path. By default it is the current |
|
100 |
+#' working directory of the R process |
|
101 |
+#' @param name name of the result dataset. By default it is the string "ds1" |
|
101 | 102 |
#' |
102 | 103 |
#' @details |
103 | 104 |
#' |
... | ... |
@@ -116,10 +117,11 @@ collect.GMQLDataset <- function(x, dir_out = getwd(), name = "ds1") |
116 | 117 |
#' |
117 | 118 |
#' init_gmql() |
118 | 119 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
119 |
-#' data = read_GMQL(test_path) |
|
120 |
+#' data = read_gmql(test_path) |
|
120 | 121 |
#' |
121 |
-#' ## The following statement materialize the dataset, previoulsy read, at |
|
122 |
-#' ## th specific destination path into local folder "ds1" opportunely created |
|
122 |
+#' ## The following statement materializes the dataset 'data', previoulsy read, |
|
123 |
+#' ## at the specific destination test_path into local folder "ds1" opportunely |
|
124 |
+#' ## created |
|
123 | 125 |
#' |
124 | 126 |
#' collect(data, dir_out = test_path) |
125 | 127 |
#' |
... | ... |
@@ -170,9 +172,9 @@ gmql_materialize <- function(input_data, dir_out, name) |
170 | 172 |
#' @importFrom GenomicRanges GRangesList |
171 | 173 |
#' |
172 | 174 |
#' @param .data returned object from any GMQL function |
173 |
-#' @param rows number of rows for each sample regions that you want to |
|
175 |
+#' @param rows number of regions rows for each sample that you want to |
|
174 | 176 |
#' retrieve and store in memory. |
175 |
-#' By default it is 0 that means take all rows for each sample |
|
177 |
+#' By default it is 0, that means take all rows for each sample |
|
176 | 178 |
#' |
177 | 179 |
#' @param ... Additional arguments for use in other specific methods of the |
178 | 180 |
#' generic take function |
... | ... |
@@ -188,9 +190,21 @@ gmql_materialize <- function(input_data, dir_out, name) |
188 | 190 |
#' |
189 | 191 |
#' init_gmql() |
190 | 192 |
#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
191 |
-#' rd = read_GMQL(test_path) |
|
193 |
+#' rd = read_gmql(test_path) |
|
194 |
+#' |
|
195 |
+#' ## This statement creates a dataset called merged which contains one |
|
196 |
+#' ## sample for each antibody_target and cell value found within the metadata |
|
197 |
+#' ## of the exp dataset sample; each created sample contains all regions |
|
198 |
+#' ## from all 'exp' samples with a specific value for their |
|
199 |
+#' ## antibody_target and cell metadata attributes. |
|
200 |
+#' |
|
201 |
+#' aggr = aggregate(rd, conds(c("antibody_target", "cell"))) |
|
202 |
+#' |
|
203 |
+#' ## This statement performs the query and return the resulted dataset as |
|
204 |
+#' ## GRangesList named 'taken'. It returns only the first 45 regions of |
|
205 |
+#' ## each sample present into GRangesList and all the medatata associated to |
|
206 |
+#' ## each sample |
|
192 | 207 |
#' |
193 |
-#' aggr = aggregate(rd, conds(c("antibody_target", "cell_karyotype"))) |
|
194 | 208 |
#' taken <- take(aggr, rows = 45) |
195 | 209 |
#' |
196 | 210 |
#' @name take |
... | ... |
@@ -5,7 +5,7 @@ |
5 | 5 |
#' @description It builds a dataset consisting of a single sample having as |
6 | 6 |
#' many regions as the number of regions of all the input dataset samples |
7 | 7 |
#' and as many metadata as the union of the 'attribute-value' tuples of the |
8 |
-#' input samples. If \emph{groupBy} is specified: the samples are then |
|
8 |
+#' input samples. If \emph{groupBy} is specified, the samples are then |
|
9 | 9 |
#' partitioned in groups, each with a distinct value of the grouping metadata |
10 | 10 |
#' attributes. The operation is separately applied to each group, yielding |
11 | 11 |
#' one sample in the result for each group. Samples whose metadata are |
... | ... |
@@ -30,17 +30,16 @@ |
30 | 30 |
#' ## using customParser |
31 | 31 |
#' |
32 | 32 |
#' init_gmql() |
33 |
-#' test_path <- system.file("example","DATASET",package = "RGMQL") |
|
34 |
-#' exp = read_GMQL(test_path) |
|
33 |
+#' test_path <- system.file("example", "DATASET", package = "RGMQL") |
|
34 |
+#' exp = read_gmql(test_path) |
|
35 | 35 |
#' |
36 | 36 |
#' ## This statement creates a dataset called merged which contains one |
37 | 37 |
#' ## sample for each antibody_target and cell value found within the metadata |
38 | 38 |
#' ## of the exp dataset sample; each created sample contains all regions |
39 | 39 |
#' ## from all 'exp' samples with a specific value for their |
40 |
-#' ## antibody_target and cell metadata |
|
41 |
-#' ## attributes. |
|
40 |
+#' ## antibody_target and cell metadata attributes. |
|
42 | 41 |
#' |
43 |
-#' merged = aggregate(exp, conds(c("antibody_target","cell"))) |
|
42 |
+#' merged = aggregate(exp, conds(c("antibody_target", "cell"))) |
|
44 | 43 |
#' |
45 | 44 |
#' @name aggregate |
46 | 45 |
#' @rdname aggregate |
... | ... |
@@ -14,11 +14,11 @@ arrange.GMQLDataset <- function(.data, metadata_ordering = NULL, |
14 | 14 |
#' @description It is used to order either samples or sample regions or both, |
15 | 15 |
#' according to a set of metadata and/or region attributes. |
16 | 16 |
#' Order can be specified as ascending / descending for every attribute. |
17 |
-#' The number of samples and their regions remain the same as well as |
|
18 |
-#' their attributes, (unless fetching options are specified) but a new |
|
19 |
-#' ordering metadata and/or region attribute is added. |
|
20 |
-#' Sorted samples or regions have a new attribute "order", |
|
21 |
-#' added to either metadata, or regions, or both of them as specified in inputs |
|
17 |
+#' The number of samples s well as their attributes and their regions remain |
|
18 |
+#' the same, (unless fetching options are specified), but a new ordering |
|
19 |
+#' metadata and/or region attribute is added. Sorted samples or regions have |
|
20 |
+#' a new attribute "_order", added to either metadata, or "order" added to |
|
21 |
+#' their regions, or both of them as specified in input. |
|
22 | 22 |
#' |
23 | 23 |
#' @importFrom rJava J .jnull .jarray |
24 | 24 |
#' @importFrom dplyr arrange |
... | ... |
@@ -40,7 +40,7 @@ arrange.GMQLDataset <- function(.data, metadata_ordering = NULL, |
40 | 40 |
#' |
41 | 41 |
#' @param num_fetch integer value identifying the number of samples to fetch; |
42 | 42 |
#' by default it is 0, that means all samples are fetched |
43 |
-#' s |
|
43 |
+#' |
|
44 | 44 |
#' @param regions_ordering list of ordering functions containing name of |
45 | 45 |
#' region attribute. |
46 | 46 |
#' The functions available are: \code{\link{ASC}}, \code{\link{DESC}}. |
... | ... |
@@ -54,8 +54,8 @@ arrange.GMQLDataset <- function(.data, metadata_ordering = NULL, |
54 | 54 |
#' } |
55 | 55 |
#' if NULL, \emph{reg_num_fetch} is not considered |
56 | 56 |
#' |
57 |
-#' @param reg_num_fetch integer value identifying the number of region to fetch |
|
58 |
-#' by default it is 0, that means all regions are fetched |
|
57 |
+#' @param reg_num_fetch integer value identifying the number of regions to |