... | ... |
@@ -1,11 +1,10 @@ |
1 | 1 |
Package: RGMQL |
2 | 2 |
Type: Package |
3 | 3 |
Title: GenoMetric Query Language for R/Bioconductor |
4 |
-Version: 0.99.24 |
|
4 |
+Version: 0.99.25 |
|
5 | 5 |
Author: Simone Pallotta, Marco Masseroli |
6 | 6 |
Maintainer: Simone Pallotta <simonepallotta@hotmail.com> |
7 |
-Description: This RGMQL package brings the GenoMetric Query Language (GMQL) |
|
8 |
- functionalities into the R environment. |
|
7 |
+Description: This RGMQL package brings the GenoMetric Query Language (GMQL) functionalities into the R environment. |
|
9 | 8 |
GMQL is a high-level, declarative language to query and compare multiple and heterogeneous genomic |
10 | 9 |
datasets for biomedical knowledge discovery. It allows expressing easily queries |
11 | 10 |
and processing over genomic regions and their metadata, in a way similar to |
... | ... |
@@ -64,7 +64,7 @@ extend <-function(input_data, metadata = NULL) |
64 | 64 |
dispatch = TRUE) |
65 | 65 |
else |
66 | 66 |
metadata_matrix <- .jnull("java/lang/String") |
67 |
- |
|
67 |
+ |
|
68 | 68 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
69 | 69 |
response <- WrappeR$extend(metadata_matrix,input_data$value) |
70 | 70 |
error <- strtoi(response[1]) |
... | ... |
@@ -47,7 +47,7 @@ |
47 | 47 |
#' @export |
48 | 48 |
#' |
49 | 49 |
filter_and_extract <- function(data, metadata = NULL, |
50 |
- metadata_prefix = NULL, regions = NULL) |
|
50 |
+ metadata_prefix = NULL, regions = NULL) |
|
51 | 51 |
{ |
52 | 52 |
if(is(data,"GRangesList")) |
53 | 53 |
.extract_from_GRangesList(data,metadata,metadata_prefix,regions) |
... | ... |
@@ -38,7 +38,7 @@ import_gmql <- function(dataset_path, is_gtf) |
38 | 38 |
datasetName <- sub("/*[/]$","",datasetName) |
39 | 39 |
if(basename(datasetName) !="files") |
40 | 40 |
datasetName <- paste0(datasetName,"/files") |
41 |
- |
|
41 |
+ |
|
42 | 42 |
if(!dir.exists(datasetName)) |
43 | 43 |
stop("Directory does not exists") |
44 | 44 |
|
... | ... |
@@ -101,7 +101,7 @@ join <- function(right_input_data, left_input_data, |
101 | 101 |
warning("only 4 element per list, we cut the rest") |
102 | 102 |
length(list_pred)=4 |
103 | 103 |
} |
104 |
- |
|
104 |
+ |
|
105 | 105 |
if(!all(sapply(list_pred, function(x) {is(x,"DISTAL")} ))) |
106 | 106 |
stop("All elements should be DISTAL object") |
107 | 107 |
}) |
... | ... |
@@ -120,18 +120,18 @@ join <- function(right_input_data, left_input_data, |
120 | 120 |
} |
121 | 121 |
else |
122 | 122 |
genomatrix <- .jnull("java/lang/String") |
123 |
- |
|
123 |
+ |
|
124 | 124 |
if(!is.null(joinBy)) |
125 |
- join_condition_matrix <- .jarray(.join_condition(joinBy), |
|
125 |
+ join_condition_matrix <- .jarray(.join_condition(joinBy), |
|
126 | 126 |
dispatch = TRUE) |
127 | 127 |
else |
128 | 128 |
join_condition_matrix <- .jnull("java/lang/String") |
129 |
- |
|
129 |
+ |
|
130 | 130 |
ouput <- toupper(region_output) |
131 | 131 |
if(!identical(ouput,"CONTIG") && !identical(ouput,"LEFT") && |
132 | 132 |
!identical(ouput,"RIGHT") && !identical(ouput,"INT")) |
133 | 133 |
stop("region_output must be contig,left,right or int (intersection)") |
134 |
- |
|
134 |
+ |
|
135 | 135 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
136 | 136 |
response <- WrappeR$join(genomatrix,join_condition_matrix, |
137 | 137 |
ouput,right_input_data$value, |
... | ... |
@@ -88,13 +88,13 @@ map <- function(left_input_data, right_input_data, aggregates = NULL, |
88 | 88 |
dispatch = TRUE) |
89 | 89 |
else |
90 | 90 |
metadata_matrix = .jnull("java/lang/String") |
91 |
- |
|
91 |
+ |
|
92 | 92 |
if(!is.null(joinBy)) |
93 | 93 |
join_condition_matrix <- .jarray(.join_condition(joinBy), |
94 | 94 |
dispatch = TRUE) |
95 | 95 |
else |
96 | 96 |
join_condition_matrix <- .jnull("java/lang/String") |
97 |
- |
|
97 |
+ |
|
98 | 98 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
99 | 99 |
response<-WrappeR$map(join_condition_matrix, metadata_matrix, |
100 | 100 |
left_input_data$value, right_input_data$value) |
... | ... |
@@ -28,7 +28,7 @@ execute <- function() |
28 | 28 |
remote_proc <- WrappeR$is_remote_processing() |
29 | 29 |
if(!remote_proc) |
30 | 30 |
.download_or_upload() |
31 |
- |
|
31 |
+ |
|
32 | 32 |
response <- WrappeR$execute() |
33 | 33 |
error <- strtoi(response[1]) |
34 | 34 |
data <- response[2] |
... | ... |
@@ -143,14 +143,14 @@ take <- function(input_data, rows=0L) |
143 | 143 |
rows <- as.integer(rows[1]) |
144 | 144 |
if(rows<0) |
145 | 145 |
stop("rows cannot be negative") |
146 |
- |
|
146 |
+ |
|
147 | 147 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
148 | 148 |
response <- WrappeR$take(input_data$value,rows) |
149 | 149 |
error <- strtoi(response[1]) |
150 | 150 |
data <- response[2] |
151 | 151 |
if(error!=0) |
152 | 152 |
stop(data) |
153 |
- |
|
153 |
+ |
|
154 | 154 |
reg <- .jevalArray(WrappeR$get_reg(),simplify = TRUE) |
155 | 155 |
if(is.null(reg)) |
156 | 156 |
stop("no regions defined") |
... | ... |
@@ -160,11 +160,11 @@ take <- function(input_data, rows=0L) |
160 | 160 |
schema <- .jevalArray(WrappeR$get_schema(),simplify = TRUE) |
161 | 161 |
if(is.null(schema)) |
162 | 162 |
stop("no schema defined") |
163 |
- |
|
163 |
+ |
|
164 | 164 |
reg_data_frame <- as.data.frame(reg) |
165 | 165 |
list <- split(reg_data_frame, reg_data_frame[1]) |
166 | 166 |
names <- c("seqname","start","end","strand",schema) |
167 |
- |
|
167 |
+ |
|
168 | 168 |
sampleList <- lapply(list, function(x){ |
169 | 169 |
x <- x[-1] |
170 | 170 |
names(x) <- names |
... | ... |
@@ -175,7 +175,7 @@ take <- function(input_data, rows=0L) |
175 | 175 |
}) |
176 | 176 |
gRange_list <- GRangesList(sampleList) |
177 | 177 |
meta_list <- .metadata_from_frame_to_list(meta) |
178 |
- |
|
178 |
+ |
|
179 | 179 |
S4Vectors::metadata(gRange_list) <- meta_list |
180 | 180 |
return(gRange_list) |
181 | 181 |
} |
... | ... |
@@ -57,7 +57,7 @@ merge <- function(input_data, groupBy = NULL) |
57 | 57 |
dispatch = TRUE) |
58 | 58 |
else |
59 | 59 |
join_condition_matrix <- .jnull("java/lang/String") |
60 |
- |
|
60 |
+ |
|
61 | 61 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
62 | 62 |
response <- WrappeR$merge(join_condition_matrix,input_data$value) |
63 | 63 |
error <- strtoi(response[1]) |
... | ... |
@@ -70,22 +70,22 @@ order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0, |
70 | 70 |
if(!is.numeric(mtop) || !is.numeric(mtopg) || !is.numeric(rtop) || |
71 | 71 |
!is.numeric(rtopg) || !is.numeric(mtopp)|| !is.numeric(rtopp)) |
72 | 72 |
stop("mtop, rtop, rtopg and mtopg must be integer") |
73 |
- |
|
73 |
+ |
|
74 | 74 |
if(length(mtop)>1 || length(mtopg)>1 || length(rtop)>1 || length(rtopg)>1 |
75 | 75 |
|| length(mtopp)>1 || length(rtopp)>1) |
76 | 76 |
warning("only first element: rtop, mtop, mtopg, rtopg, rtopp, mtopp") |
77 |
- |
|
77 |
+ |
|
78 | 78 |
# we consider only the first element even if input is a vector of Int |
79 | 79 |
# we cut the other arguments |
80 |
- |
|
80 |
+ |
|
81 | 81 |
mtop = as.integer(mtop[1]) |
82 | 82 |
mtopg = as.integer(mtopg[1]) |
83 | 83 |
mtopp = as.integer(mtopp[1]) |
84 |
- |
|
84 |
+ |
|
85 | 85 |
rtop = as.integer(rtop[1]) |
86 | 86 |
rtopg = as.integer(rtopg[1]) |
87 | 87 |
rtopp = as.integer(rtopp[1]) |
88 |
- |
|
88 |
+ |
|
89 | 89 |
if(mtop > 0 && mtopg >0) |
90 | 90 |
{ |
91 | 91 |
warning("cannot be used together.\nWe set mtopg = 0") |
... | ... |
@@ -97,43 +97,43 @@ order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0, |
97 | 97 |
warning("cannot be used together.\nWe set mtopp = 0") |
98 | 98 |
mtopp = 0L |
99 | 99 |
} |
100 |
- |
|
100 |
+ |
|
101 | 101 |
if(mtopg >0 && mtopp>0) |
102 | 102 |
{ |
103 | 103 |
warning("cannot be used together.\nWe set mtopp = 0") |
104 | 104 |
mtopp = 0L |
105 | 105 |
} |
106 |
- |
|
106 |
+ |
|
107 | 107 |
if(rtop > 0 && rtopg >0) |
108 | 108 |
{ |
109 | 109 |
warning("cannot be used together.\nWe set rtopg = 0") |
110 | 110 |
rtopg = 0L |
111 | 111 |
} |
112 |
- |
|
112 |
+ |
|
113 | 113 |
if(rtop >0 && rtopp>0) |
114 | 114 |
{ |
115 | 115 |
warning("cannot be used together.\nWe set rtopp = 0") |
116 | 116 |
rtopp = 0L |
117 | 117 |
} |
118 |
- |
|
118 |
+ |
|
119 | 119 |
if(rtopg >0 && rtopp>0) |
120 | 120 |
{ |
121 | 121 |
warning("cannot be used together.\nWe set rtopp = 0") |
122 | 122 |
rtopp = 0L |
123 | 123 |
} |
124 |
- |
|
124 |
+ |
|
125 | 125 |
if(!is.null(metadata_ordering)) |
126 | 126 |
meta_matrix <- .jarray(.ordering_meta(metadata_ordering), |
127 | 127 |
dispatch = TRUE) |
128 | 128 |
else |
129 | 129 |
meta_matrix <- .jnull("java/lang/String") |
130 |
- |
|
130 |
+ |
|
131 | 131 |
if(!is.null(regions_ordering)) |
132 | 132 |
region_matrix <- .jarray(.ordering_meta(regions_ordering), |
133 | 133 |
dispatch = TRUE) |
134 | 134 |
else |
135 | 135 |
region_matrix <- .jnull("java/lang/String") |
136 |
- |
|
136 |
+ |
|
137 | 137 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
138 | 138 |
response <- WrappeR$order(meta_matrix, mtopg, mtop, mtopp, region_matrix, |
139 | 139 |
rtopg, rtop, rtopp, input_data$value) |
... | ... |
@@ -169,6 +169,7 @@ order <- function(input_data, metadata_ordering = NULL, mtop = 0, mtopg = 0, |
169 | 169 |
} |
170 | 170 |
else |
171 | 171 |
stop("only list or character") |
172 |
+ |
|
172 | 173 |
} |
173 | 174 |
|
174 | 175 |
|
... | ... |
@@ -95,34 +95,34 @@ project <-function(input_data, metadata = NULL, metadata_update=NULL, |
95 | 95 |
{ |
96 | 96 |
if(!is.character(metadata)) |
97 | 97 |
stop("metadata: no valid input") |
98 |
- |
|
98 |
+ |
|
99 | 99 |
metadata <- metadata[!metadata %in% ""] |
100 | 100 |
metadata <- metadata[!duplicated(metadata)] |
101 |
- |
|
101 |
+ |
|
102 | 102 |
if(length(metadata)==0) |
103 | 103 |
metadata <- .jnull("java/lang/String") |
104 |
- |
|
104 |
+ |
|
105 | 105 |
metadata <- .jarray(metadata) |
106 | 106 |
} |
107 | 107 |
else |
108 | 108 |
metadata <- .jnull("java/lang/String") |
109 |
- |
|
109 |
+ |
|
110 | 110 |
if(!is.null(regions)) |
111 | 111 |
{ |
112 | 112 |
if(!is.character(regions)) |
113 | 113 |
stop("regions: no valid input") |
114 |
- |
|
114 |
+ |
|
115 | 115 |
regions = regions[!regions %in% ""] |
116 | 116 |
regions = regions[!duplicated(regions)] |
117 |
- |
|
117 |
+ |
|
118 | 118 |
if(length(regions)==0) |
119 | 119 |
regions <- .jnull("java/lang/String") |
120 |
- |
|
120 |
+ |
|
121 | 121 |
regions <- .jarray(regions) |
122 | 122 |
} |
123 | 123 |
else |
124 | 124 |
regions <- .jnull("java/lang/String") |
125 |
- |
|
125 |
+ |
|
126 | 126 |
reg_update <- substitute(regions_update) |
127 | 127 |
if(!is.null(reg_update)) |
128 | 128 |
{ |
... | ... |
@@ -131,7 +131,7 @@ project <-function(input_data, metadata = NULL, metadata_update=NULL, |
131 | 131 |
} |
132 | 132 |
else |
133 | 133 |
regions_update <- .jnull("java/lang/String") |
134 |
- |
|
134 |
+ |
|
135 | 135 |
meta_update <- substitute(metadata_update) |
136 | 136 |
if(!is.null(meta_update)) |
137 | 137 |
{ |
... | ... |
@@ -140,15 +140,15 @@ project <-function(input_data, metadata = NULL, metadata_update=NULL, |
140 | 140 |
} |
141 | 141 |
else |
142 | 142 |
metadata_update <- .jnull("java/lang/String") |
143 |
- |
|
143 |
+ |
|
144 | 144 |
if(length(all_but_meta)>1) |
145 | 145 |
warning("all_but_meta: no multiple values") |
146 |
- |
|
146 |
+ |
|
147 | 147 |
if(length(all_but_reg)>1) |
148 | 148 |
warning("all_but_reg: no multiple values") |
149 | 149 |
all_but_reg <- all_but_reg[1] |
150 | 150 |
all_but_meta <- all_but_meta[1] |
151 |
- |
|
151 |
+ |
|
152 | 152 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
153 | 153 |
response <- WrappeR$project(metadata,metadata_update,all_but_meta, |
154 | 154 |
regions,regions_update, |
... | ... |
@@ -52,12 +52,12 @@ init_gmql <- function(output_format = "gtf", remote_processing = FALSE, |
52 | 52 |
!identical(out_format,"COLLECT")) |
53 | 53 |
stop("output_format must be TAB, GTF or COLLECT") |
54 | 54 |
.check_logical(remote_processing) |
55 |
- |
|
55 |
+ |
|
56 | 56 |
# mettere attesa da input keyboard, controllare se token già esiste |
57 | 57 |
# da sessione precedente |
58 | 58 |
if(!is.null(url) && !exists("authToken",envir = .GlobalEnv)) |
59 | 59 |
login_gmql(url,username,password) |
60 |
- |
|
60 |
+ |
|
61 | 61 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
62 | 62 |
WrappeR$initGMQL(out_format,remote_processing) |
63 | 63 |
} |
... | ... |
@@ -132,7 +132,7 @@ read_dataset <- function(dataset, parser = "CustomParser", is_local=TRUE, |
132 | 132 |
{ |
133 | 133 |
if(!dir.exists(dataset)) |
134 | 134 |
stop("folder does not exist") |
135 |
- |
|
135 |
+ |
|
136 | 136 |
dataset <- sub("/*[/]$","",dataset) |
137 | 137 |
if(basename(dataset) !="files") |
138 | 138 |
dataset <- paste0(dataset,"/files") |
... | ... |
@@ -145,16 +145,16 @@ read_dataset <- function(dataset, parser = "CustomParser", is_local=TRUE, |
145 | 145 |
url <- WrappeR$get_url() |
146 | 146 |
if(is.null(url)) |
147 | 147 |
stop("You have to log on using login function") |
148 |
- |
|
148 |
+ |
|
149 | 149 |
if(!exists("authToken",envir = .GlobalEnv)) |
150 | 150 |
stop("You have to log on using login function") |
151 |
- |
|
151 |
+ |
|
152 | 152 |
list <- show_schema(url,dataset) |
153 | 153 |
schema_names <- sapply(list$fields, function(x){x$name}) |
154 | 154 |
schema_type <- sapply(list$fields, function(x){x$type}) |
155 | 155 |
schema_matrix <- cbind(schema_type,schema_names) |
156 | 156 |
#schema_type <- list$type |
157 |
- |
|
157 |
+ |
|
158 | 158 |
if(is.null(schema_matrix) || length(schema_matrix)==0) |
159 | 159 |
schema_matrix <- .jnull("java/lang/String") |
160 | 160 |
else |
... | ... |
@@ -203,7 +203,7 @@ read <- function(samples) |
203 | 203 |
{ |
204 | 204 |
if(!is(samples,"GRangesList")) |
205 | 205 |
stop("only GrangesList") |
206 |
- |
|
206 |
+ |
|
207 | 207 |
meta <- S4Vectors::metadata(samples) |
208 | 208 |
if(is.null(meta) || length(meta)==0) { |
209 | 209 |
#repeat meta for each sample in samples list |
... | ... |
@@ -212,17 +212,19 @@ read <- function(samples) |
212 | 212 |
We provide two metadata for you") |
213 | 213 |
index_meta <- rep(1:len,each = len) |
214 | 214 |
rep_meta <- rep(c("Provider","Polimi", "Application", "R-GMQL"), |
215 |
- times=len) |
|
215 |
+ times=len) |
|
216 | 216 |
meta_matrix <- matrix(rep_meta,ncol = 2,byrow = TRUE) |
217 | 217 |
meta_matrix <- cbind(index_meta,meta_matrix) |
218 | 218 |
} |
219 |
- else { |
|
219 |
+ else |
|
220 |
+ { |
|
220 | 221 |
unlist_meta <- unlist(meta) |
221 | 222 |
names_meta <- names(unlist_meta) |
222 | 223 |
group_names <- gsub(".*_([0-9]*)\\..*","\\1", names_meta) |
223 | 224 |
names(unlist_meta) <- NULL |
224 | 225 |
meta_matrix <- cbind(group_names,names_meta,unlist_meta) |
225 | 226 |
} |
227 |
+ |
|
226 | 228 |
df <- data.frame(samples) |
227 | 229 |
df <- df[-2] #delete group_name |
228 | 230 |
region_matrix <- as.matrix(sapply(df, as.character)) |
... | ... |
@@ -249,11 +251,11 @@ We provide two metadata for you") |
249 | 251 |
} |
250 | 252 |
rownames(schema_matrix) <- NULL |
251 | 253 |
colnames(schema_matrix) <- NULL |
252 |
- |
|
254 |
+ |
|
253 | 255 |
schema_matrix <- .jarray(schema_matrix,dispatch = TRUE) |
254 | 256 |
meta_matrix <- .jarray(meta_matrix,dispatch = TRUE) |
255 | 257 |
region_matrix <- .jarray(region_matrix,dispatch = TRUE) |
256 |
- |
|
258 |
+ |
|
257 | 259 |
WrappeR <- J("it/polimi/genomics/r/Wrapper") |
258 | 260 |
response <- WrappeR$read(meta_matrix,region_matrix,schema_matrix) |
259 | 261 |
DataSet(response) |
... | ... |
@@ -98,7 +98,7 @@ select <- function(input_data, predicate = NULL, region_predicate = NULL, |
98 | 98 |
} |
99 | 99 |
else |
100 | 100 |
predicate <- .jnull("java/lang/String") |
101 |
- |
|
101 |
+ |
|
102 | 102 |
reg_pred <- substitute(region_predicate) |
103 | 103 |
if(!is.null(reg_pred)) |
104 | 104 |
{ |
... | ... |
@@ -107,7 +107,7 @@ select <- function(input_data, predicate = NULL, region_predicate = NULL, |
107 | 107 |
} |
108 | 108 |
else |
109 | 109 |
region_predicate <- .jnull("java/lang/String") |
110 |
- |
|
110 |
+ |
|
111 | 111 |
if(is.null(semi_join) && is.null(semi_join_dataset)) |
112 | 112 |
{ |
113 | 113 |
join_condition_matrix <- .jnull("java/lang/String") |
... | ... |
@@ -22,7 +22,7 @@ |
22 | 22 |
full.names = TRUE) |
23 | 23 |
if(length(schema_name)==0) |
24 | 24 |
stop("schema not present") |
25 |
- |
|
25 |
+ |
|
26 | 26 |
xml_schema <- xml2::read_xml(schema_name) |
27 | 27 |
list_field <- xml2::as_list(xml_schema) |
28 | 28 |
vector_field <- unlist(list_field) |
... | ... |
@@ -33,10 +33,10 @@ |
33 | 33 |
{ |
34 | 34 |
if(!is.list(meta_data)) |
35 | 35 |
stop("meta_data: invalid input") |
36 |
- |
|
36 |
+ |
|
37 | 37 |
if(!all(sapply(meta_data, function(x) is(x,class)))) |
38 | 38 |
stop("All elements must be META_AGGREGATES object") |
39 |
- |
|
39 |
+ |
|
40 | 40 |
names <- names(meta_data) |
41 | 41 |
if(is.null(names)) |
42 | 42 |
{ |
... | ... |
@@ -46,12 +46,13 @@ |
46 | 46 |
else |
47 | 47 |
{ |
48 | 48 |
if("" %in% names) |
49 |
- stop("No partial names assignment is allowed") |
|
49 |
+ stop("No partial names assignment is allowed") |
|
50 | 50 |
} |
51 | 51 |
aggregate_matrix <- t(sapply(meta_data, function(x) { |
52 | 52 |
new_value = as.character(x) |
53 | 53 |
matrix <- matrix(new_value) |
54 | 54 |
})) |
55 |
+ |
|
55 | 56 |
m_names <- matrix(names) |
56 | 57 |
metadata_matrix <- cbind(m_names,aggregate_matrix) |
57 | 58 |
} |
... | ... |
@@ -94,7 +95,7 @@ |
94 | 95 |
{ |
95 | 96 |
if(!is.character(value)) |
96 | 97 |
stop("no valid data") |
97 |
- |
|
98 |
+ |
|
98 | 99 |
if(length(value)>1) |
99 | 100 |
stop("no multiple string") |
100 | 101 |
} |
... | ... |
@@ -103,7 +104,7 @@ |
103 | 104 |
{ |
104 | 105 |
if(!is.logical(value)) |
105 | 106 |
stop("no valid data") |
106 |
- |
|
107 |
+ |
|
107 | 108 |
if(length(value)>1) |
108 | 109 |
stop("no multiple string") |
109 | 110 |
} |
... | ... |
@@ -5,9 +5,7 @@ |
5 | 5 |
|
6 | 6 |
ORDER <- function(value) |
7 | 7 |
{ |
8 |
- op_list <- list( |
|
9 |
- value = value |
|
10 |
- ) |
|
8 |
+ op_list <- list(value = value) |
|
11 | 9 |
## Set the name for the class |
12 | 10 |
class(op_list) <- "ORDER" |
13 | 11 |
return(op_list) |
... | ... |
@@ -17,7 +15,7 @@ check.ORDER <- function(value) |
17 | 15 |
{ |
18 | 16 |
if(is.character(value) && length(value)>1) |
19 | 17 |
stop("value: no multiple string") |
20 |
- |
|
18 |
+ |
|
21 | 19 |
if(!is.character(value)) |
22 | 20 |
stop("value: is not a string") |
23 | 21 |
} |
... | ... |
@@ -26,8 +24,7 @@ print.ORDER <- function(obj) { |
26 | 24 |
as.character(obj) |
27 | 25 |
} |
28 | 26 |
|
29 |
-c.ORDER <- function(...) |
|
30 |
-{ |
|
27 |
+c.ORDER <- function(...) { |
|
31 | 28 |
a <- list(...) |
32 | 29 |
} |
33 | 30 |
|
... | ... |
@@ -63,7 +60,7 @@ as.character.ORDER <- function(obj) { |
63 | 60 |
DESC <- function(value) |
64 | 61 |
{ |
65 | 62 |
check.ORDER(value) |
66 |
- |
|
63 |
+ |
|
67 | 64 |
list <- list(value = value) |
68 | 65 |
## Set the name for the class |
69 | 66 |
class(list) <- c("DESC","ORDER") |
... | ... |
@@ -98,7 +95,7 @@ DESC <- function(value) |
98 | 95 |
ASC <- function(value) |
99 | 96 |
{ |
100 | 97 |
check.ORDER(value) |
101 |
- |
|
98 |
+ |
|
102 | 99 |
list <- list(value = value) |
103 | 100 |
## Set the name for the class |
104 | 101 |
class(list) <- c("ASC","ORDER") |
... | ... |
@@ -201,11 +201,11 @@ run_query <- function(url, fileName, query, output_gtf = TRUE) |
201 | 201 |
out <- "GTF" |
202 | 202 |
else |
203 | 203 |
out <- "TAB" |
204 |
- |
|
204 |
+ |
|
205 | 205 |
URL <- paste0(url,"/queries/run/",fileName,"/",out) |
206 | 206 |
h <- c('Accept' = "Application/json", |
207 | 207 |
'Content-Type' = 'text/plain','X-Auth-Token' = authToken) |
208 |
- |
|
208 |
+ |
|
209 | 209 |
req <- httr::POST(URL,body = query ,httr::add_headers(h),encode = "json") |
210 | 210 |
content <- httr::content(req,"parsed") |
211 | 211 |
if(req$status_code !=200) |
... | ... |
@@ -257,7 +257,7 @@ run_query_fromfile <- function(url, fileName, filePath, output_gtf = TRUE) |
257 | 257 |
{ |
258 | 258 |
if(!file.exists(filePath)) |
259 | 259 |
stop("file does not exist") |
260 |
- |
|
260 |
+ |
|
261 | 261 |
query <- readLines(filePath) |
262 | 262 |
run_query(url,fileName,query,output_gtf) |
263 | 263 |
} |
... | ... |
@@ -325,7 +325,7 @@ compile_query_fromfile <- function(url ,filePath) |
325 | 325 |
{ |
326 | 326 |
if(!file.exists(filePath)) |
327 | 327 |
stop("file does not exist") |
328 |
- |
|
328 |
+ |
|
329 | 329 |
query <- readLines(filePath) |
330 | 330 |
compile_query(url,query) |
331 | 331 |
} |
... | ... |
@@ -338,11 +338,11 @@ serialize_query <- function(url,output_gtf,base64) |
338 | 338 |
out <- "gtf" |
339 | 339 |
else |
340 | 340 |
out <- "tab" |
341 |
- |
|
341 |
+ |
|
342 | 342 |
URL <- paste0(url,"/queries/dag/",out) |
343 | 343 |
h <- c('Accept' = "Application/json", |
344 | 344 |
'Content-Type' = 'text/plain','X-Auth-Token' = authToken) |
345 |
- |
|
345 |
+ |
|
346 | 346 |
req <- httr::POST(URL,body = base64 ,httr::add_headers(h),encode = "json") |
347 | 347 |
content <- httr::content(req,"parsed") |
348 | 348 |
if(req$status_code !=200) |
... | ... |
@@ -15,73 +15,131 @@ link-citations: true |
15 | 15 |
|
16 | 16 |
# Introduction |
17 | 17 |
|
18 |
-Recent years have seen a tremendous increase in the volume of data generated in the life sciences, especially propelled by the rapid progress of Next Generation Sequencing (NGS) technologies. |
|
19 |
-This high-throughput technologies can produce billions of short DNA or RNA fragments in excess of a few terabytes of data in a single run. |
|
20 |
-Next-generation sequencing refers to the deep, in-parallel DNA sequencing technologies providing massively parallel analysis and extremely high-throughput from multiple samples at much reduced cost. Improvement of sequencing technologies and data processing pipelines is rapidly providing sequencing data, with associated high-level features, of many individual genomes in multiple biological and clinical conditions. |
|
21 |
-To make effective use of the produced data, the design of big data algorithms and their efficient implementation on modern high performance computing infrastructures, |
|
22 |
-such as clouds, CPU clusters and network infrastructures, is required in order to achieve scalability and performance. |
|
23 |
-For this purpose the GenoMetric Query Language (GMQL) has been proposed as high-level, declarative language to process, query, and compare multiple and heterogeneous genomic datasets for biomedical knowledge discovery [@Bioinformatics2015] |
|
18 |
+Recent years have seen a tremendous increase in the volume of data generated |
|
19 |
+in the life sciences, especially propelled by the rapid progress of |
|
20 |
+Next Generation Sequencing (NGS) technologies. |
|
21 |
+These high-throughput technologies can produce billions of short DNA or RNA |
|
22 |
+fragments in excess of a few terabytes of data in a single run. |
|
23 |
+Next-generation sequencing refers to the deep, in-parallel DNA sequencing |
|
24 |
+technologies providing massively parallel analysis and extremely |
|
25 |
+high-throughput from multiple samples at much reduced cost. |
|
26 |
+Improvement of sequencing technologies and data processing pipelines |
|
27 |
+is rapidly providing sequencing data, with associated high-level features, |
|
28 |
+of many individual genomes in multiple biological and clinical conditions. |
|
29 |
+To make effective use of the produced data, the design of big data algorithms |
|
30 |
+and their efficient implementation on modern high performance |
|
31 |
+computing infrastructures, such as clouds, CPU clusters |
|
32 |
+and network infrastructures, is required in order to achieve scalability |
|
33 |
+and performance. |
|
34 |
+For this purpose the GenoMetric Query Language (GMQL) has been proposed |
|
35 |
+as a high-level, declarative language to process, query, |
|
36 |
+and compare multiple and heterogeneous genomic datasets for biomedical |
|
37 |
+knowledge discovery [@Bioinformatics2015]. |
|
24 | 38 |
|
25 | 39 |
## Purpose |
26 | 40 |
|
27 |
-A very important emerging problem is to make sense of the enormous amount and variety of NGS data becoming available, i.e. to discover how different genomic regions and their products interact and cooperate with each other. |
|
28 |
-To this aim, the integration of several heterogeneous DNA feature data is required. |
|
29 |
-Such big genomic feature data are collected within numerous and heterogeneous files, usually distributed within different repositories, lacking an attribute-based organization and a systematic description of their metadata. |
|
30 |
-These heterogeneous data can contain the hidden answer to very important biomedical questions. |
|
31 |
-To inveil them, standard tools already available for knowledge extraction are too specialized or present powerful features, but have a rough interface not well-suited for scientists/biologists. |
|
32 |
-GMQL addresses these aspects using cloud-based technologies (including Apache Hadoop, mapReduce, and Spark), and focusing on genomic data operations written as simple queries with implicit iterations over thousands of heterogeneous samples, computed efficiently [@IEEE7484654]. |
|
33 |
-This RGMQL package makes easy to take advantage of GMQL functionalities also to scientists and biologists with limited knowledge of query and programming languages, but used to the R/Bioconductor environment. |
|
34 |
-This package is built over a GMQL scalable data management engine written in Scala programming language, released as Scala API [@githubrepo] providing a set of functions to combine, manipulate, compare, and extract genomic data from different datasources both from local and remote datasets. |
|
35 |
-These functions allow performing complex GMQL processing and queries without knowledge of GMQL syntax, but leveraging on R idiomatic paradigm and logic. |
|
41 |
+A very important emerging problem is to make sense of the enormous amount and |
|
42 |
+variety of NGS data becoming available, i.e. to discover how different genomic |
|
43 |
+regions and their products interact and cooperate with each other. |
|
44 |
+To this aim, the integration of several heterogeneous DNA feature data |
|
45 |
+is required. |
|
46 |
+Such big genomic feature data are collected within numerous and |
|
47 |
+heterogeneous files, usually distributed within different repositories, |
|
48 |
+lacking an attribute-based organization and a systematic description |
|
49 |
+of their metadata. |
|
50 |
+These heterogeneous data can contain the hidden answer to very important |
|
51 |
+biomedical questions. |
|
52 |
+To unveil them, standard tools already available for knowledge extraction |
|
53 |
+are too specialized or present powerful features, but have a rough interface |
|
54 |
+not well-suited for scientists/biologists. |
|
55 |
+GMQL addresses these aspects using cloud-based technologies |
|
56 |
+(including Apache Hadoop, mapReduce, and Spark), and focusing on genomic data |
|
57 |
+operations written as simple queries with implicit iterations over thousands |
|
58 |
+of heterogeneous samples, computed efficiently [@IEEE7484654]. |
|
59 |
+This RGMQL package makes it easy to take advantage of GMQL functionalities also |
|
60 |
+to scientists and biologists with limited knowledge of query and |
|
61 |
+programming languages, but used to the R/Bioconductor environment. |
|
62 |
+This package is built over a GMQL scalable data management engine |
|
63 |
+written in Scala programming language, released as Scala API [@githubrepo] |
|
64 |
+providing a set of functions to combine, manipulate, compare, and extract |
|
65 |
+genomic data from different datasources both from local and remote datasets. |
|
66 |
+These functions allow performing complex GMQL processing and queries without |
|
67 |
+knowledge of GMQL syntax, but leveraging on R idiomatic paradigm and logic. |
|
36 | 68 |
|
37 | 69 |
|
38 | 70 |
# Genomic Data Model |
39 | 71 |
|
40 |
-The Genomic Data Model (GDM) is based on the notions of datasets and samples[@modeling2016] |
|
41 |
-Datasets are collections of samples, and each sample consists of two parts, the region data, which describe portions of the genome, and the metadata, which describe sample general properties and how observations are collected. |
|
42 |
-In contrast to other data models, it clearly divides, and comprehensively manages, observations about genomic regions and metadata. |
|
43 |
-GDM provides a flat attribute based organization, just requiring that each dataset is associated with a given data schema, which specifies the attributes and their type of region data. |
|
44 |
-The first attributes of such schema are fixed (chr, start, end, strand); they represent the genomic region identifying coordinates. |
|
72 |
+The Genomic Data Model (GDM) is based on the notions of datasets |
|
73 |
+and samples [@modeling2016]. |
|
74 |
+Datasets are collections of samples, and each sample consists of two parts, |
|
75 |
+the region data, which describe portions of the genome, and the metadata, |
|
76 |
+which describe sample general properties and how observations are collected. |
|
77 |
+In contrast to other data models, it clearly divides, and comprehensively |
|
78 |
+manages, observations about genomic regions and metadata. |
|
79 |
+GDM provides a flat attribute based organization, just requiring that |
|
80 |
+each dataset is associated with a given data schema, which specifies |
|
81 |
+the attributes and their type of region data. |
|
82 |
+The first attributes of such schema are fixed (chr, start, end, strand); |
|
83 |
+they represent the genomic region identifying coordinates. |
|
45 | 84 |
In addition, metadata have free attribute-value pair format. |
46 | 85 |
|
47 | 86 |
## Genomic Region |
48 | 87 |
|
49 |
-Genomic region data describe a broad variety of biomolecular aspects and are very valuable for biomolecular investigation. |
|
50 |
-A genomic region is a portion of a genome, qualified by a quadruple of values called region coordinates: |
|
88 |
+Genomic region data describe a broad variety of biomolecular aspects and are |
|
89 |
+very valuable for biomolecular investigation. |
|
90 |
+A genomic region is a portion of a genome, qualified by a quadruple of values |
|
91 |
+called region coordinates: |
|
51 | 92 |
$$< chr, left, right, strand >$$ |
52 |
-Regions can have an arbitrary number of associated values, according to the processing of DNA, RNA or epigenomic sequencing reads that determined the region. |
|
93 |
+Regions can have an arbitrary number of associated values, according to |
|
94 |
+the processing of DNA, RNA or epigenomic sequencing reads that determined |
|
95 |
+the region. |
|
53 | 96 |
|
54 | 97 |
## Metadata |
55 | 98 |
|
56 |
-Metadata describe the biological and clinical properties associated with each sample. |
|
57 |
-They are usually collected in a broad variety of data structures and formats that constitute barriers to their use and comparison GDM models metadata simply as arbitrary semi-structured attribute-value pairs, where attributes may have multiple values. |
|
99 |
+Metadata describe the biological and clinical properties associated with |
|
100 |
+each sample. |
|
101 |
+They are usually collected in a broad variety of data structures and formats |
|
102 |
+that constitute barriers to their use and comparison. GDM models metadata |
|
103 |
+simply as arbitrary semi-structured attribute-value pairs, |
|
104 |
+where attributes may have multiple values. |
|
58 | 105 |
|
59 | 106 |
## Genomic Sample |
60 | 107 |
|
61 |
-Formally, a sample s is a collection of genomic regions modeled as the following triple: $$< id, {< r_i,v_i >}, {m_j} >$$ where: |
|
108 |
+Formally, a sample s is a collection of genomic regions modeled as |
|
109 |
+the following triple: $$< id, \{< r_i,v_i >\}, \{m_j\} >$$ where: |
|
62 | 110 |
|
63 | 111 |
* id is the sample identifier |
64 | 112 |
* Each region is a pair of coordinates $r_i$ and values $v_i$ |
65 | 113 |
* Metadata $m_j$ are attribute-value pairs $< a_j,v_j >$ |
66 | 114 |
|
67 |
-Note that the sample id attribute provides a many-to-many connection between regions and metadata of a sample. |
|
68 |
-Through the use of a data type system to express region data, and of arbitrary attribute-value pairs for metadata, GDM provides interoperability across datasets in multiple formats produced by different experimental techniques. |
|
115 |
+Note that the sample id attribute provides a many-to-many connection between |
|
116 |
+regions and metadata of a sample. |
|
117 |
+Through the use of a data type system to express region data, and of arbitrary |
|
118 |
+attribute-value pairs for metadata, GDM provides interoperability across |
|
119 |
+datasets in multiple formats produced by different experimental techniques. |
|
69 | 120 |
|
70 | 121 |
## Dataset |
71 | 122 |
|
72 |
-A dataset is a collection of samples uniquely identified, with the same region schema and with each sample consisting of two parts: |
|
123 |
+A dataset is a collection of samples uniquely identified, with the same region |
|
124 |
+schema and with each sample consisting of two parts: |
|
73 | 125 |
|
74 | 126 |
* region data: describing characteristics and location of genomic portions |
75 | 127 |
* metadata: expressing general properties of the sample |
76 | 128 |
|
77 |
-Each dataset is typically produced within the same project by using the same or equivalent technology and tools, but with different experimental conditions, described by metadata. |
|
129 |
+Each dataset is typically produced within the same project by using the same |
|
130 |
+or equivalent technology and tools, but with different experimental |
|
131 |
+conditions, described by metadata. |
|
78 | 132 |
|
79 |
-Datasets contain large number of information describing regions of a genome, with data encoded in human readable format using plain text files. |
|
133 |
+Datasets contain a large amount of information describing regions of a genome, |
|
134 |
+with data encoded in human readable format using plain text files. |
|
80 | 135 |
|
81 |
-GMQL datasets are materialized in a standard layout composed of three types of files: |
|
136 |
+GMQL datasets are materialized in a standard layout composed of three |
|
137 |
+types of files: |
|
82 | 138 |
|
83 |
-1. genomic region tab-delimited text files with extension .gdm, or .gtf if in standard GTF format |
|
84 |
-2. metadata attribute-value tab-delimited text files with the same fullname (name and extension) of the correspondent region file and extension .meta |
|
139 |
+1. genomic region tab-delimited text files with extension .gdm, or .gtf |
|
140 |
+if in standard GTF format |
|
141 |
+2. metadata attribute-value tab-delimited text files with the same fullname |
|
142 |
+(name and extension) of the correspondent region file and extension .meta |
|
85 | 143 |
3. schema XML file containing region attribute names and types |
86 | 144 |
|
87 | 145 |
All these files reside in a single folder called files. |
... | ... |
@@ -89,14 +147,22 @@ All these files reside in unique folder called files. |
89 | 147 |
<!--  --> |
90 | 148 |
|
91 | 149 |
In RGMQL package dataset files are considered read-only. |
92 |
-Once read, genomic information is represented in abstract structure inside the package, mapped to a R GRanges data structure at occurency. |
|
150 |
+Once read, genomic information is represented in abstract structure inside |
|
151 |
+the package, mapped to an R GRanges data structure when needed. |
|
93 | 152 |
|
94 | 153 |
|
95 | 154 |
# GenoMetric Query Language |
96 | 155 |
|
97 |
-The GenoMetric Query Language name stems from the language ability to deal with genomic distances, which are measured as number of nucleotide bases between genomic regions (aligned to the same reference genome) and computed using arithmetic operations between region coordinates. |
|
98 |
-GMQL is a high-level, declarative language that allows expressing queries easily over genomic regions and their metadata, in a way similar to what can be done with the Structured Query Language (SQL) over a relational database. |
|
99 |
-GMQL approach exhibits two main differences with respect to other tools based on Hadoop, mapReduce framework, and Spark engine technologies to address similar biomedical problems:\newline |
|
156 |
+The GenoMetric Query Language name stems from the language ability to deal |
|
157 |
+with genomic distances, which are measured as number of nucleotide bases |
|
158 |
+between genomic regions (aligned to the same reference genome) and computed |
|
159 |
+using arithmetic operations between region coordinates. |
|
160 |
+GMQL is a high-level, declarative language that allows expressing queries |
|
161 |
+easily over genomic regions and their metadata, in a way similar to what can |
|
162 |
+be done with the Structured Query Language (SQL) over a relational database. |
|
163 |
+GMQL approach exhibits two main differences with respect to other tools |
|
164 |
+based on Hadoop, mapReduce framework, and Spark engine technologies |
|
165 |
+to address similar biomedical problems:\newline |
|
100 | 166 |
|
101 | 167 |
* GMQL: |
102 | 168 |
|
... | ... |
@@ -108,29 +174,43 @@ GMQL approach exhibits two main differences with respect to other tools based on |
108 | 174 |
1. read generally from raw or aligned data from NGS machines |
109 | 175 |
2. provide no support for metadata management |
110 | 176 |
|
111 |
-GMQL is the appropriate tool for querying numerous processed genomic datasets and very many samples that are becoming available. |
|
112 |
-Note however that GMQL performs worse than some other available systems on a small number of small-scale datasets, but these other systems are not cloud-based; hence, they are not adequate for efficient big data processing and, in some cases, they are inherently limited in their data management capacity, as they only work as RAM memory resident processes. |
|
177 |
+GMQL is the appropriate tool for querying numerous processed genomic datasets |
|
178 |
+and very many samples that are becoming available. |
|
179 |
+Note however that GMQL performs worse than some other available systems on a |
|
180 |
+small number of small-scale datasets, but these other systems are not |
|
181 |
+cloud-based; hence, they are not adequate for efficient big data processing |
|
182 |
+and, in some cases, they are inherently limited in their |
|
183 |
+data management capacity, as they only work as RAM memory resident processes. |
|
113 | 184 |
|
114 | 185 |
## Query structure |
115 | 186 |
|
116 |
-A GMQL operation is expressed as a sequence of GMQL operations with the following structure: |
|
187 |
+A GMQL query is expressed as a sequence of GMQL operations with the |
|
188 |
+following structure: |
|
117 | 189 |
$$< variable > = operator(< parameters >) < variable >;$$ |
118 | 190 |
where each $< variable >$ stands for a GDM dataset |
119 | 191 |
|
120 |
-This RGMQL package brings GMQL functionalities into R environemnt, allowing users to build directly a GMQL query without knowing the GMQL syntax. |
|
121 |
-In RGMQL every GMQL operations is translated into a R function and expressed as: |
|
192 |
+This RGMQL package brings GMQL functionalities into the R environment, |
|
193 |
+allowing users to build directly a GMQL query without knowing the GMQL syntax. |
|
194 |
+In RGMQL every GMQL operation is translated into an R function |
|
195 |
+and expressed as: |
|
122 | 196 |
$$ variable = operator(variable, parameters)$$ |
123 | 197 |
|
124 |
-It is very similar to the GMQL syntax for operation expression although expressed with the R idiomatic paradigm and logic, with parameters totaly builded using R native data structures such as lists, matrices, vectors or R logic conditions. |
|
198 |
+It is very similar to the GMQL syntax for operation expression although |
|
199 |
+expressed with the R idiomatic paradigm and logic, with parameters totally |
|
200 |
+built using R native data structures such as lists, matrices, |
|
201 |
+vectors or R logic conditions. |
|
125 | 202 |
|
126 | 203 |
|
127 | 204 |
# Processing Environments |
128 | 205 |
|
129 |
-In this section, we show how GMQL processing is built in R, which operations are available in RGMQL, and the difference beetween local and remote dataset processing. |
|
206 |
+In this section, we show how GMQL processing is built in R, which operations |
|
207 |
+are available in RGMQL, and the difference between local |
|
208 |
+and remote dataset processing. |
|
130 | 209 |
|
131 | 210 |
## Local Processing |
132 | 211 |
|
133 |
-RGMQL local processing consumes computational power directly from local CPUs/system while managing datasets (both GMQL or generic text plain datasets). |
|
212 |
+RGMQL local processing consumes computational power directly from local |
|
213 |
+CPUs/system while managing datasets (both GMQL or generic text plain datasets). |
|
134 | 214 |
|
135 | 215 |
### Initialization |
136 | 216 |
|
... | ... |
@@ -138,22 +218,27 @@ Load and attach the GMQL package in a R session using library function: |
138 | 218 |
```{r, initialization, eval = TRUE} |
139 | 219 |
library('RGMQL') |
140 | 220 |
``` |
141 |
-Before starting using any GMQL operation we need to initialise the GMQL context with the following code: |
|
221 |
+Before starting using any GMQL operation we need to initialise the GMQL |
|
222 |
+context with the following code: |
|
142 | 223 |
```{r, init, eval = TRUE} |
143 | 224 |
init_gmql() |
144 | 225 |
``` |
145 |
-The function *init_gmql()* initializes the context of scalable data management engine laid upon Spark and Hadoop. |
|
146 |
-Details on this and all other functions are provided in the R documentation for this package (e.g., help(RGMQL)). |
|
226 |
+The function *init_gmql()* initializes the context of scalable data management |
|
227 |
+engine laid upon Spark and Hadoop. |
|
228 |
+Details on this and all other functions are provided in the R documentation |
|
229 |
+for this package (e.g., help(RGMQL)). |
|
147 | 230 |
|
148 | 231 |
### Read Dataset |
149 | 232 |
|
150 | 233 |
After initialization we need to read datasets. |
151 |
-We already defined above the formal definition of dataset and the power of GMQL to deal with data in a variety of standard tab-delimited text formats. |
|
234 |
+We already defined above the formal definition of dataset and the power of |
|
235 |
+GMQL to deal with data in a variety of standard tab-delimited text formats. |
|
152 | 236 |
In the following, we show how to get data from different sources.\newline |
153 | 237 |
We distinguish two different cases: |
154 | 238 |
|
155 | 239 |
1. Local dataset:\newline |
156 |
-A local dataset is a folder with sample files (region files and correspondent metadata files) on the user computer. |
|
240 |
+A local dataset is a folder with sample files (region files and correspondent |
|
241 |
+metadata files) on the user computer. |
|
157 | 242 |
As data are already in the user computer, we simply execute: |
158 | 243 |
|
159 | 244 |
```{r, read GMQL dataset, eval = TRUE} |
... | ... |
@@ -161,10 +246,13 @@ gmql_dataset_path <- system.file("example", "EXON", package = "RGMQL") |
161 | 246 |
data_out = read_dataset(gmql_dataset_path) |
162 | 247 |
``` |
163 | 248 |
In this case we are reading a dataset named EXON specified by path. |
164 |
-It doens't matter what kind of format the data are, *read_dataset()* read many standard tab-delimited text formats without specified any paramter at input. |
|
249 |
+It doesn't matter what format the data are in; *read_dataset()* reads many |
|
250 |
+standard tab-delimited text formats without specifying any parameter at input. |
|
165 | 251 |
|
166 | 252 |
2. GRangesList:\newline |
167 |
-For better integration in the R environment and with other packages, we provide a *read()* function to read directly from R memory/environment using GRangesList as input. |
|
253 |
+For better integration in the R environment and with other packages, |
|
254 |
+we provide a *read()* function to read directly from R memory/environment |
|
255 |
+using GRangesList as input. |
|
168 | 256 |
|
169 | 257 |
```{r, read GRangesList, eval = TRUE} |
170 | 258 |
library("GenomicRanges") |
... | ... |
@@ -182,19 +270,27 @@ grl <- GRangesList("txA" = gr1, "txB" = gr2) |
182 | 270 |
data_out <- read(grl) |
183 | 271 |
``` |
184 | 272 |
In this example we show how versatile the RGMQL package is. |
185 |
-As specified above, we can directly read a list of GRanges previously created starting from two GRanges. |
|
186 |
-Both *read()* and *read_dataset()* functions returns a result object, in this case *data_out* containing an internal R representation of the dataset used as input for executing the subsequent GMQL operation. |
|
273 |
+As specified above, we can directly read a list of GRanges previously created |
|
274 |
+starting from two GRanges. |
|
275 |
+Both *read()* and *read_dataset()* functions return a result object, |
|
276 |
+in this case *data_out* containing an internal R representation of the dataset |
|
277 |
+used as input for executing the subsequent GMQL operation. |
|
187 | 278 |
|
188 | 279 |
### Queries |
189 | 280 |
|
190 | 281 |
GMQL is not a traditional query language: |
191 |
-With "query" we intend a group of operations that together produce a result; in this sense GMQL queries are more similar to SQL scripts. |
|
192 |
-GMQL programming consists of a series of select, union, project, difference (and so on ...) commands. |
|
282 |
+With "query" we intend a group of operations that together produce a result; |
|
283 |
+in this sense GMQL queries are more similar to SQL scripts. |
|
284 |
+GMQL programming consists of a series of select, union, project, |
|
285 |
+difference (and so on ...) commands. |
|
193 | 286 |
|
194 | 287 |
Let's see some short examples: |
195 | 288 |
|
196 | 289 |
1) Find somatic mutations in exons. |
197 |
-???Consider mutation data samples of human breast cancer cases. For each sample, quantify the mutations in each exon and select the exons with at least one mutation. Return the list of samples ordered by the number of such exons.??? |
|
290 |
+Consider mutation data samples of human breast cancer cases. |
|
291 |
+For each sample, quantify the mutations in each exon and select the exons |
|
292 |
+with at least one mutation. Return the list of samples ordered by |
|
293 |
+the number of such exons. |
|
198 | 294 |
```{r, query, eval = TRUE} |
199 | 295 |
## Read EXON dataset containing a single sample with exon regions |
200 | 296 |
## and MUT dataset containing multiple samples with mutation regions |
... | ... |
@@ -207,7 +303,8 @@ mut_ds <- read_dataset(mut_path) |
207 | 303 |
mut = select(mut_ds, manually_curated__dataType == 'dnaseq' & |
208 | 304 |
clinical_patient__tumor_tissue_site == 'breast') |
209 | 305 |
|
210 |
-exon = select(exon_ds, annotation_type == 'exons' & original_provider == 'RefSeq') |
|
306 |
+exon = select(exon_ds, annotation_type == 'exons' & |
|
307 |
+ original_provider == 'RefSeq') |
|
211 | 308 |
|
212 | 309 |
|
213 | 310 |
## For each mutation sample, count mutations within each exon while |
... | ... |
@@ -230,7 +327,8 @@ exon3 <- extend(exon2, list(exon_count = COUNT())) |
230 | 327 |
exon_res = RGMQL::order(exon3, list(DESC("exon_count"))) |
231 | 328 |
``` |
232 | 329 |
|
233 |
-If you want to store persistently the result, you can materialize it into specific path defined as input parameter. |
|
330 |
+If you want to store persistently the result, you can materialize it into |
|
331 |
+specific path defined as input parameter. |
|
234 | 332 |
```{r, materialize, eval = TRUE} |
235 | 333 |
## Materialize the result dataset on disk |
236 | 334 |
materialize(exon_res) |
... | ... |
@@ -239,55 +337,72 @@ by default *materialize()* has R workig directoy as stored path. |
239 | 337 |
|
240 | 338 |
### Execution |
241 | 339 |
|
242 |
-GMQL processing does not store results: They remain in the environment until you invoke the *execute()* function. |
|
340 |
+GMQL processing does not store results: They remain in the environment until |
|
341 |
+you invoke the *execute()* function. |
|
243 | 342 |
```{r, execute, eval = FALSE} |
244 | 343 |
execute() |
245 | 344 |
``` |
246 |
-*execute()* can be issued only if at least one *read()* and one *materialize()* are present in the GMQL query, otherwise an error is generated. |
|
247 |
-Data are saved in the path specified in every *materialize()* present in the query. |
|
345 |
+*execute()* can be issued only if at least one *read()* and one *materialize()* |
|
346 |
+are present in the GMQL query, otherwise an error is generated. |
|
347 |
+Data are saved in the path specified in every *materialize()* present |
|
348 |
+in the query. |
|
248 | 349 |
|
249 | 350 |
Beside *execute()* we can use: |
250 | 351 |
```{r, take, eval = TRUE} |
251 | 352 |
g <- take(input_data = exon_res, rows = 45) |
252 | 353 |
``` |
253 |
-to execute all *materialize()* commands in the GMQL query and extract data as GRangesList format, a GRanges for each materialize. |
|
354 |
+to execute all *materialize()* commands in the GMQL query and extract data |
|
355 |
+as GRangesList format, a GRanges for each materialize. |
|
254 | 356 |
NOTE: GRangesList are contained in the R environment and are not saved on disk. |
255 | 357 |
|
256 |
-with *rows* parameter is possible to specified how many rows, for each sample inside input dataset, will be exported; by default is $0$, that means all rows will be exported. |
|
257 |
-Note that, since we are working with big data, exported all row could be very time and space consuming. |
|
358 |
+With the *rows* parameter it is possible to specify how many rows, for each sample |
|
359 |
+inside input dataset, will be exported; by default is $0$, |
|
360 |
+that means all rows will be exported. |
|
361 |
+Note that, since we are working with big data, |
|
362 |
+exporting all rows could be very time and space consuming. |
|
258 | 363 |
|
259 | 364 |
## Remote Processing |
260 | 365 |
|
261 |
-RGMQL remote processing consumes computational power from remote cluster/system while managing GMQL datasets.\newline |
|
366 |
+RGMQL remote processing consumes computational power from |
|
367 |
+remote cluster/system while managing GMQL datasets.\newline |
|
262 | 368 |
|
263 | 369 |
Remote processing exists in two flavours:\newline |
264 | 370 |
|
265 | 371 |
- REST web services: \newline |
266 |
- User can write GMQL queries (using original GMQL syntax) to be executed remotely on remote data (or local data previously uploaded). |
|
372 |
+ User can write GMQL queries (using original GMQL syntax) to be executed |
|
373 |
+ remotely on remote data (or local data previously uploaded). |
|
267 | 374 |
|
268 | 375 |
- BATCH execution: \newline |
269 |
- Similar to local execution; user reads data and the system automatically uploads them on the |
|
270 |
- remote system: once loaded, RGMQL functions can be issued to manage remote data.\newline |
|
376 |
+ Similar to local execution; user reads data and the system automatically |
|
377 |
+ uploads them on the |
|
378 |
+ remote system: once loaded, RGMQL functions can be issued to manage |
|
379 |
+ remote data.\newline |
|
271 | 380 |
|
272 | 381 |
### REST web services |
273 | 382 |
|
274 |
-This RGMQL package allows to invoke REST services implementing the commands specified at [link](http://130.186.13.219/gmql-rest/swagger/). |
|
383 |
+This RGMQL package allows to invoke REST services implementing the |
|
384 |
+commands specified at [link](http://130.186.13.219/gmql-rest/swagger/). |
|
275 | 385 |
|
276 | 386 |
|
277 | 387 |
#### Initialization |
278 | 388 |
|
279 |
-GMQL REST services require login; so, the first step is to perform logon with user and password, or as guest. |
|
280 |
-Upon succesfull logon, you get a request token that you must use in every subsequent REST call. |
|
389 |
+GMQL REST services require login; so, the first step is to perform logon |
|
390 |
+with user and password, or as guest. |
|
391 |
+Upon successful logon, you get a request token that you must use |
|
392 |
+in every subsequent REST call. |
|
281 | 393 |
Login can be performed using the function: |
282 | 394 |
```{r, eval = TRUE} |
283 | 395 |
test_url = "http://130.186.13.219/gmql-rest" |
284 | 396 |
login_gmql(test_url) |
285 | 397 |
``` |
286 |
-that saves the token in the Global R environment within the variable named *authToken*. With this token you can call all the functions in the GMQL REST web services suite. |
|
398 |
+that saves the token in the Global R environment within the variable |
|
399 |
+named *authToken*. With this token you can call all the functions |
|
400 |
+in the GMQL REST web services suite. |
|
287 | 401 |
|
288 | 402 |
#### Execution |
289 | 403 |
|
290 |
-User can write a GMQL query as in the following example, and run it as second parameter of the *run_query()* function. |
|
404 |
+User can write a GMQL query as in the following example, and run it as |
|
405 |
+second parameter of the *run_query()* function. |
|
291 | 406 |
```{r, run, eval = TRUE} |
292 | 407 |
test_url = "http://130.186.13.219/gmql-rest" |
293 | 408 |
login_gmql(test_url) |
... | ... |
@@ -302,18 +417,23 @@ query_path <- system.file("example","query1.txt", package = "RGMQL") |
302 | 417 |
job <- run_query_fromfile(test_url, "query1", query_path, output_gtf = FALSE) |
303 | 418 |
``` |
304 | 419 |
|
305 |
-Once run, query continues on the remote server while *run_query()* and *run_query_fromfile()* returns immediately. |
|
420 |
+Once run, query continues on the remote server while *run_query()* |
|
421 |
+and *run_query_fromfile()* return immediately. |
|
306 | 422 |
User can extract from result (job) the job_id and status. |
307 |
-jod_id can then be used to continuosly invoke log and trace calls, both in this RGMQL package, to check for job completed status. |
|
423 |
+jod_id can then be used to continuously invoke log and trace calls, |
|
424 |
+both in this RGMQL package, to check for job completed status. |
|
308 | 425 |
|
309 | 426 |
```{r, trace, eval = TRUE} |
310 | 427 |
jod_id <- job$id |
311 | 428 |
trace_job(test_url,jod_id) |
312 | 429 |
``` |
313 | 430 |
|
314 |
-Then, results materialized on the remote repository can by downloaded locally and imported in a GRangesList using the functions in this RGMQL package.[see import/export](# Utilities) |
|
431 |
+Then, results materialized on the remote repository can be downloaded |
|
432 |
+locally and imported in a GRangesList using the functions |
|
433 |
+in this RGMQL package.[see import/export](# Utilities) |
|
315 | 434 |
|
316 |
-the returned *job* contains also the name of dataset, it will be materialized with |
|
435 |
+the returned *job* contains also the name of dataset, |
|
436 |
+it will be materialized with |
|
317 | 437 |
```{r, download, eval = FALSE} |
318 | 438 |
name_dataset <- job$datasets[[1]]$name |
319 | 439 |
download_dataset(test_url,name_dataset) |
... | ... |
@@ -321,10 +441,14 @@ download_dataset(test_url,name_dataset) |
321 | 441 |
|
322 | 442 |
### Batch execution |
323 | 443 |
|
324 |
-This execution type is similar to local processing (syntax, functions, and so on ...) except:\newline |
|
325 |
-1. materialized data are stored only on the remote repository, from where they can be download locally and imported in a GRangesList using the functions in this RGMQL package.[see import/export](# Utilities) |
|
444 |
+This execution type is similar to local processing |
|
445 |
+(syntax, functions, and so on ...) except:\newline |
|
446 |
+1. materialized data are stored only on the remote repository, |
|
447 |
+from where they can be downloaded locally and imported in a GRangesList |
|
448 |
+using the functions in this RGMQL package.[see import/export](# Utilities) |
|
326 | 449 |
|
327 |
-Before starting with examples, note that we have to log into remote infrastructure with login function: |
|
450 |
+Before starting with examples, note that we have to log into remote |
|
451 |
+infrastructure with login function: |
|
328 | 452 |
```{r, login remote, eval = TRUE} |
329 | 453 |
login_gmql(test_url) |
330 | 454 |
``` |
... | ... |
@@ -346,7 +470,8 @@ mut_ds <- read_dataset("public.HG19_BED_ANNOTATION",is_local = FALSE) |
346 | 470 |
mut = select(mut_ds, manually_curated__dataType == 'dnaseq' & |
347 | 471 |
clinical_patient__tumor_tissue_site == 'breast') |
348 | 472 |
|
349 |
-exon = select(exon_ds, annotation_type == 'exons' & original_provider == 'RefSeq') |
|
473 |
+exon = select(exon_ds, annotation_type == 'exons' & |
|
474 |
+ original_provider == 'RefSeq') |
|
350 | 475 |
|
351 | 476 |
## For each mutation sample, count mutations within each exon while |
352 | 477 |
## mapping the mutations to the exon regions using the map() function |
... | ... |
@@ -385,7 +510,9 @@ remote_processing(TRUE) |
385 | 510 |
``` |
386 | 511 |
A user can switch mode until the first *materialize()* has been performed. |
387 | 512 |
|
388 |
-This kind of processing comes from the fact that the *read()* function can accept both local dataset and repository dataset, even in the same query as in the following example: |
|
513 |
+This kind of processing comes from the fact that the *read()* function can |
|
514 |
+accept both local dataset and repository dataset, |
|
515 |
+even in the same query as in the following example: |
|
389 | 516 |
```{r, mixed query, eval = TRUE} |
390 | 517 |
|
391 | 518 |
## Read EXON dataset containing a single sample with exon regions |
... | ... |
@@ -398,7 +525,8 @@ mut_ds <- read_dataset("public.HG19_BED_ANNOTATION",is_local = FALSE) |
398 | 525 |
mut = select(mut_ds, manually_curated__dataType == 'dnaseq' & |
399 | 526 |
clinical_patient__tumor_tissue_site == 'breast') |
400 | 527 |
|
401 |
-exon = select(exon_ds, annotation_type == 'exons' & original_provider == 'RefSeq') |
|
528 |
+exon = select(exon_ds, annotation_type == 'exons' & |
|
529 |
+ original_provider == 'RefSeq') |
|
402 | 530 |
|
403 | 531 |
## For each mutation sample, count mutations within each exon while |
404 | 532 |
## mapping the mutations to the exon regions using the map() function |
... | ... |
@@ -429,9 +557,12 @@ materialize(exon_res) |
429 | 557 |
execute() |
430 | 558 |
``` |
431 | 559 |
|
432 |
-As we can see the two *read()* functions read from different sources: *exon_ds* from local dataset, *mut_ds* from repository. |
|
560 |
+As we can see the two *read()* functions read from different sources: |
|
561 |
+*exon_ds* from local dataset, *mut_ds* from repository. |
|
433 | 562 |
|
434 |
-If we set local processing to false (*remote_processing(FALSE)*), the execution is performed locally downloading all remote repositories, otherwise all local dataset are automatically uploaded. |
|
563 |
+If we set remote processing to false (*remote_processing(FALSE)*), |
|
564 |
+the execution is performed locally downloading all remote repositories, |
|
565 |
+otherwise all local datasets are automatically uploaded. |
|
435 | 566 |
|
436 | 567 |
NOTE: |
437 | 568 |
|
... | ... |
@@ -439,8 +570,10 @@ The public dataset cannot be downloaded from repositories by design |
439 | 570 |
|
440 | 571 |
# Utilities |
441 | 572 |
|
442 |
-RGMQL package contains functions that allow the user to interface with other packages available in R/Bioconductor repository, e.g. TFARM, GenomicRanges. |
|
443 |
-These functions return GRangesList or GRanges with metadata associated, if present, as data structure suitable to further processing in other packages. |
|
573 |
+RGMQL package contains functions that allow the user to interface with other |
|
574 |
+packages available in R/Bioconductor repository, e.g. TFARM, GenomicRanges. |
|
575 |
+These functions return GRangesList or GRanges with metadata associated, |
|
576 |
+if present, as data structure suitable to further processing in other packages. |
|
444 | 577 |
|
445 | 578 |
## Import/Export |
446 | 579 |
|
... | ... |
@@ -450,21 +583,21 @@ dataset_path <- system.file("example", "EXON", package = "RGMQL") |
450 | 583 |
data <- import_gmql(dataset_path, is_gtf = FALSE) |
451 | 584 |
data |
452 | 585 |
``` |
453 |
-the second parameter *is_gtf* must be specified if the file format are .GTF or .GDM. |
|
586 |
+The second parameter *is_gtf* specifies the file format: .GTF or .GDM. |
|
454 | 587 |
|
455 | 588 |
We can export a GRangesList as GMQL dataset as follows: |
456 | 589 |
```{r, export, eval = FALSE} |
457 | 590 |
|
458 | 591 |
dir_out <- system.file("example", "dir_out", package = "RGMQL") |
459 |
-## The third parameter TRUE indicates that the GRangesList data is exported in GTF format |
|
460 | 592 |
export_gmql(data, dir_out, is_gtf = TRUE) |
461 | 593 |
|
462 | 594 |
``` |
463 |
-the second parameter *is_gtf* specified if the file format are .GTF or .GDM. |
|
595 |
+The third parameter *is_gtf* specifies the file format: .GTF or .GDM. |
|
464 | 596 |
|
465 | 597 |
## Filter and extract |
466 | 598 |
|
467 |
-We can also import only a part of a GMQL dataset into R environment, by filtering its content as follows: |
|
599 |
+We can also import only a part of a GMQL dataset into R environment, |
|
600 |
+by filtering its content as follows: |
|
468 | 601 |
```{r, filter_extract, eval = TRUE} |
469 | 602 |
|
470 | 603 |
data_in <- system.file("example", "TEAD", package = "RGMQL") |
... | ... |
@@ -472,11 +605,16 @@ matrix <- filter_and_extract(data_in, metadata = NULL,regions = c("count")) |
472 | 605 |
matrix |
473 | 606 |
|
474 | 607 |
``` |
475 |
-*filter_and_extract()* filters the samples in dataset based of their specified *metadata*, and then extracts as metadata columns of GRanges the vector of region attributes you want to retrieve. |
|
608 |
+*filter_and_extract()* filters the samples in dataset based on their specified |
|
609 |
+*metadata*, and then extracts as metadata columns of GRanges the vector |
|
610 |
+of region attributes you want to retrieve. |
|
476 | 611 |
If the argument is NULL, all samples will be taken. |
477 |
-the number of columns will be equal to the number of samples left after filtering. |
|
478 |
-If *regions* is not specified, only the foundamental elements of GRanges will be shown. |
|
479 |
-Note that this function works only if every sample in dataset has the same region coordinates, in terms of value and total number. |
|
612 |
+the number of columns will be equal to the number of samples left after |
|
613 |
+filtering. |
|
614 |
+If *regions* is not specified, only the fundamental elements of GRanges |
|
615 |
+will be shown. |
|
616 |
+Note that this function works only if every sample in dataset has the same |
|
617 |
+region coordinates, in terms of value and total number. |
|
480 | 618 |
|
481 | 619 |
# References |
482 | 620 |
|