... | ... |
@@ -92,15 +92,30 @@ import_gmql <- function(dataset_path, is_gtf) |
92 | 92 |
name_samples <- lapply(regions, function(x){ |
93 | 93 |
gsub("*.gdm", "",basename(x))}) |
94 | 94 |
vector_field <- .schema_header(datasetName) |
95 |
- |
|
95 |
+ type_and_coord <- .schema_type_coordinate(datasetName) |
|
96 | 96 |
names(vector_field) <- NULL |
97 |
- sampleList <- lapply(regions,function(x){ |
|
98 |
- df <- read.delim(x,col.names = vector_field,header = FALSE) |
|
99 |
- g <- GenomicRanges::makeGRangesFromDataFrame(df, |
|
100 |
- keep.extra.columns = TRUE, |
|
101 |
- start.field = "left", |
|
102 |
- end.field = "right") |
|
103 |
- }) |
|
97 |
+ if(type_and_coord$coordinate_system %in% c("1-based")) |
|
98 |
+ { |
|
99 |
+ sampleList <- lapply(regions,function(x){ |
|
100 |
+ df <- read.delim(x,col.names = vector_field,header = FALSE) |
|
101 |
+ g <- GenomicRanges::makeGRangesFromDataFrame(df, |
|
102 |
+ keep.extra.columns = TRUE, |
|
103 |
+ start.field = "left", |
|
104 |
+ end.field = "right") |
|
105 |
+ }) |
|
106 |
+ } |
|
107 |
+ else |
|
108 |
+ { |
|
109 |
+ sampleList <- lapply(regions,function(x){ |
|
110 |
+ df <- read.delim(x,col.names = vector_field,header = FALSE) |
|
111 |
+ df$left = df$left +1 |
|
112 |
+ g <- GenomicRanges::makeGRangesFromDataFrame(df, |
|
113 |
+ keep.extra.columns = TRUE, |
|
114 |
+ start.field = "left", |
|
115 |
+ end.field = "right") |
|
116 |
+ }) |
|
117 |
+ } |
|
118 |
+ |
|
104 | 119 |
names(sampleList) <- name_samples |
105 | 120 |
gRange_list <- GenomicRanges::GRangesList(sampleList) |
106 | 121 |
} |
... | ... |
@@ -121,6 +121,7 @@ export_gmql <- function(samples, dir_out, is_gtf) |
121 | 121 |
lapply(samples,function(x,dir){ |
122 | 122 |
sample_name <- file.path(dir,paste0("S_",cnt(),".gdm")) |
123 | 123 |
region_frame <- data.frame(x) |
124 |
+ region_frame$start = region_frame$start - 1 |
|
124 | 125 |
write.table(region_frame,sample_name,col.names = FALSE, |
125 | 126 |
row.names = FALSE, sep = '\t',quote = FALSE) |
126 | 127 |
},files_sub_dir) |
... | ... |
@@ -183,6 +184,17 @@ export_gmql <- function(samples, dir_out, is_gtf) |
183 | 184 |
xml2::xml_attr(root,"name") <- "DatasetName_SCHEMAS" |
184 | 185 |
xml2::xml_attr(root,"xmlns") <- "http://genomic.elet.polimi.it/entities" |
185 | 186 |
xml2::xml_add_child(root,"gmqlSchema") |
187 |
+ if(to_GTF) |
|
188 |
+ { |
|
189 |
+ xml2::xml_attr(root,"type") <- "gtf" |
|
190 |
+ xml2::xml_attr(root,"coordinate_system") <- "1-based" |
|
191 |
+ } |
|
192 |
+ else |
|
193 |
+ { |
|
194 |
+ xml2::xml_attr(root,"type") <- "tab" |
|
195 |
+ xml2::xml_attr(root,"coordinate_system") <- "0-based" |
|
196 |
+ } |
|
197 |
+ |
|
186 | 198 |
gmqlSchema <- xml2::xml_child(root,1) |
187 | 199 |
|
188 | 200 |
names_node <- names(node_list) |
... | ... |
@@ -36,6 +36,27 @@ |
36 | 36 |
vector_field <- unlist(list_field) |
37 | 37 |
} |
38 | 38 |
|
39 |
+.schema_type_coordinate <- function(datasetName) |
|
40 |
+{ |
|
41 |
+ schema_name <- list.files(datasetName, pattern = "*.schema$", |
|
42 |
+ full.names = TRUE) |
|
43 |
+ |
|
44 |
+ schema_name_xml <- list.files(datasetName, pattern = "*.xml$", |
|
45 |
+ full.names = TRUE) |
|
46 |
+ |
|
47 |
+ if(!length(schema_name) && !length(schema_name_xml)) |
|
48 |
+ stop("schema not present") |
|
49 |
+ |
|
50 |
+ if(!length(schema_name)) |
|
51 |
+ xml_schema <- xml2::read_xml(schema_name_xml) |
|
52 |
+ else |
|
53 |
+ xml_schema <- xml2::read_xml(schema_name) |
|
54 |
+ |
|
55 |
+ gmql_schema_tag <- xml2::xml_children(xml_schema) |
|
56 |
+ all_attrs <- xml2::xml_attrs(gmql_schema_tag) |
|
57 |
+ all_attrs_list <- as.list(all_attrs[[1]]) |
|
58 |
+} |
|
59 |
+ |
|
39 | 60 |
# aggregates factory |
40 | 61 |
.aggregates <- function(meta_data,class) |
41 | 62 |
{ |
... | ... |
@@ -11,7 +11,8 @@ |
11 | 11 |
#' \item{TAB: tab-delimited file format} |
12 | 12 |
#' \item{GTF: tab-delimited text standard format based on the General |
13 | 13 |
#' Feature Format} |
14 |
-#' \item{COLLECT: used for storing output in memory} |
|
14 |
+#' \item{COLLECT: used for storing output in memory (only in the case of local |
|
15 |
+#' processing, i.e., remote_processing = FALSE,)} |
|
15 | 16 |
#' } |
16 | 17 |
#' @param remote_processing logical value specifying the processing mode. |
17 | 18 |
#' True for processing on cluster (remote), false for local processing. |
... | ... |
@@ -22,8 +23,8 @@ |
22 | 23 |
#' You can always perform it by calling the function \code{\link{login_gmql}} |
23 | 24 |
#' explicitly |
24 | 25 |
#' |
25 |
-#' @param username string name used during signup |
|
26 |
-#' @param password string password used during signup |
|
26 |
+#' @param username string name used during remote server signup |
|
27 |
+#' @param password string password used during remote server signup |
|
27 | 28 |
#' |
28 | 29 |
#' @return None |
29 | 30 |
#' |
... | ... |
@@ -152,7 +152,7 @@ gmql_materialize <- function(input_data, dir_out, name) |
152 | 152 |
res_dir_out <- dir_out |
153 | 153 |
|
154 | 154 |
if(grepl("\\.",name)) |
155 |
- stop("name dataset cannot contains dot") |
|
155 |
+ stop("dataset name cannot contains dot") |
|
156 | 156 |
|
157 | 157 |
response <- WrappeR$materialize(input_data, res_dir_out) |
158 | 158 |
error <- strtoi(response[1]) |
... | ... |
@@ -753,14 +753,15 @@ By default *suffix* correspond to a metadata: *antibody_target*. |
753 | 753 |
## Metadata |
754 | 754 |
|
755 | 755 |
Each sample of a GMQL dataset has its own metadata associated and generally |
756 |
-every metadata attribute has a single value.The case with distinct values |
|
757 |
-for the same metadata attribute, is showed in the figure below for the disease |
|
758 |
-metadata attribute |
|
756 |
+every metadata attribute has a single value. The case with distinct values |
|
757 |
+for the same metadata attribute is shown in the figure below for the |
|
758 |
+\emph{disease} metadata attribute. |
|
759 | 759 |
\newline\newline |
760 | 760 |
 |
761 | 761 |
\newline\newline |
762 |
-In this case GMQL automcatically handles this situation. |
|
763 |
-Import/export paragraph we showed that a GMQL dataset can be imported into R environment as a GRangesList, and so its metadata too. |
|
762 |
+In this case GMQL automatically handles this situation. In the Import/export |
|
763 |
+paragraph, we showed that a GMQL dataset can be imported into R environment |
|
764 |
+as a GRangesList, and so its metadata too. |
|
764 | 765 |
|
765 | 766 |
```{r, metadata} |
766 | 767 |
# This statement defines the path to the folder "DATASET_META" in the |
... | ... |
@@ -781,8 +782,9 @@ metadata(grl_data) |
781 | 782 |
The metadata are stored as simple list in the form key-values and it does not |
782 | 783 |
matter if mutiple values for the same metadata attribute are present; |
783 | 784 |
all values are stored and shown. |
784 |
-Difficulties can arise when we need to get all the metadata values; normally |
|
785 |
-since list is in the form key-value, we can extract the metadata values using: |
|
785 |
+Difficulties can arise when we need to get all the metadata values; normally, |
|
786 |
+since the metadata list is in the form key-value, we can extract the metadata |
|
787 |
+values using: |
|
786 | 788 |
|
787 | 789 |
```{r, retrieve_value} |
788 | 790 |
|
... | ... |
@@ -800,7 +802,7 @@ all disease values, we should use instead: |
800 | 802 |
|
801 | 803 |
```{r, retrieve_values} |
802 | 804 |
|
803 |
-# get all disease value of sample S_00000 |
|
805 |
+# get all disease values of sample S_00000 |
|
804 | 806 |
|
805 | 807 |
a$S_00000[which(names(a$S_00000) %in% "disease")] |
806 | 808 |
|