Browse code

vignette, add conversion 0-based/1-based

Simone authored on 18/02/2018 15:45:34
Showing 6 changed files

... ...
@@ -92,15 +92,30 @@ import_gmql <- function(dataset_path, is_gtf)
92 92
         name_samples <- lapply(regions, function(x){
93 93
             gsub("*.gdm", "",basename(x))})
94 94
         vector_field <- .schema_header(datasetName)
95
-
95
+        type_and_coord <- .schema_type_coordinate(datasetName)
96 96
         names(vector_field) <- NULL
97
-        sampleList <- lapply(regions,function(x){
98
-            df <- read.delim(x,col.names = vector_field,header = FALSE)
99
-            g <- GenomicRanges::makeGRangesFromDataFrame(df,
100
-                                            keep.extra.columns = TRUE,
101
-                                            start.field = "left",
102
-                                            end.field = "right")
103
-        })
97
+        if(type_and_coord$coordinate_system %in% c("1-based"))
98
+        {
99
+            sampleList <- lapply(regions,function(x){
100
+                df <- read.delim(x,col.names = vector_field,header = FALSE)
101
+                g <- GenomicRanges::makeGRangesFromDataFrame(df,
102
+                        keep.extra.columns = TRUE,
103
+                        start.field = "left",
104
+                        end.field = "right")
105
+            })
106
+        }
107
+        else
108
+        {
109
+            sampleList <- lapply(regions,function(x){
110
+                df <- read.delim(x,col.names = vector_field,header = FALSE)
111
+                df$left = df$left +1
112
+                g <- GenomicRanges::makeGRangesFromDataFrame(df,
113
+                        keep.extra.columns = TRUE,
114
+                        start.field = "left",
115
+                        end.field = "right")
116
+            })
117
+        }
118
+       
104 119
         names(sampleList) <- name_samples
105 120
         gRange_list <- GenomicRanges::GRangesList(sampleList)
106 121
     }
... ...
@@ -121,6 +121,7 @@ export_gmql <- function(samples, dir_out, is_gtf)
121 121
         lapply(samples,function(x,dir){
122 122
             sample_name <- file.path(dir,paste0("S_",cnt(),".gdm"))
123 123
             region_frame <- data.frame(x)
124
+            region_frame$start = region_frame$start - 1
124 125
             write.table(region_frame,sample_name,col.names = FALSE,
125 126
                             row.names = FALSE, sep = '\t',quote = FALSE)
126 127
         },files_sub_dir)
... ...
@@ -183,6 +184,17 @@ export_gmql <- function(samples, dir_out, is_gtf)
183 184
     xml2::xml_attr(root,"name") <- "DatasetName_SCHEMAS"
184 185
     xml2::xml_attr(root,"xmlns") <- "http://genomic.elet.polimi.it/entities"
185 186
     xml2::xml_add_child(root,"gmqlSchema")
187
+    if(to_GTF)
188
+    {
189
+        xml2::xml_attr(root,"type") <- "gtf"
190
+        xml2::xml_attr(root,"coordinate_system") <- "1-based"
191
+    }
192
+    else
193
+    {
194
+        xml2::xml_attr(root,"type") <- "tab"
195
+        xml2::xml_attr(root,"coordinate_system") <- "0-based"
196
+    }
197
+    
186 198
     gmqlSchema <- xml2::xml_child(root,1)
187 199
 
188 200
     names_node <- names(node_list)
... ...
@@ -36,6 +36,27 @@
36 36
     vector_field <- unlist(list_field)
37 37
 }
38 38
 
39
+.schema_type_coordinate <- function(datasetName)
40
+{
41
+    schema_name <- list.files(datasetName, pattern = "*.schema$",
42
+                              full.names = TRUE)
43
+    
44
+    schema_name_xml <- list.files(datasetName, pattern = "*.xml$",
45
+                                  full.names = TRUE)
46
+    
47
+    if(!length(schema_name) && !length(schema_name_xml))
48
+        stop("schema not present")
49
+    
50
+    if(!length(schema_name))
51
+        xml_schema <- xml2::read_xml(schema_name_xml)
52
+    else
53
+        xml_schema <- xml2::read_xml(schema_name)
54
+    
55
+    gmql_schema_tag <- xml2::xml_children(xml_schema)
56
+    all_attrs <- xml2::xml_attrs(gmql_schema_tag)
57
+    all_attrs_list <- as.list(all_attrs[[1]])
58
+}
59
+
39 60
 # aggregates factory
40 61
 .aggregates <- function(meta_data,class)
41 62
 {
... ...
@@ -11,7 +11,8 @@
11 11
 #' \item{TAB: tab-delimited file format}
12 12
 #' \item{GTF: tab-delimited text standard format based on the General 
13 13
 #' Feature Format}
14
-#' \item{COLLECT: used for storing output in memory}
14
+#' \item{COLLECT: used for storing output in memory (only in the case of local 
15
+#' processing, i.e., remote_processing = FALSE,)}
15 16
 #' }
16 17
 #' @param remote_processing logical value specifying the processing mode.
17 18
 #' True for processing on cluster (remote), false for local processing.
... ...
@@ -22,8 +23,8 @@
22 23
 #' You can always perform it by calling the function \code{\link{login_gmql}} 
23 24
 #' explicitly
24 25
 #' 
25
-#' @param username string name used during signup 
26
-#' @param password string password used during signup
26
+#' @param username string name used during remote server signup 
27
+#' @param password string password used during remote server signup
27 28
 #' 
28 29
 #' @return None
29 30
 #'
... ...
@@ -152,7 +152,7 @@ gmql_materialize <- function(input_data, dir_out, name)
152 152
         res_dir_out <- dir_out
153 153
     
154 154
     if(grepl("\\.",name))
155
-        stop("name dataset cannot contains dot")
155
+        stop("dataset name cannot contains dot")
156 156
     
157 157
     response <- WrappeR$materialize(input_data, res_dir_out)
158 158
     error <- strtoi(response[1])
... ...
@@ -753,14 +753,15 @@ By default *suffix* correspond to a metadata: *antibody_target*.
753 753
 ## Metadata
754 754
 
755 755
 Each sample of a GMQL dataset has its own metadata associated and generally 
756
-every metadata attribute has a single value.The case with distinct values 
757
-for the same metadata attribute, is showed in the figure below for the disease 
758
-metadata attribute 
756
+every metadata attribute has a single value. The case with distinct values 
757
+for the same metadata attribute is shown in the figure below for the 
758
+\emph{disease} metadata attribute.
759 759
 \newline\newline
760 760
 ![metadata with multiple values](multi_metadata.png)
761 761
 \newline\newline
762
-In this case GMQL automcatically handles this situation.
763
-Import/export paragraph we showed that a GMQL dataset can be imported into R environment as a GRangesList, and so its metadata too.
762
+In this case GMQL automatically handles this situation. In the Import/export 
763
+paragraph, we showed that a GMQL dataset can be imported into R environment 
764
+as a GRangesList, and so its metadata too.
764 765
 
765 766
 ```{r, metadata}
766 767
 # This statement defines the path to the folder "DATASET_META" in the 
... ...
@@ -781,8 +782,9 @@ metadata(grl_data)
781 782
 The metadata are stored as simple list in the form key-values and it does not 
782 783
 matter if mutiple values for the same metadata attribute are present; 
783 784
 all values are stored and shown.
784
-Difficulties can arise when we need to get all the metadata values; normally 
785
-since list is in the form key-value, we can extract the metadata values using:
785
+Difficulties can arise when we need to get all the metadata values; normally, 
786
+since the metadata list is in the form key-value, we can extract the metadata 
787
+values using:
786 788
 
787 789
 ```{r, retrieve_value}
788 790
 
... ...
@@ -800,7 +802,7 @@ all disease values, we should use instead:
800 802
 
801 803
 ```{r, retrieve_values}
802 804
 
803
-# get all disease value of sample S_00000
805
+# get all disease values of sample S_00000
804 806
 
805 807
 a$S_00000[which(names(a$S_00000) %in% "disease")]
806 808