
Delete use_case_3.Rmd

SilviaC7 authored on 22/07/2021 18:31:00 • GitHub committed on 22/07/2021 18:31:00
Showing 1 changed file

deleted file mode 100644
@@ -1,242 +0,0 @@
---
title: 'RGMQL Example R Notebook: Use case 3'
author: "Silvia Cascianelli"
date: "`r Sys.Date()`"
output:
  pdf_document: default
  html_notebook: default
  html_document: default
  BiocStyle::html_document: default
editor_options:
  chunk_output_type: inline
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(cache = TRUE)
```

In this example we investigate the ENCODE ChIP-seq narrow and broad data, extracted from the GMQL remote repository, to automate all the steps needed to identify transcription factor (TF) high accumulation DNA zones using RGMQL together with TFHAZ, another R/Bioconductor package. The knowledge of the DNA regions in which transcription factors bind, in particular the HOT (High Occupancy Target) regions occupied by many different factors, is crucial to understand cancer genesis and develop new targeted therapies.

Load the RGMQL package and initialize the GMQL remote context of the scalable data management engine, specifying remote_processing = TRUE and, possibly, an authenticated login:

```{r, initialization}
library(RGMQL)
remote_url <- "http://www.gmql.eu/gmql-rest"
init_gmql(url = remote_url, remote_processing = TRUE)
          # , username = 'XXXX', password = 'XXXX')
```
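
If needed, the processing mode can also be switched later in the session. A minimal sketch, assuming RGMQL's remote_processing() helper (not part of the original notebook):

```{r, switch_processing, eval = FALSE}
# Hypothetical illustration: toggle between local and remote processing
# (assumes RGMQL's remote_processing(); not run in this notebook).
remote_processing(FALSE) # work locally on small intermediate steps
remote_processing(TRUE)  # back to remote before heavy operations
```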

Download and extract the list of datasets in the curated remote repository and focus on those concerning ENCODE:

```{r, available_datasets}
dataset_list <- show_datasets_list(remote_url)
list <- unlist(lapply(dataset_list[["datasets"]], function(x) x$name))
grep(pattern = 'ENCODE', x = list, value = TRUE)
```
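
Before building a query, it can help to inspect the region schema of a dataset of interest. A quick check, assuming RGMQL's show_schema() function:

```{r, inspect_schema, eval = FALSE}
# Inspect the region attributes of the narrow dataset
# (assumes RGMQL's show_schema(); not run in this notebook).
schema <- show_schema(remote_url, "public.HG19_ENCODE_NARROW_AUG_2017")
str(schema)
```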

Select ChIP-seq data from the ENCODE NARROW dataset AUG_2017, aligned to HG19:

```{r, read_NARROW}
Enc_Narrow <- read_gmql("public.HG19_ENCODE_NARROW_AUG_2017",
                        is_local = FALSE)
HM_TF_rep_narrow <- filter(Enc_Narrow, assay == "ChIP-seq" &
                           assembly == "hg19" &
                           project == "ENCODE" &
                           file_status == "released" &
                           biosample_term_name == "H1-hESC" &
                           output_type == "optimal idr thresholded peaks")
```

Select ChIP-seq data from the latest ENCODE BROAD dataset AUG_2017, aligned to HG19:

```{r, read_BROAD}
Enc_Broad <- read_gmql("public.HG19_ENCODE_BROAD_AUG_2017",
                       is_local = FALSE)
HM_TF_rep_broad <- filter(Enc_Broad, assay == "ChIP-seq" &
                          assembly == "hg19" &
                          project == "ENCODE" &
                          file_status == "released" &
                          biosample_term_name == "H1-hESC" &
                          output_type == "optimal idr thresholded peaks")
```

Take the union of the two previously generated datasets:

```{r, union}
HM_TF_rep <- union(HM_TF_rep_broad, HM_TF_rep_narrow)
```

Filter out samples subjected to pharmacological treatments or carrying specific "audit" markers:

```{r, cleaning}
HM_TF_rep_good_0 <- filter(HM_TF_rep, !biosample_treatments == "*" &
                           !(audit_error == "extremely low read depth" |
                             audit_error == "extremely low read length") &
                           !(audit_warning == "insufficient read depth") &
                           !(audit_not_compliant == "insufficient read depth" |
                             audit_not_compliant == "insufficient replicate concordance" |
                             audit_not_compliant == "missing input control" |
                             audit_not_compliant == "severe bottlenecking" |
                             audit_not_compliant == "unreplicated experiment"))
```

Filter out samples related to histone modifications (HM):

```{r, discard_HM}
TF_rep_good_0 <- filter(HM_TF_rep_good_0,
                        !(experiment_target == "H2AFZ-human" |
                          experiment_target == "H3F3A-human" |
                          experiment_target == "H3K27ac-human" |
                          experiment_target == "H3K27me3-human" |
                          experiment_target == "H3K36me3-human" |
                          experiment_target == "H3K4me1-human" |
                          experiment_target == "H3K4me2-human" |
                          experiment_target == "H3K4me3-human" |
                          experiment_target == "H3K79me2-human" |
                          experiment_target == "H3K9ac-human" |
                          experiment_target == "H3K9me1-human" |
                          experiment_target == "H3K9me2-human" |
                          experiment_target == "H3K9me3-human" |
                          experiment_target == "H4K20me1-human"))
```

Create a new region attribute with the length of each region; then, for each sample, compute the number of regions and the sum of the region lengths just created:

```{r, removal}
TF_rep_good_1 <- select(TF_rep_good_0,
                        regions_update = list(length = right - left))

TF_rep_good <- extend(TF_rep_good_1, Region_number = COUNT(),
                      sum_length = SUM("length"))
```

(1) Processing to obtain a dataset on which to find the filtering threshold:

```{r, filtering_threshold_1}
TF_rep_good_merged <- aggregate(TF_rep_good, groupBy =
                                conds(default = c("biosample_term_name")))

TF_rep_good_ordered <- arrange(TF_rep_good_merged,
                               regions_ordering = list(ASC("length")))

collect(TF_rep_good_ordered, name = "TF_rep_good_ordered")
job <- execute()
```

(1.1) Monitor the job status:

```{r, filtering_threshold_1_job_monitoring}
trace_job(remote_url, job$id)
```
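
For unattended runs, the status can also be polled until the job completes. A minimal sketch; the status field name is an assumption about the list returned by trace_job():

```{r, poll_job, eval = FALSE}
# Poll the remote job every 5 seconds until it leaves the PENDING/RUNNING
# states (the `status` field is an assumption about trace_job()'s return).
repeat {
  job_info <- trace_job(remote_url, job$id)
  if (!job_info$status %in% c("PENDING", "RUNNING")) break
  Sys.sleep(5)
}
job_info$status
```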

(2) Once the job status is 'SUCCESS', download the dataset resulting from the remote processing into the working directory of the local file system:

```{r, filtering_threshold_2_download_in_FS}
dataset_name <- job$datasets[[1]]$name
print(dataset_name)

grl_TF_rep_good_ordered <- download_as_GRangesList(remote_url, dataset_name)

download_dataset(remote_url, datasetName = dataset_name,
                 path = './Results_use_case_3')
```
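
To confirm that the download succeeded, the contents of the target folder can be listed with base R:

```{r, check_download, eval = FALSE}
# The downloaded dataset is a folder of region files plus metadata
list.files('./Results_use_case_3', recursive = TRUE)
```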

(3) Once the needed dataset is saved, perform in the R environment all the remaining operations required to retrieve the threshold value:

```{r, filtering_threshold_3}
name_sample <- names(grl_TF_rep_good_ordered) # single sample
g <- grl_TF_rep_good_ordered[[name_sample]] # extract the GRanges
Region_number_tot <- length(g) # count all the regions
n_up <- Region_number_tot * 0.95 # 95th percentile of the region number
n_up_1 <- n_up + 1
index <- which(g$order >= ceiling(n_up) & g$order <= floor(n_up_1))
region <- g[index]
```

(4) The chosen threshold is the length of the region at the 95th percentile of the region number:

```{r, filtering_threshold_4}
threshold <- region$length
threshold <- as.numeric(threshold)
threshold
```
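
As a sanity check, a comparable value can be computed directly from the region lengths; a minimal sketch, assuming the length region attribute is carried by the GRanges g of step (3):

```{r, threshold_check, eval = FALSE}
# Approximate the 95th-percentile region length directly
# (type = 1 returns an actual observed length instead of interpolating).
quantile(as.numeric(g$length), probs = 0.95, type = 1)
```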

Going back to RGMQL remote processing, keep only the regions with length of at least 100 and at most the threshold:

```{r, threshold_filtering}
TF_rep_good_filtered_0 <- filter(TF_rep_good,
                                 r_predicate = length >= 100 &
                                   length <= threshold)
```

Create new metadata for each sample, with the number of filtered regions and the sum of their lengths:

```{r, attributes_after_filtering}
TF_rep_good_filtered <- extend(TF_rep_good_filtered_0,
                               Region_number_filtered = COUNT(),
                               sum_length_filtered = SUM("length"))
```

Combine multiple replicate samples of the same TF experiment:

```{r, combine_TF_exp}
TF_0 <- cover(TF_rep_good_filtered, 1, ANY(),
              groupBy = conds("experiment_target"))
```

Add a new region attribute with the length of each region after sample combination:

```{r, regions_update}
TF_1 <- select(TF_0, regions_update = list(length = right - left))
```

Create new metadata for each sample, with the number of combined regions and the minimum, maximum and sum of their lengths:

```{r, attributes_after_cover}
TF <- extend(TF_1, Region_number_cover = COUNT(),
             sum_length_cover = SUM("length"),
             min_length_cover = MIN("length"),
             max_length_cover = MAX("length"))
```

Materialize the TF dataset into the repository, then download it both to mass memory and into main memory as a GRangesList:

```{r, main_execution}
collect(TF, name = "TF_res")
res <- execute()
```

```{r, main_job_monitoring}
# Monitor the job status:
trace_job(remote_url, res$id)
```

```{r, download_in_FS}
# Download the dataset as a folder:
res_name <- res$datasets[[1]]$name
download_dataset(remote_url, res_name, path = './Results_use_case_3')
```

```{r, GRangesList}
# Download the dataset as a GRangesList:
samples <- download_as_GRangesList(remote_url, res_name)
```
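
A quick look at what came back helps to catch empty results early; a simple check on the downloaded GRangesList:

```{r, inspect_samples, eval = FALSE}
# Number of TF samples retrieved and the names of the first few
length(samples)
head(names(samples), 3)
```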

Post-processing before the analysis with TFHAZ:

```{r, Post-processing}
# Build a vector pairing each region with its TF experiment target
TF <- vector()
len_rep <- sapply(samples, function(x) length(x)) # regions per sample

TF_rep <- mapply(function(x, l) {
  exp <- x$experiment_target
  TF_temp <- rep(exp, l) # repeat the TF name once per region
  TF <- append(TF, TF_temp)
}, samples@metadata, len_rep) # equivalently, metadata(samples)

TF <- unlist(TF_rep)
H1_hESC <- unlist(samples) # flatten the GRangesList into one GRanges
data <- data.frame(H1_hESC, TF)
H1_hESC <- as(data, "GRanges")
```
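
Before running TFHAZ, it is worth verifying that every region received a TF label; a quick check on the assembled GRanges:

```{r, check_labels, eval = FALSE}
# Regions per transcription factor; no NAs are expected
head(sort(table(H1_hESC$TF), decreasing = TRUE))
anyNA(H1_hESC$TF)
```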

After loading the TFHAZ package, find the transcription factor HOT DNA zones focusing on one chromosome at a time, by executing the following instructions:

```{r, TFHAZ_analysis}
library(TFHAZ)
TF_acc_21_w_0 <- accumulation(H1_hESC, "TF", "chr21", 0)
d_zones <- high_accumulation_zones(TF_acc_21_w_0, method = "overlaps",
                                   threshold = "std")
print(d_zones)
```
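
The accumulation signal can also be inspected visually; a hedged sketch, assuming the plot_accumulation() helper is available in the installed TFHAZ version:

```{r, TFHAZ_plot, eval = FALSE}
# Plot the TF accumulation vector along chr21
# (plot_accumulation() availability is an assumption; check your TFHAZ version).
plot_accumulation(TF_acc_21_w_0)
```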

Log out from the remote engine:

```{r, logout}
logout_gmql(remote_url)
```