1
|
1
|
deleted file mode 100644
|
...
|
...
|
@@ -1,296 +0,0 @@
|
1
|
|
-output:
|
2
|
|
- html_document: default
|
3
|
|
- pdf_document: default
|
4
|
|
-
|
5
|
|
-title: 'RGMQL Example R Notebook: Use case 1'
|
6
|
|
-author: "Silvia Cascianelli"
|
7
|
|
-date: "`r Sys.Date()`"
|
8
|
|
-output:
|
9
|
|
- pdf_document: default
|
10
|
|
- html_notebook: default
|
11
|
|
- html_document: default
|
12
|
|
- BiocStyle::html_document:
|
13
|
|
- chunk_output_type: inline
|
14
|
|
-
|
15
|
|
-In this example we investigate the TCGA mutational data, extracted from the GMQL remote repository, to evaluate the most mutated gene regions in patients affected by Kidney Renal Clear Cell Carcinoma and younger than 65 years.
|
16
|
|
-
|
17
|
|
-Load the RGMQL package and initialize the remote GMQL context of scalable data management engine, specifying remote_processing = TRUE, and, possibly, an authenticated login:
|
18
|
|
-
|
19
|
|
-```{r, initialization}
|
20
|
|
-library(RGMQL)
|
21
|
|
-remote_url <- "http://www.gmql.eu/gmql-rest"
|
22
|
|
-init_gmql( url = remote_url, remote_processing = TRUE) # , username = 'XXXX', password = 'XXXX')
|
23
|
|
-```
|
24
|
|
-
|
25
|
|
-Download and extract the list of datasets in the curated remote repository and focus on those concerning mutation events:
|
26
|
|
-
|
27
|
|
-```{r, available_datasets}
|
28
|
|
-dataset_list <- show_datasets_list(remote_url)
|
29
|
|
-list <- unlist(lapply(dataset_list[["datasets"]], function(x) x$name))
|
30
|
|
-grep(pattern = 'mutation', x = list, value = TRUE)
|
31
|
|
-
|
32
|
|
-```
|
33
|
|
-
|
34
|
|
-Choose one dataset of interest and explore its schema and metadata:
|
35
|
|
-
|
36
|
|
-```{r,schema_and_metadata}
|
37
|
|
-schema<-show_schema(remote_url,datasetName = "public.GRCh38_TCGA_somatic_mutation_masked_2019_10")
|
38
|
|
-all_metadata <- show_all_metadata(dataset = "public.GRCh38_TCGA_somatic_mutation_masked_2019_10")
|
39
|
|
-```
|
40
|
|
-
|
41
|
|
-Once the dataset is chosen, read it together with the dataset containing the *RefSeq* gene annotations of the GRCh38 reference genome:
|
42
|
|
-
|
43
|
|
-```{r, read_datasets}
|
44
|
|
-GRCh38_TCGA_mut <- read_gmql(dataset = "public.GRCh38_TCGA_somatic_mutation_masked_2019_10", is_local = FALSE)
|
45
|
|
-RefSeq_GRCh38 <- read_gmql(dataset = "public.GRCh38_ANNOTATION_REFSEQ", is_local = FALSE)
|
46
|
|
-```
|
47
|
|
-
|
48
|
|
-Filter GRCh38_TCGA_mut based on a metadata predicate to keep Kidney Renal Clear Cell Carcinoma mutations only; then, stratify based on patient age:
|
49
|
|
-
|
50
|
|
-```{r, mut_stratification}
|
51
|
|
-mut <- filter(GRCh38_TCGA_mut, biospecimen__admin__disease_code == "KIRC")
|
52
|
|
-
|
53
|
|
-mut_under65 <- filter(GRCh38_TCGA_mut, clinical__clin_shared__age_at_initial_pathologic_diagnosis < 65 &
|
54
|
|
- biospecimen__admin__disease_code == "KIRC")
|
55
|
|
-```
|
56
|
|
-
|
57
|
|
-Filter out RefSeq_GRCh38 based on a metadata predicate to keep *RefSeq* gene regions only:
|
58
|
|
-
|
59
|
|
-```{r, RefSeq_GRCh38}
|
60
|
|
-genes = filter(RefSeq_GRCh38, annotation_type == 'gene' &
|
61
|
|
- provider == 'RefSeq')
|
62
|
|
-```
|
63
|
|
-
|
64
|
|
-For each mutation sample, map the mutations to the involved gene using the map() function and count mutations within each gene region storing automatically the value in the default region attribute *'count_left_right':*
|
65
|
|
-
|
66
|
|
-```{r, map}
|
67
|
|
-geneMut_data1_under65 <- map(genes, mut_under65)
|
68
|
|
-```
|
69
|
|
-
|
70
|
|
-In each sample, remove the genes that do not contain mutations:
|
71
|
|
-
|
72
|
|
-```{r, removal}
|
73
|
|
-geneMut_data2_under65 <- filter(geneMut_data1_under65, r_predicate = count_left_right >= 1)
|
74
|
|
-
|
75
|
|
-```
|
76
|
|
-
|
77
|
|
-Using the *extend()* function, count how many genes remain in each sample and store the result as a new attribute named '*geneMut_count'*:
|
78
|
|
-
|
79
|
|
-```{r, geneMut_count}
|
80
|
|
-geneMut_data3_under65 <- extend(geneMut_data2_under65, geneMut_count = COUNT())
|
81
|
|
-```
|
82
|
|
-
|
83
|
|
-Order samples in descending order of *geneMut_count:*
|
84
|
|
-
|
85
|
|
-```{r, sorting}
|
86
|
|
-geneMut_data_res_under65 = arrange(geneMut_data3_under65, list(DESC("geneMut_count")))
|
87
|
|
-```
|
88
|
|
-
|
89
|
|
-Launch the remote processing execution to materialize resulting datasets:
|
90
|
|
-
|
91
|
|
-```{r, job_execution, eval=FALSE}
|
92
|
|
-collect(geneMut_data_res_under65, name = "geneMut_data_res_under65")
|
93
|
|
-JOB<-execute()
|
94
|
|
-```
|
95
|
|
-
|
96
|
|
-Monitor the job status:
|
97
|
|
-
|
98
|
|
-```{r, job_monitoring, eval=FALSE}
|
99
|
|
-trace_job(remote_url, JOB$id)
|
100
|
|
-```
|
101
|
|
-
|
102
|
|
-Once the job status is 'SUCCESS' download the resulting datasets obtained remotely in the working directory of the local File System:
|
103
|
|
-
|
104
|
|
-```{r, download_in_FS, eval=FALSE}
|
105
|
|
-name_dataset_under <- JOB$datasets[[1]]$name
|
106
|
|
-download_dataset(remote_url, name_dataset_under, path='./Results_use_case_1')
|
107
|
|
-
|
108
|
|
-```
|
109
|
|
-
|
110
|
|
-Download the resulting datasets as GRangesList objects also in the current R environment:
|
111
|
|
-
|
112
|
|
-```{r, GRangesList, eval=FALSE}
|
113
|
|
-grl_mut_under <- download_as_GRangesList(remote_url, name_dataset_under)
|
114
|
|
-
|
115
|
|
-```
|
116
|
|
-
|
117
|
|
-Log out from remote engine:
|
118
|
|
-
|
119
|
|
-```{r, logout}
|
120
|
|
-logout_gmql(remote_url)
|
121
|
|
-```
|
122
|
|
-
|
123
|
|
-Compute the main statistics related to the *geneMut_count* values distribution in the two assessed conditions:
|
124
|
|
-
|
125
|
|
-```{r, echo=FALSE}
|
126
|
|
-# path_under <- './Results_use_case_1/_20210531_150133_geneMut_data_res_under65 - compatibleWithGRangesList/files'
|
127
|
|
-# files <- list.files(path=path_under,
|
128
|
|
-# pattern='*.gtf$', full.names=FALSE)
|
129
|
|
-# f<-files[1]
|
130
|
|
-# setwd(path_under)
|
131
|
|
-# for (f in files){
|
132
|
|
-# D<-readLines(f)
|
133
|
|
-# D_u <- gsub("seqid", "sequence", D)
|
134
|
|
-# cat(D_u, file=f, sep="\n")
|
135
|
|
-# Data<-read.table(f, sep='\t')
|
136
|
|
-# }
|
137
|
|
-#
|
138
|
|
-remote_processing(FALSE)
|
139
|
|
-grl_mut_under_D <- import_gmql('./Results_use_case_1/_20210531_150133_geneMut_data_res_under65 - compatibleWithGRangesList', TRUE)
|
140
|
|
-
|
141
|
|
-###OR when map(mut_under65, genes)
|
142
|
|
-# path <- './Results_use_case_1/map(mutations,genes)'
|
143
|
|
-# setwd(path)
|
144
|
|
-# library('xlsx')
|
145
|
|
-# write.xlsx(grl_mut_under@unlistData@elementMetadata, file='KIRC_MUTATIONAL_EVENTS.xlsx', sheetName = "mut",
|
146
|
|
-# col.names = TRUE, row.names = FALSE, append = FALSE)
|
147
|
|
-```
|
148
|
|
-
|
149
|
|
-```{r, statistics}
|
150
|
|
-summary(as.numeric(grl_mut_under_D@unlistData@elementMetadata@listData[["count_left_right"]]))
|
151
|
|
-quantile(as.numeric(grl_mut_under_D@unlistData@elementMetadata@listData[["count_left_right"]]), c(.90, .95, .99))
|
152
|
|
-```
|
153
|
|
-
|
154
|
|
-Plot distributions of *geneMut_count,* i.e. number of mutated genes per patient:
|
155
|
|
-
|
156
|
|
-```{r, geneMut_count_distributions}
|
157
|
|
-
|
158
|
|
-meta_lst <- grl_mut_under_D@metadata
|
159
|
|
-
|
160
|
|
-
|
161
|
|
-mut_lst <- lapply(meta_lst, function(x) as.numeric(x[["geneMut_count"]]))
|
162
|
|
-mut_ord <- mut_lst[order(unlist(mut_lst), decreasing = FALSE)]
|
163
|
|
-
|
164
|
|
-library(ggplot2)
|
165
|
|
-p1 <- plot(x = seq(1,5*length(mut_ord), 5), mut_ord, xlab = '', ylab= 'Number of mutated genes', xaxt = "n")
|
166
|
|
-title(xlab="Patient samples", line=-1, cex.lab=1)
|
167
|
|
-
|
168
|
|
-xtick <- seq(1, 5*length(mut_ord), 5)
|
169
|
|
-xlabels <- names(mut_ord)
|
170
|
|
-axis(side = 1, at = xtick, labels = xlabels, las = 2, cex.axis = 0.5)
|
171
|
|
-
|
172
|
|
-
|
173
|
|
-summary(unlist(mut_lst))
|
174
|
|
-```
|
175
|
|
-
|
176
|
|
-```{r, results = 'hide'}
|
177
|
|
-final <- list()
|
178
|
|
-matrix(nrow = length(grl_mut_under_D@metadata), ncol = 3) #gene, mut, gene_length=irange_width
|
179
|
|
-
|
180
|
|
-library(GenomicRanges)
|
181
|
|
-for (n in names(grl_mut_under_D@metadata)){ #for each of the 217 Granges
|
182
|
|
- gr <- grl_mut_under_D[[n]]
|
183
|
|
- final[[n]] <- data.frame('gene' = gr@elementMetadata@listData[["gene_symbol"]], 'mut_count' = as.numeric(gr@elementMetadata@listData[["count_left_right"]]), 'width' = width(ranges(grl_mut_under_D[[n]])))
|
184
|
|
-}
|
185
|
|
-
|
186
|
|
-
|
187
|
|
-```
|
188
|
|
-
|
189
|
|
-Compute and plot the distributions of all the mutational events occurring on each patient:
|
190
|
|
-
|
191
|
|
-```{r}
|
192
|
|
-tot_mut_pat <- list()
|
193
|
|
-max_mut_pat <- list()
|
194
|
|
-mut_pat <- sapply(final, '[[', 'mut_count')
|
195
|
|
-
|
196
|
|
-for (i in 1:length(mut_pat)){
|
197
|
|
- tot_mut_pat[[i]] <- sum(as.numeric(mut_pat[[i]]))
|
198
|
|
- max_mut_pat[[i]] <-max(as.numeric(mut_pat[[i]]))
|
199
|
|
-}
|
200
|
|
-names(tot_mut_pat) <- names(mut_pat)
|
201
|
|
-tot_mut_pat_ord <- tot_mut_pat[order(unlist(tot_mut_pat), decreasing = FALSE)]
|
202
|
|
-
|
203
|
|
-library(ggplot2)
|
204
|
|
-p2 <- plot(x = seq(1,5*length(tot_mut_pat_ord), 5), tot_mut_pat_ord, xlab = '', ylab= 'Number of mutational events', xaxt = "n")
|
205
|
|
-title(xlab="Patient samples", line = -1, cex.lab = 1)
|
206
|
|
-
|
207
|
|
-xtick <- seq(1, 5*length(mut_ord), 5)
|
208
|
|
-xlabels <- names(tot_mut_pat_ord)
|
209
|
|
-axis(side = 1, at = xtick, labels = xlabels, las = 2, cex.axis = 0.5)
|
210
|
|
-```
|
211
|
|
-
|
212
|
|
-```{r}
|
213
|
|
-genes <- unlist(sapply(final, select, "gene"))
|
214
|
|
-lengths <- unlist(sapply(final, select, "width"))
|
215
|
|
-mapping <- data.frame('gene' = genes, 'gene_length' = lengths, row.names = NULL)
|
216
|
|
-all_genes <- unique(genes)
|
217
|
|
-all_genes_len <- mapping[which(!duplicated(mapping[,1])),]
|
218
|
|
-
|
219
|
|
-gene_m <- matrix(nrow = length(all_genes), ncol = 6)
|
220
|
|
-gene_df <- data.frame(gene_m)
|
221
|
|
-colnames(gene_df) <- c('Gene', 'Length', 'Mutated_Patients', 'Mutation_counts', 'Mutation_counts_norm', 'Mutation_counts_div_len')
|
222
|
|
-gene_df[,1] <- all_genes_len[,1]
|
223
|
|
-gene_df[,2] <- all_genes_len[,2]
|
224
|
|
-for (i in 1:length(all_genes)){ #for each gene
|
225
|
|
- counter_pt <- 0
|
226
|
|
- counter_mut <- 0
|
227
|
|
- counter_mut_norm_pat <- 0
|
228
|
|
- counter_mut_norm_len <- 0
|
229
|
|
- for(j in 1:length(final)){ #for each patient
|
230
|
|
- f <- final[[j]]
|
231
|
|
- if (gene_df[i,1] %in% f[["gene"]]){
|
232
|
|
- counter_pt <- counter_pt+1
|
233
|
|
- counter_mut <- counter_mut + as.numeric(f[["mut_count"]][which(f[["gene"]] == gene_df[i,1])[1]]) #[1] to avoid issues when there are multiple regions of the same gene and duplicated symbols
|
234
|
|
- counter_mut_norm_len <- counter_mut / gene_df[i,2]
|
235
|
|
- counter_mut_norm_pat <- counter_mut_norm_pat + as.numeric(f[["mut_count"]][which(f[["gene"]] == gene_df[i,1])[1]]) / tot_mut_pat[[j]]
|
236
|
|
- }
|
237
|
|
- gene_df[i,3] <- counter_pt
|
238
|
|
- gene_df[i,4] <- counter_mut
|
239
|
|
- gene_df[i,5] <- counter_mut_norm_pat
|
240
|
|
- gene_df[i,6] <- counter_mut_norm_len
|
241
|
|
-
|
242
|
|
- }
|
243
|
|
-
|
244
|
|
-}
|
245
|
|
-
|
246
|
|
-```
|
247
|
|
-
|
248
|
|
-```{r}
|
249
|
|
-gene_df <- gene_df[order(gene_df$Mutation_counts, decreasing=TRUE),]
|
250
|
|
-summary(gene_df)
|
251
|
|
-
|
252
|
|
-library(ggplot2)
|
253
|
|
-p<-plot(x=seq(1:20), y = gene_df[1:20,4], main='Mutation counts in top mutated genes', ylab = 'Number of mutations across patient samples', xlab = NA, xaxt="n")
|
254
|
|
-
|
255
|
|
-xtick <- seq(1, 20)
|
256
|
|
-axis(side = 1, at = xtick, labels = gene_df[1:20,1], las = 2, cex.axis = 0.75)
|
257
|
|
-
|
258
|
|
-```
|
259
|
|
-
|
260
|
|
-```{r}
|
261
|
|
-library(ggplot2)
|
262
|
|
-p <- plot(x = gene_df[1:20,2], y = gene_df[1:20,4], ylab = 'Number of mutations across samples', xlab = NA, main='Mutation counts compared to gene lengths in top mutated genes', xaxt="n") #xlab = 'Genes'
|
263
|
|
-
|
264
|
|
-xtick <- gene_df[1:20,2]
|
265
|
|
-axis(side = 1, at = xtick, labels = gene_df[1:20,1], las = 2, cex.axis = 0.75)
|
266
|
|
-```
|
267
|
|
-
|
268
|
|
-```{r}
|
269
|
|
-gene_df <- gene_df[order(gene_df$Mutation_counts_div_len, decreasing=TRUE),] #descending gene_mut_count_order
|
270
|
|
-summary(gene_df)
|
271
|
|
-library(ggplot2)
|
272
|
|
-p <- plot(x = seq(1:20), y = gene_df[1:20,6], ylab = 'Mutations divided by gene lengths', xlab = NA, main = 'Top mutated genes based on mutation count divided by gene length', xaxt = "n") #xlab = 'Genes'
|
273
|
|
-
|
274
|
|
-xtick <- seq(1:20)
|
275
|
|
-axis(side = 1, at = xtick, labels = gene_df[1:20,1], las = 2, cex.axis = 0.7)
|
276
|
|
-```
|
277
|
|
-
|
278
|
|
-```{r}
|
279
|
|
-gene_df <- gene_df[order(gene_df$Mutated_Patients, decreasing=TRUE),] #descending gene_mut_count_order
|
280
|
|
-summary(gene_df)
|
281
|
|
-library(ggplot2)
|
282
|
|
-p <- plot(x = seq(1:20), y = gene_df[1:20,3]/217*100, ylab = 'Percentage of mutated patients (%)', xlab = NA, main = 'Top mutated genes based on percentage of mutated patients', xaxt = "n") #xlab = 'Genes'
|
283
|
|
-
|
284
|
|
-xtick <- seq(1:20)
|
285
|
|
-axis(side = 1, at = xtick, labels = gene_df[1:20,1], las = 2, cex.axis = 0.75)
|
286
|
|
-```
|
287
|
|
-
|
288
|
|
-```{r, end}
|
289
|
|
-library('xlsx')
|
290
|
|
-#write.xlsx(gene_df, file='KIRC_MUTATIONS.xlsx', sheetName = "mut",
|
291
|
|
-# col.names = TRUE, row.names = FALSE, append = FALSE)
|
292
|
|
-```
|