In this example we investigate the TCGA messenger and micro RNA gene expression data of patients affected by Adrenocortical carcinoma, showing how to to merge and process local data (even possibly private) with remote public data in mixed processing, performing analyses both remotely and on the local R/Bioconductor environment.

Load the RGMQL package and initialize the remote GMQL context of scalable data management engine, specifying remote_processing = TRUE, and, possibly, an authenticated login:

library(RGMQL)
## Caricamento del pacchetto richiesto: RGMQLlib
## GMQL successfully loaded
remote_url <- "http://www.gmql.eu/gmql-rest"
init_gmql( url = remote_url, remote_processing = TRUE) # , username = 'XXXX', password = 'XXXX')
## [1] "your Token is 30d972ed-c75e-4093-99cc-f4f7ab804df2"

Read the TCGA miRNA gene expression data of Adrenocortical carcinoma aligned to GRCh38 from a local GDM dataset:

GRCh38_miRNA_path <- 'GRCh38_miRNA_ACC'
GRCh38_miRNA_ACC <- read_gmql(GRCh38_miRNA_path, is_local = TRUE)

Download and extract the list of datasets in the curated remote repository and focus on those concerning TCGA:

remote_processing(TRUE)
## [1] "Remote processing On"
dataset_list <- show_datasets_list(remote_url)
list <- unlist(lapply(dataset_list[["datasets"]], function(x) x$name))
grep(pattern = 'TCGA', x = list, value = TRUE)
##  [1] "GRCh38_TCGA_copy_number"                     
##  [2] "GRCh38_TCGA_copy_number_2018_12"             
##  [3] "GRCh38_TCGA_copy_number_2019_10"             
##  [4] "GRCh38_TCGA_copy_number_masked"              
##  [5] "GRCh38_TCGA_copy_number_masked_2018_12"      
##  [6] "GRCh38_TCGA_copy_number_masked_2019_10"      
##  [7] "GRCh38_TCGA_gene_expression"                 
##  [8] "GRCh38_TCGA_gene_expression_2018_12"         
##  [9] "GRCh38_TCGA_gene_expression_2019_10"         
## [10] "GRCh38_TCGA_methylation"                     
## [11] "GRCh38_TCGA_methylation_2018_12"             
## [12] "GRCh38_TCGA_methylation_2019_10"             
## [13] "GRCh38_TCGA_miRNA_expression"                
## [14] "GRCh38_TCGA_miRNA_expression_2018_12"        
## [15] "GRCh38_TCGA_miRNA_expression_2019_10"        
## [16] "GRCh38_TCGA_miRNA_isoform_expression"        
## [17] "GRCh38_TCGA_miRNA_isoform_expression_2018_12"
## [18] "GRCh38_TCGA_miRNA_isoform_expression_2019_10"
## [19] "GRCh38_TCGA_somatic_mutation_masked"         
## [20] "GRCh38_TCGA_somatic_mutation_masked_2018_12" 
## [21] "GRCh38_TCGA_somatic_mutation_masked_2019_10" 
## [22] "HG19_TCGA_cnv"                               
## [23] "HG19_TCGA_dnamethylation"                    
## [24] "HG19_TCGA_dnaseq"                            
## [25] "HG19_TCGA_mirnaseq_isoform"                  
## [26] "HG19_TCGA_mirnaseq_mirna"                    
## [27] "HG19_TCGA_rnaseq_exon"                       
## [28] "HG19_TCGA_rnaseq_gene"                       
## [29] "HG19_TCGA_rnaseq_spljxn"                     
## [30] "HG19_TCGA_rnaseqv2_exon"                     
## [31] "HG19_TCGA_rnaseqv2_gene"                     
## [32] "HG19_TCGA_rnaseqv2_isoform"                  
## [33] "HG19_TCGA_rnaseqv2_spljxn"

Choose the latest TCGA gene expression dataset of interest, aligned to GRCh38, read it and extract all the samples of patients affected by Adrenocortical carcinoma (ACC) :

GRCh38_TCGA_RNAseq <- read_gmql(dataset = "public.GRCh38_TCGA_gene_expression_2019_10", is_local = FALSE)

GRCh38_TCGA_RNAseq_ACC <- filter(GRCh38_TCGA_RNAseq, gdc__project__project_id == "TCGA-ACC")

Join the two datasets based on the biospecimen_aliquot__bcr_aliquot_barcode and keeping for each miRNA region the first gene region at minimum distance ((MD(1)):

ACC_mRNA_miRNA <- merge(GRCh38_miRNA_ACC,
                  GRCh38_TCGA_RNAseq_ACC,    
                  genometric_predicate = list(MD(1)),
                  region_output = "BOTH",
                  joinBy = conds
                  ('biospecimen__bio__bcr_analyte_barcode'))

Launch the remote processing execution to materialize the resulting ordered dataset:

collect(ACC_mRNA_miRNA, name = "ACC_mRNA_miRNA")
job<-execute()

Monitor the job status:

trace_job(remote_url, job$id)

Once the job status is ‘SUCCESS’ download the resulting dataset obtained remotely in the working directory of the local File System and turn the remote processing off:

name_dataset <- job$datasets[[1]]$name
download_dataset(remote_url, name_dataset, path = './Results_use_case_2')

Import mRNA and microRNA gene annotations together with their corresponding raw count values from the dataset just saved in the local File System to the current R environment, within a GRanges object:

remote_processing(FALSE)
## [1] "Remote processing off"
path <- paste('./Results_use_case_2', name_dataset, sep = '/')
GR_ACC <- filter_and_extract(path, metadata = NULL, region_attributes = c('right.gene_symbol','right.htseq_count','left.mirna_id','left.read_count'))
## New names:
## * right.gene_symbol -> right.gene_symbol...1
## * right.htseq_count -> right.htseq_count...2
## * left.mirna_id -> left.mirna_id...3
## * left.read_count -> left.read_count...4
## * right.gene_symbol -> right.gene_symbol...5
## * ...

Show all metadata to extract sample IDs and clinical annotations:

setwd('./Results_use_case_2')
meta_table <- show_all_metadata(name_dataset, show_value = TRUE)
sample_IDs<-unlist(meta_table['left.biospecimen__shared__bcr_patient_barcode',])
status<- unlist(meta_table["left.clinical__clin_shared__person_neoplasm_cancer_status",])
stages<- unlist(meta_table['left.gdc__diagnoses__tumor_stage',])
grades<- unlist(meta_table['left.gdc__diagnoses__tumor_grade',])

Log out from remote engine:

logout_gmql(remote_url)
## [1] "Logout"

Extract different samples as rows, considering as features an ordered joint list of raw counts for each mRNA and miRNA gene under analysis

all_genes <- c(GR_ACC@elementMetadata@listData[['right.gene_symbol...1']],GR_ACC@elementMetadata@listData[['left.mirna_id...3']])
all_genes_ord <- unique(sort(all_genes))

columns <- length(all_genes_ord)
rawMatrix <- matrix(nrow = 0, ncol = columns)

for (i in seq(1, length(GR_ACC@elementMetadata@listData), 4)) {
  g <- paste('right.gene_symbol...', toString(i), sep = '')
  m <- paste('left.mirna_id...', toString(i+2), sep = '')
  gr <- paste('right.htseq_count...', toString(i+1), sep = '')
  mr <- paste('left.read_count...', toString(i+3), sep = '')
  all_genes <- c(GR_ACC@elementMetadata@listData[[g]]
               ,GR_ACC@elementMetadata@listData[[m]])
  all_reads <- as.numeric(c(GR_ACC@elementMetadata@listData[[gr]],                      GR_ACC@elementMetadata@listData[[mr]]))
  names(all_reads) <- all_genes
  all_reads_ord <- all_reads[c(all_genes_ord)]
  rawMatrix <- rbind(rawMatrix, all_reads_ord)
}

rawData <- data.frame(rawMatrix)
rownames(rawData) <- sample_IDs

Discard all the mRNA or miRNA genes having a null first quartile value across the samples and normalize each sample by centering and dividing by standard deviation:

LQ <- apply(rawMatrix, 2, function(x) summary(x)[2])
kept <- which(LQ > 0)
dataset <- rawData[,kept]

library(BBmisc)
## 
## Caricamento pacchetto: 'BBmisc'
## Il seguente oggetto è mascherato da 'package:base':
## 
##     isFALSE
dataset_n <- normalize(dataset, 'standardize')

Use the remaining genes as features of each sample to perform a clustering analysis and identify subgroups of samples through hierarchical clustering. First, identify the optimal number of clusters based on the average silhouette score:

library(factoextra)
## Caricamento del pacchetto richiesto: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#library(cluster)

#Choice of the number of clusters
res <- fviz_nbclust(dataset_n, hcut , method = "silhouette", k.max = 10) 
plot(res)

Apply Ward Hierarchical Clustering using Spearman correlation as similarity metric; then cut the dendogram to highlight three sample clusters:

# 
d <- get_dist(x = dataset_n, method = "spearman") # distance matrix
hclust_model <- hclust(d, method = "ward.D")
plot(hclust_model) # display dendogram
clusters <- cutree(hclust_model, k = 3) # cut tree into clusters
# draw dendogram with coloured borders around the clusters
rect.hclust(hclust_model, k = 3, border = 2:5)

Plot the so-obtained clusters:

fviz_cluster(list(data = dataset_n, cluster = clusters),labelsize = 6)

Now compare by a left join the clusters obtained with this clustering analysis and the clustering results published by the TCGA consortium in their Comprehensive Pan-Genomic Characterization of Adrenocortical Carcinoma:

clustering_results <- data.frame('sample' = sample_IDs , 'cluster' = clusters)
rownames(clustering_results)<- NULL

library(readr)
 CLUSTERS_TCGA_ACC <- read_delim("Results_use_case_2/CLUSTERS_TCGA_ACC.csv", ";", escape_double = FALSE, trim_ws = TRUE)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   sample = col_character(),
##   Histology = col_character(),
##   mRNA_K4 = col_character(),
##   `Methylation Level` = col_character(),
##   miRNA = col_character(),
##   `Cluster of Clusters` = col_character()
## )
comparison<-merge(x = clustering_results, y =  CLUSTERS_TCGA_ACC,  by = 'sample', all.x = TRUE)

library('xlsx')
write.xlsx(comparison, file='Results_use_case_2/Comparison_clusterings.xlsx', col.names = TRUE, row.names = FALSE, append = FALSE)


for(i in 3:dim(comparison)[2]){
  comparison_table <- table(comparison$cluster, comparison[,i])
  subt<-paste('comparison with patient clusters based on', colnames(comparison)[i], collapse=' ')

  mosaicplot(comparison_table, color = TRUE, main = "Mosaic plot of two alternative clustering results",
           sub = subt,
           xlab = "Hierachical Clustering results on mRNA and miRNA genes:",
           ylab = colnames(comparison)[i],
           las = 1)
}