#' @title Hierarchical cluster analysis #' @description Hierarchical cluster analysis using several methods such as #' ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), #' "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). #' @param tabDF is a dataframe or numeric matrix, each row represents a gene, #' each column represents a sample come from TCGAPrepare. #' @param method is method to be used for generic cluster such as 'hclust' #' or 'consensus' #' @param methodHC is method to be used for Hierarchical cluster. #' @import stats #' @export #' @return object of class hclust if method selected is 'hclust'. #' If method selected is 'Consensus' returns a list of length maxK #' (maximum cluster number to evaluate.). Each element is a list containing #' consensusMatrix (numerical matrix), consensusTree (hclust), consensusClass #' (consensus class assignments). ConsensusClusterPlus also produces images. TCGAanalyze_Clustering <- function( tabDF, method, methodHC = "ward.D2" ) { if (!requireNamespace("ConsensusClusterPlus", quietly = TRUE)) { stop("ConsensusClusterPlus is needed. Please install it.", call. = FALSE) } if (method == "hclust") { ans <- hclust(ddist <- dist(tabDF), method = methodHC) } if (method == "consensus") { sHc <- hclust(ddist <- dist(tabDF), method = methodHC) ans <- ConsensusClusterPlus::ConsensusClusterPlus( ddist, maxK = 7, pItem = 0.9, reps = 1000, title = "mc_consensus_k7_1000", clusterAlg = "hc", innerLinkage = "ward.D2", finalLinkage = "complete", plot = 'pdf', writeTable = TRUE ) } return(ans) } #' @title Array Array Intensity correlation (AAIC) and correlation boxplot to define outlier #' @description TCGAanalyze_Preprocessing perform Array Array Intensity correlation (AAIC). #' It defines a square symmetric matrix of spearman correlation among samples. #' According this matrix and boxplot of correlation samples by samples it is possible #' to find samples with low correlation that can be identified as possible outliers. #' @param object gene expression of class RangedSummarizedExperiment from TCGAprepare #' @param cor.cut is a threshold to filter samples according their spearman correlation in #' samples by samples. default cor.cut is 0 #' @param filename Filename of the image file #' @param width Image width #' @param height Image height #' @param datatype is a string from RangedSummarizedExperiment assay #' @importFrom grDevices dev.list #' @importFrom SummarizedExperiment assays #' @export #' @return Plot with array array intensity correlation and boxplot of correlation samples by samples TCGAanalyze_Preprocessing <- function( object, cor.cut = 0, datatype = names(assays(object))[1], filename = NULL, width = 1000, height = 1000 ) { # This is a work around for raw_counts and raw_count if(missing(object)) stop("Please set object argument") if(!is(object,"RangedSummarizedExperiment")) stop("Input object should be a RangedSummarizedExperiment") if (grepl("raw_counts", datatype) & any(grepl("raw_counts", names(assays(object))))) datatype <- names(assays(object))[grepl("raw_counts", names(assays(object)))] if (!any(grepl(datatype, names(assays(object))))) stop( paste0( datatype, " not found in the assay list: ", paste(names(assays(object)), collapse = ", "), "\n Please set the correct datatype argument." ) ) if (!(is.null(dev.list()["RStudioGD"]))) { dev.off() } if (is.null(filename)) filename <- "PreprocessingOutput.png" png(filename, width = width, height = height) par(oma = c(10, 10, 10, 10)) ArrayIndex <- as.character(1:length(colData(object)$barcode)) pmat_new <- matrix(0, length(ArrayIndex), 4) colnames(pmat_new) <- c("Disease", "platform", "SampleID", "Study") rownames(pmat_new) <- as.character(colData(object)$barcode) pmat_new <- as.data.frame(pmat_new) pmat_new$Disease <- as.character(colData(object)$definition) pmat_new$platform <- "platform" pmat_new$SampleID <- as.character(colData(object)$barcode) pmat_new$Study <- "study" tabGroupCol <- cbind(pmat_new, Color = matrix(0, nrow(pmat_new), 1)) for (i in seq_along(unique(tabGroupCol$Disease))) { tabGroupCol[which(tabGroupCol$Disease == tabGroupCol$Disease[i]), "Color"] <- rainbow(length(unique(tabGroupCol$Disease)))[i] } pmat <- pmat_new phenodepth <- min(ncol(pmat), 3) order <- switch( phenodepth + 1, ArrayIndex, order(pmat[, 1]), order(pmat[, 1], pmat[, 2]), order(pmat[, 1], pmat[, 2], pmat[, 3]) ) arraypos <- (1:length(ArrayIndex)) * (1 / (length(ArrayIndex) - 1)) - (1 / (length(ArrayIndex) - 1)) arraypos2 = seq(1:length(ArrayIndex) - 1) for (i in 2:length(ArrayIndex)) { arraypos2[i - 1] <- (arraypos[i] + arraypos[i - 1]) / 2 } layout(matrix(c(1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 3, 3, 3, 4), 4, 4, byrow = TRUE)) c <- cor(assay(object, datatype)[, order], method = "spearman") image(c, xaxt = "n", yaxt = "n", #xlab = "Array Samples", #ylab = "Array Samples", main = "Array-Array Intensity Correlation after RMA" ) for (i in 1:length(names(table(tabGroupCol$Color)))) { currentCol <- names(table(tabGroupCol$Color))[i] pos.col <- arraypos[which(tabGroupCol$Color == currentCol)] lab.col <- colnames(c)[which(tabGroupCol$Color == currentCol)] #axis(1, labels = lab.col , at = pos.col, col = currentCol,lwd = 6,las = 2) axis( 2, labels = lab.col , at = pos.col, col = currentCol, lwd = 6, las = 2 ) } m <- matrix(pretty(c, 10), nrow = 1, ncol = length(pretty(c, 10))) image( m, xaxt = "n", yaxt = "n", ylab = "Correlation Coefficient" ) axis(2, labels = as.list(pretty(c, 10)), at = seq(0, 1, by = (1 / ( length(pretty(c, 10)) - 1 )))) abline( h = seq( (1 / ( length(pretty(c, 10)) - 1)) / 2, 1 - (1 / ( length(pretty(c, 10)) - 1)), by = (1 / ( length(pretty(c, 10)) - 1)) ) ) box() boxplot( c, outline = FALSE, las = 2, lwd = 6, # names = NULL, col = tabGroupCol$Color, main = "Boxplot of correlation samples by samples after normalization" ) dev.off() samplesCor <- rowMeans(c) message("Number of outliers: ", sum(samplesCor < cor.cut)) objectWO <- assay(object, datatype)[, names(samplesCor)[samplesCor > cor.cut]] return(objectWO) } #' @title survival analysis (SA) univariate with Kaplan-Meier (KM) method. #' @description TCGAanalyze_SurvivalKM perform an univariate Kaplan-Meier (KM) survival analysis (SA). #' It performed Kaplan-Meier survival univariate using complete follow up with all days #' taking one gene a time from Genelist of gene symbols. #' For each gene according its level of mean expression in cancer samples, #' defining two thresholds for quantile #' expression of that gene in all samples (default ThreshTop=0.67,ThreshDown=0.33) it is possible #' to define a threshold of intensity of gene expression to divide the samples in 3 groups #' (High, intermediate, low). #' TCGAanalyze_SurvivalKM performs SA between High and low groups using following functions #' from survival package #' \enumerate{ #' \item survival::Surv #' \item survival::survdiff #' \item survival::survfit #' } #' @param clinical_patient is a data.frame using function 'clinic' with information #' related to barcode / samples such as bcr_patient_barcode, days_to_death , #' days_to_last_follow_up , vital_status, etc #' @param dataGE is a matrix of Gene expression (genes in rows, samples in cols) from TCGAprepare #' @param Genelist is a list of gene symbols where perform survival KM. #' @param Survresult is a parameter (default = FALSE) if is TRUE will show KM plot and results. #' @param ThreshTop is a quantile threshold to identify samples with high expression of a gene #' @param ThreshDown is a quantile threshold to identify samples with low expression of a gene #' @param p.cut p.values threshold. Default: 0.05 #' @param group1 a string containing the barcode list of the samples in in control group #' @param group2 a string containing the barcode list of the samples in in disease group #' @export #' @return table with survival genes pvalues from KM. #' @examples #' # Selecting only 20 genes for example #' dataBRCAcomplete <- log2(dataBRCA[1:20,] + 1) #' #' # clinical_patient_Cancer <- GDCquery_clinic("TCGA-BRCA","clinical") #' clinical_patient_Cancer <- data.frame( #' bcr_patient_barcode = substr(colnames(dataBRCAcomplete),1,12), #' vital_status = c(rep("alive",3),"dead",rep("alive",2),rep(c("dead","alive"),2)), #' days_to_death = c(NA,NA,NA,172,NA,NA,3472,NA,786,NA), #' days_to_last_follow_up = c(3011,965,718,NA,1914,423,NA,5,656,1417) #' ) #' #' group1 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("NT")) #' group2 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("TP")) #' #' tabSurvKM <- TCGAanalyze_SurvivalKM(clinical_patient_Cancer, #' dataBRCAcomplete, #' Genelist = rownames(dataBRCAcomplete), #' Survresult = FALSE, #' p.cut = 0.4, #' ThreshTop = 0.67, #' ThreshDown = 0.33, #' group1 = group1, # Control group #' group2 = group2) # Disease group #' #' # If the groups are not specified group1 == group2 and all samples are used #' \dontrun{ #' tabSurvKM <- TCGAanalyze_SurvivalKM( #' clinical_patient_Cancer, #' dataBRCAcomplete, #' Genelist = rownames(dataBRCAcomplete), #' Survresult = TRUE, #' p.cut = 0.2, #' ThreshTop = 0.67, #' ThreshDown = 0.33 #' ) #' } TCGAanalyze_SurvivalKM <- function( clinical_patient, dataGE, Genelist, Survresult = FALSE, ThreshTop = 0.67, ThreshDown = 0.33, p.cut = 0.05, group1, group2 ) { check_package("survival") # Check which genes we really have in the matrix Genelist <- intersect(rownames(dataGE), Genelist) # Split gene expression matrix btw the groups dataCancer <- dataGE[Genelist, group2, drop = FALSE] dataNormal <- dataGE[Genelist, group1, drop = FALSE] colnames(dataCancer) <- substr(colnames(dataCancer), 1, 12) cfu <- clinical_patient[clinical_patient[, "bcr_patient_barcode"] %in% substr(colnames(dataCancer), 1, 12), ] if ("days_to_last_followup" %in% colnames(cfu)) colnames(cfu)[grep("days_to_last_followup", colnames(cfu))] <- "days_to_last_follow_up" cfu <- as.data.frame(subset( cfu, select = c( "bcr_patient_barcode", "days_to_death", "days_to_last_follow_up", "vital_status" ) )) # Set alive death to inf if (length(grep("alive", cfu$vital_status, ignore.case = TRUE)) > 0) cfu[grep("alive", cfu$vital_status, ignore.case = TRUE), "days_to_death"] <- "-Inf" # Set dead follow up to inf if (length(grep("dead", cfu$vital_status, ignore.case = TRUE)) > 0) cfu[grep("dead", cfu$vital_status, ignore.case = TRUE), "days_to_last_follow_up"] <- "-Inf" cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ] cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ] followUpLevel <- FALSE cfu$days_to_death <- as.numeric(as.character(cfu$days_to_death)) cfu$days_to_last_follow_up <- as.numeric(as.character(cfu$days_to_last_follow_up)) rownames(cfu) <- cfu[, "bcr_patient_barcode"] #mod1 cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ] cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ] cfu_complete <- cfu ngenes <- nrow(as.matrix(rownames(dataNormal))) # Evaluate each gene tabSurv_Matrix <- plyr::adply(.data =1:length(rownames(dataNormal)),.margins = 1,.fun = function(i){ mRNAselected <- as.matrix(rownames(dataNormal))[i] mRNAselected_values <- dataCancer[rownames(dataCancer) == mRNAselected, ] mRNAselected_values_normal <- dataNormal[rownames(dataNormal) == mRNAselected, ] if (all(mRNAselected_values == 0)) return(NULL) # All genes are 0 tabSurv_Matrix <- data.frame("mRNA" = mRNAselected) # Get Thresh values for cancer expression mRNAselected_values_ordered <- sort(mRNAselected_values, decreasing = TRUE) mRNAselected_values_ordered_top <- as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshTop)[1]) mRNAselected_values_ordered_down <- as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshDown)[1]) mRNAselected_values_newvector <- mRNAselected_values if (!is.na(mRNAselected_values_ordered_top)) { # How many samples do we have numberOfSamples <- length(mRNAselected_values_ordered) # High group (above ThreshTop) lastelementTOP <- max(which( mRNAselected_values_ordered > mRNAselected_values_ordered_top )) # Low group (below ThreshDown) firstelementDOWN <- min( which( mRNAselected_values_ordered <= mRNAselected_values_ordered_down ) ) samples_top_mRNA_selected <- names(mRNAselected_values_ordered[1:lastelementTOP]) samples_down_mRNA_selected <- names(mRNAselected_values_ordered[firstelementDOWN:numberOfSamples]) # Which samples are in the intermediate group (above ThreshLow and below ThreshTop) samples_UNCHANGED_mRNA_selected <- names(mRNAselected_values_newvector[ which((mRNAselected_values_newvector) > mRNAselected_values_ordered_down & mRNAselected_values_newvector < mRNAselected_values_ordered_top )]) cfu_onlyTOP <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_top_mRNA_selected, ] cfu_onlyDOWN <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_down_mRNA_selected, ] cfu_onlyUNCHANGED <- cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_UNCHANGED_mRNA_selected, ] cfu_ordered <- NULL cfu_ordered <- rbind(cfu_onlyTOP, cfu_onlyDOWN) cfu <- cfu_ordered ttime <- as.numeric(cfu[, "days_to_death"]) sum(status <- ttime > 0) # morti deads_complete <- sum(status <- ttime > 0) ttime_only_top <- cfu_onlyTOP[, "days_to_death"] deads_top <- sum(ttime_only_top > 0) if (dim(cfu_onlyDOWN)[1] >= 1) { ttime_only_down <- cfu_onlyDOWN[, "days_to_death"] deads_down <- sum(ttime_only_down > 0) } else { deads_down <- 0 } tabSurv_Matrix[1, "Cancer Deaths"] <- deads_complete tabSurv_Matrix[1, "Cancer Deaths with Top"] <- deads_top tabSurv_Matrix[1, "Cancer Deaths with Down"] <- deads_down tabSurv_Matrix[1, "Mean Normal"] <- mean(as.numeric(mRNAselected_values_normal)) dataCancer_onlyTop_sample <- dataCancer[, samples_top_mRNA_selected, drop = FALSE] dataCancer_onlyTop_sample_mRNASelected <- dataCancer_onlyTop_sample[rownames(dataCancer_onlyTop_sample) == mRNAselected, ] dataCancer_onlyDown_sample <- dataCancer[, samples_down_mRNA_selected, drop = FALSE] dataCancer_onlyDown_sample_mRNASelected <- dataCancer_onlyDown_sample[rownames(dataCancer_onlyDown_sample) == mRNAselected, ] tabSurv_Matrix[1, "Mean Tumor Top"] <- mean(as.numeric(dataCancer_onlyTop_sample_mRNASelected)) tabSurv_Matrix[1, "Mean Tumor Down"] <- mean(as.numeric(dataCancer_onlyDown_sample_mRNASelected)) ttime[!status] <- as.numeric(cfu[!status, "days_to_last_follow_up"]) ttime[which(ttime == -Inf)] <- 0 ttime <- survival::Surv(ttime, status) rownames(ttime) <- rownames(cfu) legendHigh <- paste(mRNAselected, "High") legendLow <- paste(mRNAselected, "Low") tabSurv_pvalue <- tryCatch({ tabSurv <- survival::survdiff(ttime ~ c(rep( "top", nrow(cfu_onlyTOP) ), rep( "down", nrow(cfu_onlyDOWN) ))) tabSurv_chis <- unlist(tabSurv)$chisq tabSurv_pvalue <- as.numeric(1 - pchisq(abs(tabSurv$chisq), df = 1)) }, error = function(e) { return(Inf) }) tabSurv_Matrix[1, "pvalue"] <- tabSurv_pvalue if (Survresult == TRUE) { titlePlot <- paste("Kaplan-Meier Survival analysis, pvalue = ",tabSurv_pvalue) plot( survival::survfit(ttime ~ c( rep("low", nrow(cfu_onlyTOP)), rep("high", nrow(cfu_onlyDOWN)) )), col = c("green", "red"), main = titlePlot, xlab = "Days", ylab = "Survival" ) legend( 100, 1, legend = c(legendLow, legendHigh), col = c("green", "red"), text.col = c("green", "red"), pch = 15 ) print(tabSurv) } } #end if tabSurv_Matrix },.progress = "time") #end for tabSurv_Matrix[tabSurv_Matrix == "-Inf"] <- 0 tabSurvKM <- tabSurv_Matrix # Filtering by selected pvalue < 0.01 tabSurvKM <- tabSurvKM[tabSurvKM$mRNA != 0, ] tabSurvKM <- tabSurvKM[tabSurvKM$pvalue < p.cut, ] tabSurvKM <- tabSurvKM[!duplicated(tabSurvKM$mRNA), ] rownames(tabSurvKM) <- tabSurvKM$mRNA tabSurvKM <- tabSurvKM[, -1] tabSurvKM <- tabSurvKM[order(tabSurvKM$pvalue, decreasing = FALSE), ] colnames(tabSurvKM) <- gsub("Cancer", "Group2", colnames(tabSurvKM)) colnames(tabSurvKM) <- gsub("Tumor", "Group2", colnames(tabSurvKM)) colnames(tabSurvKM) <- gsub("Normal", "Group1", colnames(tabSurvKM)) return(tabSurvKM) } #' @title Filtering mRNA transcripts and miRNA selecting a threshold. #' @description #' TCGAanalyze_Filtering allows user to filter mRNA transcripts and miRNA, #' samples, higher than the threshold defined quantile mean across all samples. #' @param tabDF is a dataframe or numeric matrix, each row represents a gene, #' each column represents a sample come from TCGAPrepare #' @param method is method of filtering such as 'quantile', 'varFilter', 'filter1', 'filter2' #' @param qnt.cut is threshold selected as mean for filtering #' @param var.func is function used as the per-feature filtering statistic. #' See genefilter documentation #' @param var.cutoff is a numeric value. See genefilter documentation #' @param eta is a parameter for filter1. default eta = 0.05. #' @param foldChange is a parameter for filter2. default foldChange = 1. #' @export #' @return A filtered dataframe or numeric matrix where each row represents a gene, #' each column represents a sample #' @examples #' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo) #' dataNorm <- TCGAanalyze_Normalization(tabDF = dataBRCA, #' geneInfo = geneInfo, #' method = "geneLength") #' dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, method = "quantile", qnt.cut = 0.25) TCGAanalyze_Filtering <- function( tabDF, method, qnt.cut = 0.25, var.func = IQR, var.cutoff = 0.75, eta = 0.05, foldChange = 1 ) { if (method == "quantile") { GeneThresh <- as.numeric(quantile(rowMeans(tabDF,na.rm = TRUE), qnt.cut)) geneFiltered <- names(which(rowMeans(tabDF) > GeneThresh)) tabDF_Filt <- tabDF[geneFiltered,] } if (method == "varFilter") { check_package("genefilter") tabDF_Filt <- genefilter::varFilter( tabDF, var.func = IQR, var.cutoff = 0.75, filterByQuantile = TRUE ) } if (method == "filter1") { normCounts <- tabDF geData <- t(log(1 + normCounts, 2)) filter <- apply(geData, 2, function(x) sum(quantile(x, probs = c(1 - eta, eta)) * c(1, -1))) tabDF_Filt <- geData[, which(filter > foldChange)] } if (method == "filter2") { geData <- tabDF filter <- apply(geData, 2, function(x) prod(quantile(x, probs = c(1 - eta, eta)) - 10) < 0) tabDF_Filt <- geData[, which(filter)] } return(tabDF_Filt) } #' @title normalization mRNA transcripts and miRNA using EDASeq package. #' @description #' TCGAanalyze_Normalization allows user to normalize mRNA transcripts and miRNA, #' using EDASeq package. #' #' Normalization for RNA-Seq Numerical and graphical #' summaries of RNA-Seq read data. Within-lane normalization procedures #' to adjust for GC-content effect (or other gene-level effects) on read counts: #' loess robust local regression, global-scaling, and full-quantile normalization #' (Risso et al., 2011). Between-lane normalization procedures to adjust for #' distributional differences between lanes (e.g., sequencing depth): #' global-scaling and full-quantile normalization (Bullard et al., 2010). #' #' For istance returns all mRNA or miRNA with mean across all #' samples, higher than the threshold defined quantile mean across all samples. #' #' TCGAanalyze_Normalization performs normalization using following functions #' from EDASeq #' \enumerate{ #' \item EDASeq::newSeqExpressionSet #' \item EDASeq::withinLaneNormalization #' \item EDASeq::betweenLaneNormalization #' \item EDASeq::counts #' } #' @param tabDF Rnaseq numeric matrix, each row represents a gene, #' each column represents a sample #' @param geneInfo Information matrix of 20531 genes about geneLength and gcContent. #' Two objects are provided: TCGAbiolinks::geneInfoHT,TCGAbiolinks::geneInfo #' @param method is method of normalization such as 'gcContent' or 'geneLength' #' @export #' @return Rnaseq matrix normalized with counts slot holds the count data as a matrix #' of non-negative integer count values, one row for each observational unit (gene or the like), #' and one column for each sample. #' @examples #' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo) TCGAanalyze_Normalization <- function( tabDF, geneInfo, method = "geneLength" ) { check_package("EDASeq") # Check if we have a SE, we need a gene expression matrix if (is(tabDF, "SummarizedExperiment")) tabDF <- assay(tabDF) geneInfo <- geneInfo[!is.na(geneInfo[, 1]), ] geneInfo <- as.data.frame(geneInfo) geneInfo$geneLength <- as.numeric(as.character(geneInfo$geneLength)) geneInfo$gcContent <- as.numeric(as.character(geneInfo$gcContent)) if(any(grepl("\\|",rownames(tabDF)))){ tmp <- as.character(rownames(tabDF)) geneNames <- stringr::str_split(tmp, "\\|",simplify = T) tmp <- which(geneNames[, 1] == "?") geneNames[tmp, 1] <- geneNames[tmp, 2] tmp <- table(geneNames[, 1]) tmp <- which(geneNames[, 1] %in% names(tmp[which(tmp > 1)])) geneNames[tmp, 1] <- paste(geneNames[tmp, 1], geneNames[tmp, 2], sep = ".") tmp <- table(geneNames[, 1]) rownames(tabDF) <- geneNames[, 1] } else if(any(grepl("ENSG",rownames(tabDF)))){ rownames(tabDF) <- gsub("\\.[0-9]*","",rownames(tabDF)) } if (method == "gcContent") { rawCounts <- tabDF commonGenes <- intersect(rownames(geneInfo), rownames(rawCounts)) geneInfo <- geneInfo[commonGenes, ] rawCounts <- rawCounts[commonGenes, ] timeEstimated <- format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2) message( messageEstimation <- paste( "I Need about ", timeEstimated, "seconds for this Complete Normalization Upper Quantile", " [Processing 80k elements /s] " ) ) ffData <- as.data.frame(geneInfo) rawCounts <- floor(rawCounts) message("Step 1 of 4: newSeqExpressionSet ...") tmp <- EDASeq::newSeqExpressionSet(as.matrix(rawCounts), featureData = ffData) #fData(tmp)[, "gcContent"] <- as.numeric(geneInfo[, "gcContent"]) message("Step 2 of 4: withinLaneNormalization ...") tmp <- EDASeq::withinLaneNormalization( tmp, "gcContent", which = "upper", offset = TRUE ) message("Step 3 of 4: betweenLaneNormalization ...") if(any(is.na(EDASeq::normCounts(tmp)))) { tmp <- tmp[rowSums(is.na(EDASeq::normCounts(tmp))) == 0,] } tmp <- EDASeq::betweenLaneNormalization( tmp, which = "upper", offset = TRUE ) normCounts <- log(rawCounts[rownames(tmp),] + .1) + EDASeq::offst(tmp) normCounts <- floor(exp(normCounts) - .1) message("Step 4 of 4: .quantileNormalization ...") tmp <- t(.quantileNormalization(t(normCounts))) tabDF_norm <- floor(tmp) } if (method == "geneLength") { tabDF <- tabDF[!duplicated(rownames(tabDF)), !duplicated(colnames(tabDF))] tabDF <- tabDF[rownames(tabDF) %in% rownames(geneInfo), ] #tabDF <- tabDF[rowMeans(tabDF) > 1,] tabDF <- tabDF[which(rowSums(tabDF == 0) < ncol(tabDF)),] tabDF <- as.matrix(tabDF) geneInfo <- geneInfo[rownames(geneInfo) %in% rownames(tabDF),] geneInfo <- geneInfo[!duplicated(rownames(geneInfo)),] toKeep <- which(geneInfo[, "geneLength"] != 0) geneInfo <- geneInfo[toKeep,] tabDF <- tabDF[toKeep,] geneInfo <- as.data.frame(geneInfo) tabDF <- round(tabDF) commonGenes <- intersect(rownames(tabDF), rownames(geneInfo)) tabDF <- tabDF[commonGenes, ] geneInfo <- geneInfo[commonGenes, ] timeEstimated <- format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2) message( messageEstimation <- paste( "I Need about ", timeEstimated, "seconds for this Complete Normalization Upper Quantile", " [Processing 80k elements /s] " ) ) message("Step 1 of 4: newSeqExpressionSet ...") tabDF_norm <- EDASeq::newSeqExpressionSet( tabDF, featureData = geneInfo ) message("Step 2 of 4: withinLaneNormalization ...") tabDF_norm <- EDASeq::withinLaneNormalization( tabDF_norm, "geneLength", which = "upper", offset = FALSE ) message("Step 3 of 4: betweenLaneNormalization ...") if(any(is.na(EDASeq::normCounts(tabDF_norm)))) { tabDF_norm <- tabDF_norm[rowSums(is.na(EDASeq::normCounts(tabDF_norm))) == 0,] } tabDF_norm <- EDASeq::betweenLaneNormalization( tabDF_norm, which = "upper", offset = FALSE ) message("Step 4 of 4: exprs ...") tabDF_norm <- EDASeq::counts(tabDF_norm) } # In case NA's were produced to all rows if(any(rowSums(is.na(tabDF_norm)) == ncol(tabDF_norm))){ tabDF_norm <- tabDF_norm[rowSums(is.na(tabDF_norm)) != ncol(tabDF_norm),] } return(tabDF_norm) } #' @title Differential expression analysis (DEA) using edgeR or limma package. #' @description #' TCGAanalyze_DEA allows user to perform Differentially expression analysis (DEA), #' using edgeR package or limma to identify differentially expressed genes (DEGs). #' It is possible to do a two-class analysis. #' #' TCGAanalyze_DEA performs DEA using following functions from edgeR: #' \enumerate{ #' \item edgeR::DGEList converts the count matrix into an edgeR object. #' \item edgeR::estimateCommonDisp each gene gets assigned the same dispersion estimate. #' \item edgeR::exactTest performs pair-wise tests for differential expression between two groups. #' \item edgeR::topTags takes the output from exactTest(), adjusts the raw p-values using the #' False Discovery Rate (FDR) correction, and returns the top differentially expressed genes. #' } #' TCGAanalyze_DEA performs DEA using following functions from limma: #' \enumerate{ #' \item limma::makeContrasts construct matrix of custom contrasts. #' \item limma::lmFit Fit linear model for each gene given a series of arrays. #' \item limma::contrasts.fit Given a linear model fit to microarray data, compute estimated coefficients and standard errors for a given set of contrasts. #' \item limma::eBayes Given a microarray linear model fit, compute moderated t-statistics, moderated F-statistic, and log-odds of differential expression by empirical Bayes moderation of the standard errors towards a common value. #' \item limma::toptable Extract a table of the top-ranked genes from a linear model fit. #' } #' @param mat1 numeric matrix, each row represents a gene, #' each column represents a sample with Cond1type #' @param mat2 numeric matrix, each row represents a gene, #' each column represents a sample with Cond2type #' @param metadata Add metadata #' @param Cond1type a string containing the class label of the samples in mat1 #' (e.g., control group) #' @param Cond2type a string containing the class label of the samples in mat2 #' (e.g., case group) #' @param pipeline a string to specify which package to use ("limma" or "edgeR") #' @param method is 'glmLRT' (1) or 'exactTest' (2) used for edgeR #' (1) Fit a negative binomial generalized log-linear model to #' the read counts for each gene #' (2) Compute genewise exact tests for differences in the means between #' two groups of negative-binomially distributed counts. #' @param fdr.cut is a threshold to filter DEGs according their p-value corrected #' @param logFC.cut is a threshold to filter DEGs according their logFC #' @param batch.factors a vector containing strings to specify options for batch correction. Options are "Plate", "TSS", "Year", "Portion", "Center", and "Patients" #' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data #' @param paired boolean to account for paired or non-paired samples. Set to TRUE for paired case #' @param log.trans boolean to perform log cpm transformation. Set to TRUE for log transformation #' @param trend boolean to perform limma-trend pipeline. Set to TRUE to go through limma-trend #' @param MAT matrix containing expression set as all samples in columns and genes as rows. Do not provide if mat1 and mat2 are used #' @param contrast.formula string input to determine coefficients and to design contrasts in a customized way #' @param Condtypes vector of grouping for samples in MAT #' @param voom boolean to perform voom transformation for limma-voom pipeline. Set to TRUE for voom transformation #' @export #' @examples #' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo) #' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut = 0.25) #' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT")) #' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP")) #' dataDEGs <- TCGAanalyze_DEA( #' mat1 = dataFilt[,samplesNT], #' mat2 = dataFilt[,samplesTP], #' Cond1type = "Normal", #' Cond2type = "Tumor" #' ) #' #' @return table with DEGs containing for each gene logFC, logCPM, pValue,and FDR, also for each contrast TCGAanalyze_DEA <- function ( mat1, mat2, metadata = TRUE, Cond1type, Cond2type, pipeline = "edgeR", method = "exactTest", fdr.cut = 1, logFC.cut = 0, batch.factors = NULL, ClinicalDF = data.frame(), paired = FALSE, log.trans = FALSE, voom = FALSE, trend = FALSE, MAT = data.frame(), contrast.formula = "", Condtypes = c() ){ if(pipeline == "limma") check_package("limma") if(pipeline == "edgeR") check_package("edgeR") table.code <- c( "TP", "TR", "TB", "TRBM", "TAP", "TM", "TAM", "THOC", "TBM", "NB", "NT", "NBC", "NEBV", "NBM", "CELLC", "TRB", "CELL", "XP", "XCL" ) names(table.code) <- c( "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "20", "40", "50", "60", "61" ) if (nrow(MAT) == 0) { TOC <- cbind(mat1, mat2) Cond1num <- ncol(mat1) Cond2num <- ncol(mat2) } else { TOC <- MAT } if (metadata == TRUE & all(grepl("TCGA",colnames(TOC)))){ my_IDs <- get_IDs(TOC) Plate <- factor(my_IDs$plate) Condition <- factor(my_IDs$condition) TSS <- factor(my_IDs$tss) Portion <- factor(my_IDs$portion) Center <- factor(my_IDs$center) Patients <- factor(my_IDs$patient) } # this makes non-sense for non-TCGA data if (paired == TRUE) { matched.query <- TCGAquery_MatchedCoupledSampleTypes( my_IDs$barcode, table.code[unique(my_IDs$sample)] ) my_IDs <- subset(my_IDs, barcode == matched.query) TOC <- TOC[, (names(TOC) %in% matched.query)] } if (nrow(ClinicalDF) > 0) { names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <- "patient" ClinicalDF$age_at_diag_year <- floor(clinical$age_at_diagnosis / 365) ClinicalDF$diag_year <- ClinicalDF$age_at_diag_year + clinical$year_of_birth diag_yearDF <- ClinicalDF[, c("patient", "diag_year")] my_IDs <- merge(my_IDs, ClinicalDF, by = "patient") Year <- as.factor(my_IDs$diag_year) } options <- c("Plate", "TSS", "Year", "Portion", "Center", "Patients") if (length(batch.factors) == 0) { message("Batch correction skipped since no factors provided") } else { for (o in batch.factors) { if (o %in% options == FALSE) stop(paste0(o, " is not a valid batch correction factor")) if (o == "Year" & nrow(ClinicalDF) == 0) stop( "batch correction using diagnosis year needs clinical info. Provide Clinical Data in arguments" ) } } additiveformula <- paste(batch.factors, collapse = "+") message("----------------------- DEA -------------------------------") if (nrow(MAT) == 0) { message("o ",Cond1num," samples in Cond1type ",Cond1type) message("o ",Cond2num," samples in Cond2type ",Cond2type) message("o ", nrow(TOC), " features as miRNA or genes ") } else { message("o ", nrow(TOC), " features as miRNA or genes ") } message("This may take some minutes...") colnames(TOC) <- paste0("s", 1:ncol(TOC)) if (length(Condtypes) > 0) { tumorType <- factor(x = Condtypes, levels = unique(Condtypes)) } else { tumorType <- factor( x = rep(c(Cond1type, Cond2type), c(Cond1num, Cond2num)), levels = c(Cond1type, Cond2type) ) } if (length(batch.factors) == 0 & length(Condtypes) > 0) { if (pipeline == "edgeR") design <- model.matrix( ~ tumorType) else design <- model.matrix( ~ 0 + tumorType) } else if (length(batch.factors) == 0 & length(Condtypes) == 0) { if (pipeline == "edgeR") design <- model.matrix( ~ tumorType) else design <- model.matrix( ~ 0 + tumorType) } else if (length(batch.factors) > 0 & length(Condtypes) == 0) { if (pipeline == "edgeR") formula <- paste0("~tumorType+", additiveformula) else formula <- paste0("~0+tumorType+", additiveformula) design <- model.matrix(eval(parse(text = formula))) } else if (length(batch.factors) > 0 & length(Condtypes) > 0) { if (pipeline == "edgeR") { formula <- paste0("~tumorType+", additiveformula) if (length(Condtypes) > 2) formula <- paste0("~0+tumorType+", additiveformula) } else { formula <- paste0("~0+tumorType+", additiveformula) } design <- model.matrix(eval(parse(text = formula))) } if (pipeline == "edgeR") { if (method == "exactTest") { DGE <- edgeR::DGEList(TOC, group = rep(c(Cond1type, Cond2type), c(Cond1num, Cond2num))) disp <- edgeR::estimateCommonDisp(DGE) tested <- edgeR::exactTest(disp, pair = c(Cond1type, Cond2type)) logFC_table <- tested$table tableDEA <- edgeR::topTags(tested, n = nrow(tested$table))$table tableDEA <- tableDEA[tableDEA$FDR <= fdr.cut,] tableDEA <- tableDEA[abs(tableDEA$logFC) >= logFC.cut,] } else if (method == "glmLRT") { if (length(unique(tumorType)) == 2) { aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType) aDGEList <- edgeR::estimateGLMCommonDisp(aDGEList, design) aDGEList <- edgeR::estimateGLMTagwiseDisp(aDGEList, design) aGlmFit <- edgeR::glmFit( aDGEList, design, dispersion = aDGEList$tagwise.dispersion, prior.count.total = 0 ) aGlmLRT <- edgeR::glmLRT(aGlmFit, coef = 2) tableDEA <- cbind(aGlmLRT$table, FDR = p.adjust(aGlmLRT$table$PValue, "fdr")) tableDEA <- tableDEA[tableDEA$FDR < fdr.cut, ] tableDEA <- tableDEA[abs(tableDEA$logFC) > logFC.cut, ] if (all(grepl("ENSG", rownames(tableDEA)))){ tableDEA <- cbind(tableDEA, map.ensg(genes = rownames(tableDEA))[, c("gene_name","gene_type")]) } } else if (length(unique(tumorType)) > 2) { aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType) colnames(design)[1:length(levels(tumorType))] <- levels(tumorType) prestr = "makeContrasts(" poststr = ",levels=design)" commandstr = paste(prestr, contrast.formula, poststr, sep = "") commandstr = paste0("limma::", commandstr) cont.matrix <- eval(parse(text = commandstr)) aDGEList <- edgeR::estimateGLMCommonDisp(aDGEList, design) aDGEList <- edgeR::estimateGLMTagwiseDisp(aDGEList, design) aGlmFit <- edgeR::glmFit( aDGEList, design, dispersion = aDGEList$tagwise.dispersion, prior.count.total = 0 ) tableDEA <- list() for (mycoef in colnames(cont.matrix)) { message(paste0("DEA for", " :", mycoef)) aGlmLRT <- edgeR::glmLRT(aGlmFit, contrast = cont.matrix[, mycoef]) tt <- aGlmLRT$table tt <- cbind(tt, FDR = p.adjust(aGlmLRT$table$PValue,"fdr")) tt <- tt[(tt$FDR < fdr.cut & abs(as.numeric(tt$logFC)) > logFC.cut),] tableDEA[[as.character(mycoef)]] <- tt if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]])))) tableDEA[[as.character(mycoef)]] <- cbind(tableDEA[[as.character(mycoef)]], map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[,2:3]) } } } else { stop( paste0( method, " is not a valid DEA method option. Choose 'exactTest' or 'glmLRT' " ) ) } } else if (pipeline == "limma") { if (log.trans == TRUE){ logCPM <- edgeR::cpm(TOC, log = TRUE, prior.count = 3) } else { logCPM <- TOC } if (voom == TRUE) { message("Voom Transformation...") logCPM <- limma::voom(logCPM, design) } if (length(unique(tumorType)) == 2) { colnames(design)[1:2] <- c(Cond1type, Cond2type) contr <- paste0(Cond2type, "-", Cond1type) cont.matrix <- limma::makeContrasts(contrasts = contr, levels = design) fit <- limma::lmFit(logCPM, design) fit <- limma::contrasts.fit(fit, cont.matrix) if (trend == TRUE) { fit <- limma::eBayes(fit, trend = TRUE) } else { fit <- limma::eBayes(fit, trend = FALSE) } tableDEA <- limma::topTable( fit, coef = 1, adjust.method = "fdr", number = nrow(TOC) ) limma::volcanoplot(fit, highlight = 10) index <- which(tableDEA[, 4] < fdr.cut) tableDEA <- tableDEA[index,] neg_logFC.cut <- -1 * logFC.cut index <- which(abs(as.numeric(tableDEA$logFC)) > logFC.cut) tableDEA <- tableDEA[index,] } else if (length(unique(tumorType)) > 2) { DGE <- edgeR::DGEList(TOC, group = tumorType) colnames(design)[1:length(levels(tumorType))] <- levels(tumorType) prestr = "makeContrasts(" poststr = ",levels=colnames(design))" commandstr = paste(prestr, contrast.formula, poststr, sep = "") commandstr = paste0("limma::", commandstr) cont.matrix <- eval(parse(text = commandstr)) fit <- limma::lmFit(logCPM, design) fit <- limma::contrasts.fit(fit, cont.matrix) if (trend == TRUE){ fit <- limma::eBayes(fit, trend = TRUE) } else { fit <- limma::eBayes(fit, trend = FALSE) } tableDEA <- list() for (mycoef in colnames(cont.matrix)) { tableDEA[[as.character(mycoef)]] <- limma::topTable( fit, coef = mycoef, adjust.method = "fdr", number = nrow(MAT) ) message(paste0("DEA for", " :", mycoef)) tempDEA <- tableDEA[[as.character(mycoef)]] index.up <- which(tempDEA$adj.P.Val < fdr.cut & abs(as.numeric(tempDEA$logFC)) > logFC.cut) tableDEA[[as.character(mycoef)]] <- tempDEA[index.up,] if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]])))) tableDEA[[as.character(mycoef)]] <- cbind(tableDEA[[as.character(mycoef)]], map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[, 2:3]) } } } else { stop(paste0( pipeline, " is not a valid pipeline option. Choose 'edgeR' or 'limma'" )) } message("----------------------- END DEA -------------------------------") return(tableDEA) } #' @title Batch correction using ComBat and Voom transformation using limma package. #' @description #' TCGAbatch_correction allows user to perform a Voom correction on gene expression data and have it ready for DEA. #' One can also use ComBat for batch correction for exploratory analysis. If batch.factor or adjustment argument is "Year" #' please provide clinical data. If no batch factor is provided, the data will be voom corrected only #' #' TCGAanalyze_DEA performs DEA using following functions from sva and limma: #' \enumerate{ #' \item limma::voom Transform RNA-Seq Data Ready for Linear Modelling. #' \item sva::ComBat Adjust for batch effects using an empirical Bayes framework. #' } #' @param tabDF numeric matrix, each row represents a gene, #' each column represents a sample #' @param batch.factor a string containing the batch factor to use for correction. #' Options are "Plate", "TSS", "Year", "Portion", "Center" #' @param adjustment vector containing strings for factors to adjust for using ComBat. #' Options are "Plate", "TSS", "Year", "Portion", "Center" #' @param UnpublishedData if TRUE perform a batch correction after adding new data #' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data #' @param AnnotationDF a dataframe with column Batch indicating different batches of the samples in the tabDF #' @export #' @return data frame with ComBat batch correction applied TCGAbatch_Correction <- function ( tabDF, batch.factor = NULL, adjustment = NULL, ClinicalDF = data.frame(), UnpublishedData = FALSE, AnnotationDF = data.frame() ){ check_package("sva") # code for non-TCGA samples if (UnpublishedData == TRUE) { if(!"Batch" %in% colnames(AnnotationDF)) { stop("AnnotationDF should have a Batch column") } else { batch.factor <- as.factor(AnnotationDF$Batch) } if(!"Condition" %in% colnames(AnnotationDF)) { mod <- model.matrix(~as.factor(Condition),data = AnnotationDF) } else { mod <- NULL } batch_corr <- sva::ComBat( dat = tabDF, batch = batch.factor, mod = mod, par.prior = TRUE, prior.plots = TRUE ) } if (UnpublishedData == FALSE) { if (length(batch.factor) == 0 & length(adjustment) == 0) message("batch correction will be skipped") else if (batch.factor %in% adjustment) { stop(paste0("Cannot adjust and correct for the same factor|")) } my_IDs <- get_IDs(tabDF) if (length(batch.factor) > 0 || length(adjustment) > 0) if ((nrow(ClinicalDF) > 0 & batch.factor == "Year") || ("Year" %in% adjustment == TRUE & nrow(ClinicalDF) > 0)) { names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <- "patient" ClinicalDF$age_at_diag_year <- floor(ClinicalDF$age_at_diagnosis / 365) ClinicalDF$diag_year <- ClinicalDF$age_at_diag_year + ClinicalDF$year_of_birth diag_yearDF <- ClinicalDF[, c("patient", "diag_year")] Year <- merge(my_IDs, diag_yearDF, by = "patient") Year <- Year$diag_year Year <- as.factor(Year) } else if (nrow(ClinicalDF) == 0 & batch.factor == "Year") { stop("Cannot extract Year data. Clinical data was not provided") } Plate <- as.factor(my_IDs$plate) Condition <- as.factor(my_IDs$condition) TSS <- as.factor(my_IDs$tss) Portion <- as.factor(my_IDs$portion) Sequencing.Center <- as.factor(my_IDs$center) design.mod.combat <- model.matrix( ~ Condition) options <- c("Plate", "TSS", "Year", "Portion", "Sequencing Center") if (length(batch.factor) > 1) stop("Combat can only correct for one batch variable. Provide one batch factor") if (batch.factor %in% options == FALSE) stop(paste0(o, " is not a valid batch correction factor")) for (o in adjustment) { if (o %in% options == FALSE) stop(paste0(o, " is not a valid adjustment factor")) } adjustment.data <- c() for (a in adjustment) { if (a == "Sequencing Center") a <- Sequencing.Center adjustment.data <- cbind(eval(parse(text = a)), adjustment.data) } if (batch.factor == "Sequencing Center") batch.factor <- Sequencing.Center batchCombat <- eval(parse(text = batch.factor)) if (length(adjustment) > 0) { adjustment.formula <- paste(adjustment, collapse = "+") adjustment.formula <- paste0("+", adjustment.formula) adjustment.formula <- paste0("~Condition", adjustment.formula) print(adjustment.formula) model <- data.frame(batchCombat, row.names = colnames(tabDF)) design.mod.combat <- model.matrix( eval(parse(text = adjustment.formula)), data = model ) } print(unique(batchCombat)) batch_corr <- sva::ComBat( dat = tabDF, batch = batchCombat, mod = design.mod.combat, par.prior = TRUE, prior.plots = TRUE ) } return(batch_corr) } ##Function to take raw counts by removing rows filtered after norm and filter process### #' @title Use raw count from the DataPrep object which genes are removed by normalization and filtering steps. #' @description function to keep raw counts after filtering and/or normalizing. #' @param DataPrep DataPrep object returned by TCGAanalyze_Preprocessing() #' @param DataFilt Filtered data frame containing samples in columns and genes in rows after normalization and/or filtering steps #' @examples #' \dontrun{ #' dataPrep_raw <- UseRaw_afterFilter(dataPrep, dataFilt) #' } #' @export #' @return Filtered return object similar to DataPrep with genes removed after normalization and filtering process. UseRaw_afterFilter <- function(DataPrep, DataFilt) { rownames(DataPrep) <- lapply(rownames(DataPrep), function(x) gsub("[[:punct:]]\\d*", "", x)) filtered.list <- setdiff(rownames(DataPrep), rownames(DataFilt)) Res <- DataPrep[!rownames(DataPrep) %in% filtered.list,] return(Res) } #' @title Adding information related to DEGs genes from DEA as mean values in two conditions. #' @description #' TCGAanalyze_LevelTab allows user to add information related to DEGs genes from #' Differentially expression analysis (DEA) such as mean values and in two conditions. #' @param FC_FDR_table_mRNA Output of dataDEGs filter by abs(LogFC) >=1 #' @param typeCond1 a string containing the class label of the samples #' in TableCond1 (e.g., control group) #' @param typeCond2 a string containing the class label of the samples #' in TableCond2 (e.g., case group) #' @param TableCond1 numeric matrix, each row represents a gene, each column #' represents a sample with Cond1type #' @param TableCond2 numeric matrix, each row represents a gene, each column #' represents a sample with Cond2type #' @param typeOrder typeOrder #' @export #' @return table with DEGs, log Fold Change (FC), false discovery rate (FDR), #' the gene expression level #' for samples in Cond1type, and Cond2type, and Delta value (the difference #' of gene expression between the two #' conditions multiplied logFC) #' @examples #' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo) #' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut = 0.25) #' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT")) #' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP")) #' dataDEGs <- TCGAanalyze_DEA( #' dataFilt[,samplesNT], #' dataFilt[,samplesTP], #' Cond1type = "Normal", #' Cond2type = "Tumor" #' ) #' dataDEGsFilt <- dataDEGs[abs(dataDEGs$logFC) >= 1,] #' dataTP <- dataFilt[,samplesTP] #' dataTN <- dataFilt[,samplesNT] #' dataDEGsFiltLevel <- TCGAanalyze_LevelTab( #' FC_FDR_table_mRNA = dataDEGsFilt, #' typeCond1 = "Tumor", #' typeCond2 = "Normal", #' TableCond1 = dataTP, #' TableCond2 = dataTN #' ) TCGAanalyze_LevelTab <- function( FC_FDR_table_mRNA, typeCond1, typeCond2, TableCond1, TableCond2, typeOrder = TRUE ) { TableLevel <- data.frame( "mRNA" = rownames(FC_FDR_table_mRNA), "logFC" = FC_FDR_table_mRNA$logFC, "FDR" = FC_FDR_table_mRNA$FDR, "Delta" = FC_FDR_table_mRNA$logFC * rowMeans(TableCond1[rownames(FC_FDR_table_mRNA),],na.rm = TRUE) ) TableLevel[[typeCond1]] <- rowMeans(TableCond1[TableLevel$mRNA,],na.rm = TRUE) TableLevel[[typeCond2]] <- rowMeans(TableCond2[TableLevel$mRNA,],na.rm = TRUE) TableLevel <- TableLevel[order(as.numeric(TableLevel[, "Delta"]), decreasing = typeOrder),] rownames(TableLevel) <- TableLevel[, "mRNA"] if (all(grepl("ENSG", rownames(TableLevel)))) TableLevel <- cbind(TableLevel, map.ensg(genes = rownames(TableLevel))[, 2:3]) return(TableLevel) } #' @title Enrichment analysis for Gene Ontology (GO) [BP,MF,CC] and Pathways #' @description #' Researchers, in order to better understand the underlying biological #' processes, often want to retrieve a functional profile of a set of genes #' that might have an important role. This can be done by performing an #' enrichment analysis. #' #'We will perform an enrichment analysis on gene sets using the TCGAanalyze_EAcomplete #'function. Given a set of genes that are #'up-regulated under certain conditions, an enrichment analysis will find #'identify classes of genes or proteins that are #'over-represented using #'annotations for that gene set. #' @param TFname is the name of the list of genes or TF's regulon. #' @param RegulonList List of genes such as TF's regulon or DEGs where to find enrichment. #' @export #' @return Enrichment analysis GO[BP,MF,CC] and Pathways complete table enriched by genelist. #' @examples #' Genelist <- c("FN1","COL1A1") #' ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist) #' \dontrun{ #' Genelist <- rownames(dataDEGsFiltLevel) #' system.time(ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist)) #' } TCGAanalyze_EAcomplete <- function(TFname, RegulonList) { # This is a verification of the input # in case the List is like Gene|ID # we will get only the Gene if (all(grepl("\\|", RegulonList))) { RegulonList <- strsplit(RegulonList, "\\|") RegulonList <- unlist(lapply(RegulonList, function(x) x[1])) } print( paste( "I need about ", "1 minute to finish complete ", "Enrichment analysis GO[BP,MF,CC] and Pathways... " ) ) ResBP <- TCGAanalyze_EA(TFname, RegulonList, DAVID_BP_matrix, EAGenes, GOtype = "DavidBP") print("GO Enrichment Analysis BP completed....done") ResMF <- TCGAanalyze_EA(TFname, RegulonList, DAVID_MF_matrix, EAGenes, GOtype = "DavidMF") print("GO Enrichment Analysis MF completed....done") ResCC <- TCGAanalyze_EA(TFname, RegulonList, DAVID_CC_matrix, EAGenes, GOtype = "DavidCC") print("GO Enrichment Analysis CC completed....done") ResPat <- TCGAanalyze_EA(TFname, RegulonList, listEA_pathways, EAGenes, GOtype = "Pathway") print("Pathway Enrichment Analysis completed....done") ans <- list( ResBP = ResBP, ResMF = ResMF, ResCC = ResCC, ResPat = ResPat ) return(ans) } #' @title Enrichment analysis of a gene-set with GO [BP,MF,CC] and pathways. #' @description #' The rational behind a enrichment analysis ( gene-set, pathway etc) is to compute #' statistics of whether the overlap between the focus list (signature) and the gene-set #' is significant. ie the confidence that overlap between the list is not due to chance. #' The Gene Ontology project describes genes (gene products) using terms from #' three structured vocabularies: biological process, cellular component and molecular function. #' The Gene Ontology Enrichment component, also referred to as the GO Terms" component, allows #' the genes in any such "changed-gene" list to be characterized using the Gene Ontology terms #' annotated to them. It asks, whether for any particular GO term, the fraction of genes #' assigned to it in the "changed-gene" list is higher than expected by chance #' (is over-represented), relative to the fraction of genes assigned to that term in the #' reference set. #' In statistical terms it perform the analysis tests the null hypothesis that, #' for any particular ontology term, there is no difference in the proportion of genes #' annotated to it in the reference list and the proportion annotated to it in the test list. #' We adopted a Fisher Exact Test to perform the EA. #' @param GeneName is the name of gene signatures list #' @param TableEnrichment is a table related to annotations of gene symbols such as #' GO[BP,MF,CC] and Pathways. It was created from DAVID gene ontology on-line. #' @param RegulonList is a gene signature (lisf of genes) in which perform EA. #' @param GOtype is type of gene ontology Biological process (BP), Molecular Function (MF), #' Cellular componet (CC) #' @param FDRThresh pvalue corrected (FDR) as threshold to selected significant #' BP, MF,CC, or pathways. (default FDR < 0.01) #' @param EAGenes is a table with informations about genes #' such as ID, Gene, Description, Location and Family. #' @param GeneSymbolsTable if it is TRUE will return a table with GeneSymbols in common GO or pathways. # @export #' @import stats #' @return Table with enriched GO or pathways by selected gene signature. #' @examples #' \dontrun{ #' EAGenes <- get("EAGenes") #' RegulonList <- rownames(dataDEGsFiltLevel) #' ResBP <- TCGAanalyze_EA( #' GeneName="DEA genes Normal Vs Tumor", #' RegulonList = RegulonList, #' TableEnrichment = DAVID_BP_matrix, #' EAGenes = EAGenes, #' GOtype = "DavidBP" #' ) #'} TCGAanalyze_EA <- function ( GeneName, RegulonList, TableEnrichment, EAGenes, GOtype, FDRThresh = 0.01, GeneSymbolsTable = FALSE ) { topPathways <- nrow(TableEnrichment) topPathways_tab <- matrix(0, 1, topPathways) topPathways_tab <- as.matrix(topPathways_tab) rownames(topPathways_tab) <- GeneName rownames(EAGenes) <- toupper(rownames(EAGenes)) EAGenes <- EAGenes[!duplicated(EAGenes[, "ID"]),] rownames(EAGenes) <- EAGenes[, "ID"] allgene <- EAGenes[, "ID"] current_pathway_from_EA <- as.matrix(TableEnrichment[, GOtype]) TableNames <- gsub("David", "", paste("Top ", GOtype, " n. ", 1:topPathways, " of ", topPathways, sep = "")) colnames(topPathways_tab) <- TableNames topPathways_tab <- as.data.frame(topPathways_tab) table_pathway_enriched <- matrix(1, nrow(current_pathway_from_EA), 8) colnames(table_pathway_enriched) <- c( "Pathway", "GenesInPathway", "Pvalue", "FDR", "CommonGenesPathway", "PercentPathway", "PercentRegulon", "CommonGeneSymbols" ) table_pathway_enriched <- as.data.frame(table_pathway_enriched) for (i in 1:nrow(current_pathway_from_EA)) { table_pathway_enriched[i, "Pathway"] <- as.character(current_pathway_from_EA[i,]) if (nrow(TableEnrichment) == 589) { genes_from_current_pathway_from_EA <- GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] == as.character(current_pathway_from_EA[i,]),][, "Molecules"], ",") } else { genes_from_current_pathway_from_EA <- GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] == as.character(current_pathway_from_EA[i,]),][, "Molecules"], ", ") } genes_common_pathway_TFregulon <- as.matrix(intersect( toupper(RegulonList), toupper(genes_from_current_pathway_from_EA) )) if (length(genes_common_pathway_TFregulon) != 0) { current_pathway_commongenes_num <- length(genes_common_pathway_TFregulon) seta <- allgene %in% RegulonList setb <- allgene %in% genes_from_current_pathway_from_EA ft <- fisher.test(seta, setb) FisherpvalueTF <- ft$p.value table_pathway_enriched[i, "Pvalue"] <- as.numeric(FisherpvalueTF) if (FisherpvalueTF < 0.01) { current_pathway_commongenes_percent <- paste( "(", format(( current_pathway_commongenes_num / length(genes_from_current_pathway_from_EA) ) * 100, digits = 2 ), "%)" ) current_pathway_commongenes_num_with_percent <- gsub( " ", "", paste( current_pathway_commongenes_num, current_pathway_commongenes_percent, "pv=", format(FisherpvalueTF, digits = 2) ) ) table_pathway_enriched[i, "CommonGenesPathway"] <- length(genes_common_pathway_TFregulon) table_pathway_enriched[i, "CommonGeneSymbols"] <- paste0(genes_common_pathway_TFregulon, collapse = ";") table_pathway_enriched[i, "GenesInPathway"] <- length(genes_from_current_pathway_from_EA) table_pathway_enriched[i, "PercentPathway"] <- as.numeric(table_pathway_enriched[i, "CommonGenesPathway"]) / as.numeric(table_pathway_enriched[i, "GenesInPathway"]) * 100 table_pathway_enriched[i, "PercentRegulon"] <- as.numeric(table_pathway_enriched[i, "CommonGenesPathway"]) / length(RegulonList) * 100 } } } table_pathway_enriched <- table_pathway_enriched[order(table_pathway_enriched[, "Pvalue"], decreasing = FALSE),] table_pathway_enriched <- table_pathway_enriched[table_pathway_enriched[, "Pvalue"] < 0.01,] table_pathway_enriched[, "FDR"] <- p.adjust(table_pathway_enriched[, "Pvalue"], method = "fdr") table_pathway_enriched <- table_pathway_enriched[table_pathway_enriched[, "FDR"] < FDRThresh,] table_pathway_enriched <- table_pathway_enriched[order(table_pathway_enriched[, "FDR"], decreasing = FALSE),] table_pathway_enriched_filt <- table_pathway_enriched[table_pathway_enriched$FDR < FDRThresh, ] if (nrow(table_pathway_enriched) > 0) { tmp <- table_pathway_enriched tmp <- paste( tmp[, "Pathway"], "; FDR= ", format(tmp[, "FDR"], digits = 3), "; (ng=", round(tmp[, "GenesInPathway"]), "); (ncommon=", format(tmp[, "CommonGenesPathway"], digits = 2), ")", sep = "" ) tmp <- as.matrix(tmp) topPathways_tab <- topPathways_tab[, 1:nrow(table_pathway_enriched), drop = FALSE] topPathways_tab[1,] <- tmp } else { topPathways_tab <- NA } if (GeneSymbolsTable == FALSE) { return(topPathways_tab) } if (GeneSymbolsTable == TRUE) { return(table_pathway_enriched_filt) } } #' @title Differentially expression analysis (DEA) using limma package. #' @description Differentially expression analysis (DEA) using limma package. #' @param FC.cut write #' @param AffySet A matrix-like data object containing log-ratios or log-expression values #' for a series of arrays, with rows corresponding to genes and columns to samples #' @examples #' \dontrun{ #' to add example #' } #' @export #' @return List of list with tables in 2 by 2 comparison #' of the top-ranked genes from a linear model fitted by DEA's limma TCGAanalyze_DEA_Affy <- function(AffySet, FC.cut = 0.01) { if (!requireNamespace("Biobase", quietly = TRUE)) { stop("Biobase package is needed for this function to work. Please install it.", call. = FALSE) } if (!requireNamespace("limma", quietly = TRUE)) { stop("limma package is needed for this function to work. Please install it.", call. = FALSE) } Pdatatable <- Biobase::phenoData(AffySet) f <- factor(Pdatatable$Disease) groupColors <- names(table(f)) tmp <- matrix(0, length(groupColors), length(groupColors)) colnames(tmp) <- groupColors rownames(tmp) <- groupColors tmp[upper.tri(tmp)] <- 1 sample_tab <- Pdatatable f <- factor(Pdatatable$Disease) design <- model.matrix( ~ 0 + f) colnames(design) <- levels(f) fit <- limma::lmFit(AffySet, design) ## fit is an object of class MArrayLM. groupColors <- names(table(Pdatatable$Disease)) CompleteList <- vector("list", sum(tmp)) k <- 1 for (i in 1:length(groupColors)) { col1 <- colnames(tmp)[i] for (j in 1:length(groupColors)) { col2 <- rownames(tmp)[j] if (i != j) { if (tmp[i, j] != 0) { Comparison <- paste(col2, "-", col1, sep = "") if (i == 4 && j == 6) { Comparison <- paste(col1, "-", col2, sep = "") } if (i == 5 && j == 6) { Comparison <- paste(col1, "-", col2, sep = "") } print(paste(i, j, Comparison, "to do...")) cont.matrix <- limma::makeContrasts(I = Comparison, levels = design) fit2 <- limma::contrasts.fit(fit, cont.matrix) fit2 <- limma::eBayes(fit2) sigI <- limma::topTable( fit2, coef = 1, adjust.method = "BH", sort.by = "B", p.value = 0.05, lfc = FC.cut, number = 50000 ) sigIbis <- sigI[order(abs(as.numeric(sigI$logFC)), decreasing = TRUE), ] names(CompleteList)[k] <- gsub("-", "_", Comparison) CompleteList[[k]] <- sigIbis k <- k + 1 } } } } return(CompleteList) } #' @title Generate network #' @description TCGAanalyze_analyseGRN perform gene regulatory network. #' @param TFs a vector of genes. #' @param normCounts is a matrix of gene expression with genes in rows and samples in columns. #' @param kNum the number of nearest neighbors to consider to estimate the mutual information. #' Must be less than the number of columns of normCounts. #' @export #' @return an adjacent matrix TCGAanalyze_analyseGRN <- function(TFs, normCounts, kNum) { if (!requireNamespace("parmigene", quietly = TRUE)) { stop( "parmigene package is needed for this function to work. Please install it.", call. = FALSE ) } MRcandidates <- intersect(rownames(normCounts), TFs) # Mutual information between TF and genes sampleNames <- colnames(normCounts) geneNames <- rownames(normCounts) messageMI_TFgenes <- paste( "Estimation of MI among [", length(MRcandidates), " TRs and ", nrow(normCounts), " genes].....", sep = "" ) timeEstimatedMI_TFgenes1 <- length(MRcandidates) * nrow(normCounts) / 1000 timeEstimatedMI_TFgenes <- format(timeEstimatedMI_TFgenes1 * ncol(normCounts) / 17000, digits = 2) messageEstimation <- print( paste( "I Need about ", timeEstimatedMI_TFgenes, "seconds for this MI estimation. [Processing 17000k elements /s] " ) ) miTFGenes <- parmigene::knnmi.cross(normCounts[MRcandidates,], normCounts, k = kNum) return(miTFGenes) } #' @title Generate pathview graph #' @description TCGAanalyze_Pathview pathway based data integration and visualization. #' @param dataDEGs dataDEGs #' @param pathwayKEGG pathwayKEGG #' @export #' @return an adjacent matrix #' @examples #' \dontrun{ #' dataDEGs <- data.frame(mRNA = c("TP53","TP63","TP73"), logFC = c(1,2,3)) #' TCGAanalyze_Pathview(dataDEGs) #' } TCGAanalyze_Pathview <- function(dataDEGs, pathwayKEGG = "hsa05200") { if (!requireNamespace("clusterProfiler", quietly = TRUE)) { stop("clusterProfiler needed for this function to work. Please install it.", call. = FALSE) } if (!requireNamespace("pathview", quietly = TRUE)) { stop("pathview needed for this function to work. Please install it.", call. = FALSE) } # Converting Gene symbol to gene ID eg = as.data.frame( clusterProfiler::bitr( dataDEGs$mRNA, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db" ) ) eg <- eg[!duplicated(eg$SYMBOL), ] dataDEGs <- dataDEGs[dataDEGs$mRNA %in% eg$SYMBOL, ] dataDEGs <- dataDEGs[order(dataDEGs$mRNA, decreasing = FALSE), ] eg <- eg[order(eg$SYMBOL, decreasing = FALSE), ] dataDEGs$GeneID <- eg$ENTREZID dataDEGsFiltLevel_sub <- subset(dataDEGs, select = c("GeneID", "logFC")) genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC) names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID hsa05200 <- pathview::pathview( gene.data = genelistDEGs, pathway.id = pathwayKEGG, species = "hsa", limit = list(gene = as.integer(max( abs(genelistDEGs) ))) ) } #' @title infer gene regulatory networks #' @description TCGAanalyze_networkInference taking expression data as input, #' this will return an adjacency matrix of interactions #' @param data expression data, genes in columns, samples in rows #' @param optionMethod inference method, chose from aracne, c3net, clr and mrnet #' @export #' @return an adjacent matrix TCGAanalyze_networkInference <- function(data, optionMethod = "clr") { # Converting Gene symbol to gene ID if (optionMethod == "c3net") { if (!requireNamespace("c3net", quietly = TRUE)) { stop( "c3net package is needed for this function to work. Please install it.", call. = FALSE ) } net <- c3net::c3net(t(data)) } else { if (!requireNamespace("minet", quietly = TRUE)) { stop( "minet package is needed for this function to work. Please install it.", call. = FALSE ) } net <- minet::minet(data, method = optionMethod) } return(net) } #' Creates a plot for GAIA output (all significant aberrant regions.) #' @description #' This function is a auxiliary function to visualize GAIA output #' (all significant aberrant regions.) #' @param calls A matrix with the following columns: Chromossome, Aberration Kind #' Region Start, Region End, Region Size and score #' @param threshold Score threshold (orange horizontal line in the plot) #' @export #' @importFrom graphics abline axis legend plot points #' @return A plot with all significant aberrant regions. #' @examples #' call <- data.frame("Chromossome" = rep(9,100), #' "Aberration Kind" = rep(c(-2,-1,0,1,2),20), #' "Region Start [bp]" = 18259823:18259922, #' "Region End [bp]" = 18259823:18259922, #' "score" = rep(c(1,2,3,4),25)) #' gaiaCNVplot(call,threshold = 0.01) #' call <- data.frame("Chromossome" = rep(c(1,9),50), #' "Aberration Kind" = rep(c(-2,-1,0,1,2),20), #' "Region Start [bp]" = 18259823:18259922, #' "Region End [bp]" = 18259823:18259922, #' "score" = rep(c(1,2,3,4),25)) #' gaiaCNVplot(call,threshold = 0.01) gaiaCNVplot <- function (calls, threshold = 0.01) { Calls <- calls[order(calls[, grep("start", colnames(calls), ignore.case = TRUE)]), ] Calls <- Calls[order(Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]), ] rownames(Calls) <- NULL Chromo <- Calls[, grep("chr", colnames(calls), ignore.case = TRUE)] Gains <- apply(Calls, 1, function(x) ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 1, x["score"], 0)) Losses <- apply(Calls, 1, function(x) ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 0, x["score"], 0)) plot( Gains, ylim = c(-max(Calls[, "score"] + 2), max(Calls[, "score"] + 2)), type = "h", col = "red", xlab = "Chromosome", ylab = "Score", xaxt = "n" ) points(-(Losses), type = "h", col = "blue") # Draw origin line abline(h = 0, cex = 4) # Draw threshold lines abline( h = -log10(threshold), col = "orange", cex = 4, main = "test" ) abline( h = log10(threshold), col = "orange", cex = 4, main = "test" ) uni.chr <- unique(Chromo) temp <- rep(0, length(uni.chr)) for (i in 1:length(uni.chr)) { temp[i] <- max(which(uni.chr[i] == Chromo)) } for (i in 1:length(temp)) { abline(v = temp[i], col = "black", lty = "dashed") } nChroms <- length(uni.chr) begin <- c() for (d in 1:nChroms) { chrom <- sum(Chromo == uni.chr[d]) begin <- append(begin, chrom) } temp2 <- rep(0, nChroms) for (i in 1:nChroms) { if (i == 1) { temp2[1] <- (begin[1] * 0.5) } else if (i > 1) { temp2[i] <- temp[i - 1] + (begin[i] * 0.5) } } uni.chr[uni.chr == 23] <- "X" uni.chr[uni.chr == 24] <- "Y" for (i in 1:length(temp)) { axis(1, at = temp2[i], labels = uni.chr[i], cex.axis = 1) } legend( x = 1, y = max(Calls[, "score"] + 2), y.intersp = 0.8, c("Amp"), pch = 15, col = c("red"), text.font = 3 ) legend( x = 1, y = -max(Calls[, "score"] + 0.5), y.intersp = 0.8, c("Del"), pch = 15, col = c("blue"), text.font = 3 ) } #' Get a matrix of interactions of genes from biogrid #' @description #' Using biogrid database, it will create a matrix of gene interactions. #' If columns A and row B has value 1, it means the gene A and gene B interacts. #' @param tmp.biogrid Biogrid table #' @export #' @param names.genes List of genes to filter from output. Default: consider all genes #' @return A matrix with 1 for genes that interacts, 0 for no interaction. #' @examples #' names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1") #' tmp.biogrid <- data.frame("Official.Symbol.Interactor.A" = names.genes.de, #' "Official.Symbol.Interactor.B" = rev(names.genes.de)) #' net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de) #' \dontrun{ #' file <- paste0("http://thebiogrid.org/downloads/archives/", #' "Release%20Archive/BIOGRID-3.4.133/BIOGRID-ALL-3.4.133.tab2.zip") #' downloader::download(file,basename(file)) #' unzip(basename(file),junkpaths =TRUE) #' tmp.biogrid <- read.csv(gsub("zip","txt",basename(file)), #' header=TRUE, sep="\t", stringsAsFactors=FALSE) #' names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1") #' net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de) #' } getAdjacencyBiogrid <- function(tmp.biogrid, names.genes = NULL) { it.a <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[1] it.b <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[2] if (is.null(names.genes)) { names.genes <- sort(union(unique(tmp.biogrid[, it.a]), unique(tmp.biogrid[, it.b]))) ind <- seq(1, nrow(tmp.biogrid)) } else { ind.A <- which(tmp.biogrid[, it.a] %in% names.genes) ind.B <- which(tmp.biogrid[, it.b] %in% names.genes) ind <- intersect(ind.A, ind.B) } mat.biogrid <- matrix( 0, nrow = length(names.genes), ncol = length(names.genes), dimnames = list(names.genes, names.genes) ) for (i in ind) { mat.biogrid[tmp.biogrid[i, it.a], tmp.biogrid[i, it.b]] <- mat.biogrid[tmp.biogrid[i, it.b], tmp.biogrid[i, it.a]] <- 1 } diag(mat.biogrid) <- 0 return(mat.biogrid) } #' Get GDC samples with both DNA methylation (HM450K) and Gene expression data from #' GDC databse #' @description #' For a given TCGA project it gets the samples (barcode) with both DNA methylation and Gene expression data #' from GDC database #' @param project A GDC project #' @param n Number of samples to return. If NULL return all (default) #' @param legacy Access legacy (hg19) or harmonized database (hg38). #' @return A vector of barcodes #' @export #' @examples #' # Get ACC samples with both DNA methylation (HM450K) and gene expression aligned to hg19 #' samples <- matchedMetExp("TCGA-UCS", legacy = TRUE) matchedMetExp <- function(project, legacy = FALSE, n = NULL) { if (legacy) { # get primary solid tumor samples: DNA methylation message("Download DNA methylation information") met450k <- GDCquery( project = project, data.category = "DNA methylation", platform = "Illumina Human Methylation 450", legacy = TRUE, sample.type = c("Primary Tumor") ) # get primary solid tumor samples: RNAseq message("Download gene expression information") exp <- GDCquery( project = project, data.category = "Gene expression", data.type = "Gene expression quantification", platform = "Illumina HiSeq", file.type = "results", sample.type = c("Primary Tumor"), legacy = TRUE ) } else { # get primary solid tumor samples: DNA methylation message("Download DNA methylation information") met450k <- GDCquery( project = project, data.category = "DNA Methylation", platform = "Illumina Human Methylation 450", sample.type = c("Primary Tumor") ) # get primary solid tumor samples: RNAseq message("Download gene expression information") exp <- GDCquery( project = project, data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = "HTSeq - Counts" ) } met450k.tp <- met450k$results[[1]]$cases # Get patients with samples in both platforms exp.tp <- exp$results[[1]]$cases patients <- unique(substr(exp.tp, 1, 15)[substr(exp.tp, 1, 12) %in% substr(met450k.tp, 1, 12)]) if (!is.null(n)) patients <- patients[1:n] # get only n samples return(patients) }