#' Pick power to fit network to a scale-free topology #' #' @param exp A gene expression data frame with genes in row names and #' samples in column names or a `SummarizedExperiment` object. #' @param net_type Network type. One of 'signed', 'signed hybrid' or #' 'unsigned'. Default is signed. #' @param rsquared R squared cutoff. Default is 0.8. #' @param cor_method Correlation method. One of "pearson", "biweight" #' or "spearman". Default is "spearman". #' #' @return A list containing: \itemize{ #' \item{power}{Optimal power based on scale-free topology fit} #' \item{plot}{A ggplot object displaying main statistics of the SFT fit test} #' } #' #' @author Fabricio Almeida-Silva #' @seealso #' \code{\link[WGCNA]{pickSoftThreshold}} #' @rdname SFT_fit #' @export #' @importFrom WGCNA pickSoftThreshold #' @importFrom ggpubr ggarrange ggscatter #' @importFrom ggplot2 theme element_text geom_hline #' @examples #' data(filt.se) #' sft <- SFT_fit(filt.se, cor_method="pearson") SFT_fit <- function(exp, net_type="signed", rsquared=0.8, cor_method="spearman") { exp <- handleSE(exp) texp <- t(exp) if(cor_method == "pearson") { sft <- WGCNA::pickSoftThreshold(texp, networkType = net_type, powerVector=3:20, RsquaredCut = rsquared) } else if(cor_method == "biweight") { sft <- WGCNA::pickSoftThreshold(texp, networkType = net_type, powerVector=3:20, RsquaredCut = rsquared, corFnc = bicor, corOptions = list(use = 'p', maxPOutliers = 0.05)) } else if (cor_method == "spearman"){ sft <- WGCNA::pickSoftThreshold(texp, networkType = net_type, powerVector=3:20, RsquaredCut = rsquared, corOptions = list(use = 'p', method = "spearman")) } else { stop("Please, specify a correlation method (one of 'spearman', 'pearson' or 'biweight').") } wgcna_power <- sft$powerEstimate if(is.na(wgcna_power)){ message("No power reached R-squared cut-off, now choosing max R-squared based power") wgcna_power <- sft$fitIndices$Power[which(sft$fitIndices$SFT.R.sq == max(sft$fitIndices$SFT.R.sq))] } # Create data frame with indices to plot sft_df <- data.frame(power = sft$fitIndices$Power, fit = -sign(sft$fitIndices$slope) * sft$fitIndices$SFT.R.sq, meank = sft$fitIndices$mean.k.) # Plot 1 p1 <- ggpubr::ggscatter(sft_df, x="power", y="fit", xlab="Soft threshold (power)", ylab=expression(paste("Scale-free topology fit - ", R^{2})), title = "Scale independence", ylim=c(0, 1), label="power", size=1, font.label=10, color="gray10", font.tickslab=10, ytickslab.rt=90) + geom_hline(yintercept = rsquared, color="brown3") + theme(plot.title = element_text(hjust = 0.5)) # Plot 2 p2 <- ggscatter(sft_df, x="power", y="meank", xlab="Soft threshold (power)", ylab="Mean connectivity (k)", title="Mean connectivity", label="power", size=1, font.label=10, color="gray10", font.tickslab=10, ytickslab.rt=90) + theme(plot.title = element_text(hjust=0.5)) # Combined plot sft_plot <- ggpubr::ggarrange(p1, p2) result <- list(power=as.numeric(wgcna_power), plot=sft_plot) return(result) } #' Reconstruct gene coexpression network from gene expression #' #' @param exp A gene expression data frame with genes in row names and #' samples in column names or a `SummarizedExperiment` object. #' @param net_type Network type. One of 'signed', 'signed hybrid' or #' 'unsigned'. Default: 'signed'. #' @param module_merging_threshold Correlation threshold to merge similar #' modules into a single one. Default: 0.8. #' @param SFTpower SFT power generated by the function \code{SFT_fit}. #' @param cor_method Correlation method. One of "pearson", "biweight" or #' "spearman". Default is "spearman". #' @param verbose Logical indicating whether to display progress #' messages or not. Default: FALSE. #' #' @return List containing: \itemize{ #' \item Adjacency matrix #' \item Data frame of module eigengenes #' \item Data frame of genes and their corresponding modules #' \item Data frame of intramodular connectivity #' \item Vector of module assignment #' \item Correlation matrix #' \item Parameters used for network reconstruction #' \item Objects to plot the dendrogram in \code{plot_dendro_and_colors}. #' } #' @author Fabricio Almeida-Silva #' @seealso #' \code{\link[WGCNA]{adjacency.fromSimilarity}},\code{\link[WGCNA]{TOMsimilarity}},\code{\link[WGCNA]{standardColors}},\code{\link[WGCNA]{labels2colors}},\code{\link[WGCNA]{moduleEigengenes}},\code{\link[WGCNA]{plotEigengeneNetworks}},\code{\link[WGCNA]{mergeCloseModules}},\code{\link[WGCNA]{plotDendroAndColors}},\code{\link[WGCNA]{intramodularConnectivity}} #' \code{\link[dynamicTreeCut]{cutreeDynamicTree}} #' @rdname exp2gcn #' @export #' @importFrom WGCNA TOMsimilarity standardColors labels2colors moduleEigengenes plotEigengeneNetworks mergeCloseModules plotDendroAndColors intramodularConnectivity #' @importFrom dynamicTreeCut cutreeDynamicTree #' @importFrom stats as.dist median cor fisher.test hclust na.omit prcomp qnorm qqplot quantile sd var #' @importFrom grDevices colorRampPalette #' @examples #' data(filt.se) #' # The SFT fit was previously calculated and the optimal power was 16 #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") exp2gcn <- function(exp, net_type="signed", module_merging_threshold = 0.8, SFTpower = NULL, cor_method = "spearman", verbose = FALSE) { params <- list(net_type=net_type, module_merging_threshold=module_merging_threshold, SFTpower=SFTpower, cor_method=cor_method) norm.exp <- handleSE(exp) if(is.null(SFTpower)) { stop("Please, specify the SFT power.") } if(verbose) { message("Calculating adjacency matrix...") } cor_matrix <- calculate_cor_adj(cor_method, norm.exp, SFTpower, net_type)$cor adj_matrix <- calculate_cor_adj(cor_method, norm.exp, SFTpower, net_type)$adj #Convert to matrix gene_ids <- rownames(adj_matrix) adj_matrix <- matrix(adj_matrix, nrow=nrow(adj_matrix)) rownames(adj_matrix) <- gene_ids colnames(adj_matrix) <- gene_ids #Calculate TOM from adjacency matrix if(verbose) { message("Calculating topological overlap matrix (TOM)...") } tomtype <- get_TOMtype(net_type) TOM <- WGCNA::TOMsimilarity(adj_matrix, TOMType = tomtype) #Hierarchically cluster genes dissTOM <- 1-TOM #hclust takes a distance structure geneTree <- hclust(as.dist(dissTOM), method="average") #Detecting coexpression modules if(verbose) { message("Detecting coexpression modules...") } old.module_labels <- dynamicTreeCut::cutreeDynamicTree(dendro=geneTree, minModuleSize=30, deepSplit=TRUE) nmod <- length(unique(old.module_labels)) palette <- rev(WGCNA::standardColors(nmod)) old.module_colors <- WGCNA::labels2colors(old.module_labels, colorSeq = palette) #Calculate eigengenes and merge the ones who are highly correlated if(verbose) { message("Calculating module eigengenes (MEs)...") } old.MElist <- WGCNA::moduleEigengenes(t(norm.exp), colors = old.module_colors, softPower = SFTpower) old.MEs <- old.MElist$eigengenes #Calculate dissimilarity of module eigengenes MEDiss1 <- 1-cor(old.MEs) #Hierarchically cluster module eigengenes to see how they're related old.METree <- hclust(as.dist(MEDiss1), method="average") #Then, choose a height cut MEDissThreshold <- 1-module_merging_threshold #Merge the modules. if(verbose) { message("Merging similar modules...") } if(cor_method == "pearson") { merge1 <- WGCNA::mergeCloseModules(t(norm.exp), old.module_colors, cutHeight = MEDissThreshold, verbose = 0, colorSeq=palette) } else if(cor_method == "spearman") { merge1 <- WGCNA::mergeCloseModules(t(norm.exp), old.module_colors, cutHeight = MEDissThreshold, verbose = 0, corOptions = list(use = "p", method = "spearman"), colorSeq=palette) } else if(cor_method == "biweight") { merge1 <- WGCNA::mergeCloseModules(t(norm.exp), old.module_colors, cutHeight = MEDissThreshold, verbose = 0, corFnc = bicor, colorSeq=palette) } else { stop("Please, specify a correlation method. One of 'spearman', 'pearson' or 'biweight'.") } new.module_colors <- merge1$colors new.MEs <- merge1$newMEs #Plot dendrogram of merged modules eigengenes new.METree <- hclust(as.dist(1-cor(new.MEs)), method="average") #Get data frame with genes and modules they belong to genes_and_modules <- as.data.frame(cbind(gene_ids, new.module_colors), stringsAsFactors = FALSE) colnames(genes_and_modules) <- c("Genes", "Modules") if(verbose) { message("Calculating intramodular connectivity...") } kwithin <- WGCNA::intramodularConnectivity(adj_matrix, new.module_colors) result.list <- list(adjacency_matrix = adj_matrix, MEs = new.MEs, genes_and_modules = genes_and_modules, kIN = kwithin, moduleColors = new.module_colors, correlation_matrix = cor_matrix, params=params, dendro_plot_objects = list(tree = geneTree, unmerged = old.module_colors) ) return(result.list) } #' Plot eigengene network #' #' @param gcn List object returned by \code{exp2gcn}. #' #' @return A base plot with the eigengene network #' @importFrom WGCNA plotEigengeneNetworks #' @importFrom graphics par layout #' @export #' @rdname plot_eigengene_network #' @examples #' data(filt.se) #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") #' plot_eigengene_network(gcn) plot_eigengene_network <- function(gcn) { on.exit(graphics::layout(1)) opar <- par(no.readonly=TRUE) on.exit(par(opar), add=TRUE, after=FALSE) p <- WGCNA::plotEigengeneNetworks(gcn$MEs, "", marDendro = c(3,5,2,6), marHeatmap = c(3,4,2,2)) return(p) } #' Plot dendrogram of genes and modules #' #' @param gcn List object returned by \code{exp2gcn}. #' #' @return A base plot with the gene dendrogram and modules. #' @importFrom WGCNA plotDendroAndColors #' @importFrom graphics layout par #' @export #' @rdname plot_dendro_and_colors #' @examples #' data(filt.se) #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") #' plot_dendro_and_colors(gcn) plot_dendro_and_colors <- function(gcn) { on.exit(graphics::layout(1)) opar <- par(no.readonly=TRUE) on.exit(par(opar), add=TRUE, after=FALSE) p <- WGCNA::plotDendroAndColors(gcn$dendro_plot_objects$tree, cbind( gcn$dendro_plot_objects$unmerged, gcn$moduleColors ), c("Unmerged", "Merged"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) return(NULL) } #' Perform module stability analysis #' #' @param exp A gene expression data frame with genes in row names and #' samples in column names or a `SummarizedExperiment` object. #' @param net List object returned by \code{exp2gcn}. #' @param nRuns Number of times to resample. Default is 20. #' #' @return A base plot with the module stability results. #' @seealso #' \code{\link[WGCNA]{sampledBlockwiseModules}},\code{\link[WGCNA]{matchLabels}},\code{\link[WGCNA]{plotDendroAndColors}} #' @rdname module_stability #' @export #' @importFrom WGCNA sampledBlockwiseModules matchLabels plotDendroAndColors #' @importFrom graphics par layout #' @examples #' data(filt.se) #' # The SFT fit was previously calculated and the optimal power was 16 #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") #' # For simplicity, only 2 runs #' module_stability(exp = filt.se, net = gcn, nRuns = 2) module_stability <- function(exp, net, nRuns = 20) { norm.exp <- handleSE(exp) expr <- as.matrix(t(norm.exp)) net_type <- net$params$net_type SFTpower <- net$params$SFTpower cor_method <- net$params$cor_method module_merging_threshold <- 1 - net$params$module_merging_threshold TOMType <- get_TOMtype(net_type) mods0 <- WGCNA::sampledBlockwiseModules( nRuns = nRuns, replace = FALSE, datExpr = expr, maxBlockSize = 5000, checkSoftPower = FALSE, corType = cor_method, networkType = net_type, TOMType = TOMType, TOMDenom = "mean", mergeCutHeight = module_merging_threshold, reassignThreshold = 0, numericLabels = FALSE, checkMissingData = FALSE, quickCor = 0, verbose = 2 ) nGenes <- ncol(expr) # Define a matrix of labels for the original and all resampling runs labels <- matrix(0, nGenes, nRuns + 1) labels[, 1] <- mods0[[1]]$mods$colors # Relabel modules in each of the resampling runs labs <- Reduce(cbind, lapply(2:(nRuns+1), function(x) { labels[, x] <- WGCNA::matchLabels(mods0[[x-1]]$mods$colors, labels[, 1]) })) labels <- cbind(labels[,1], labs) on.exit(graphics::layout(1)) opar <- par(no.readonly=TRUE) on.exit(par(opar), add=TRUE, after=FALSE) p <- WGCNA::plotDendroAndColors(mods0[[1]]$mods$dendrograms[[1]], labels, c("Full data set", paste("Resampling", seq_len(nRuns))), main = "Dendrogram and modules: resampled data", autoColorHeight = FALSE, colorHeight = 0.65, dendroLabels = FALSE, hang = 0.03, guideHang = 0.05, addGuide = TRUE, guideAll = FALSE, cex.main = 1, cex.lab = 0.8, cex.colorLabels = 0.7, marAll = c(0, 5, 3, 0)) return(NULL) } #' Correlate module eigengenes to trait #' #' @param exp A gene expression data frame with genes in row names and #' samples in column names or a `SummarizedExperiment` object. #' @param metadata A data frame containing sample names in row names and #' sample annotation in the first column. Ignored if `exp` is #' a `SummarizedExperiment` object, since the function will extract colData. #' @param MEs Module eigengenes. It is the 2nd element of the result list #' generated by the function \code{exp2gcn}. #' @param cor_method Method to calculate correlation. One of 'pearson', #' 'spearman' or 'kendall'. Default is 'spearman'. #' @param transpose Logical indicating whether to transpose the heatmap of not. #' Default is FALSE. #' @param palette RColorBrewer's color palette to use. Default is "RdYlBu", #' a palette ranging from blue to red. #' @param continuous_trait Logical indicating if trait is a continuous variable. #' Default is FALSE. #' @param cex.lab.x Font size for x axis labels. Default: 0.6. #' @param cex.lab.y Font size for y axis labels. Default: 0.6. #' @param cex.text Font size for numbers inside matrix. Default: 0.6. #' #' @return A data frame with correlation and correlation p-values for each pair #' of ME and trait along with a heatmap. #' @details Significance levels: #' 1 asterisk: significant at alpha = 0.05. #' 2 asterisks: significant at alpha = 0.01. #' 3 asterisks: significant at alpha = 0.001. #' no asterisk: not significant. #' #' @author Fabricio Almeida-Silva #' @seealso #' \code{\link[WGCNA]{corPvalueStudent}},\code{\link[WGCNA]{labeledHeatmap}},\code{\link[WGCNA]{blueWhiteRed}} #' @rdname module_trait_cor #' @export #' @importFrom WGCNA corPvalueStudent labeledHeatmap #' @importFrom reshape2 melt #' @importFrom RColorBrewer brewer.pal #' @importFrom SummarizedExperiment colData #' @importFrom graphics par #' @examples #' data(filt.se) #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") #' module_trait_cor(filt.se, MEs=gcn$MEs) module_trait_cor <- function(exp, metadata, MEs, cor_method="spearman", transpose=FALSE, palette="RdYlBu", continuous_trait=FALSE, cex.lab.x=0.6, cex.lab.y=0.6, cex.text=0.6) { if(is(exp, "SummarizedExperiment")) { metadata <- as.data.frame(SummarizedExperiment::colData(exp)) } exp <- handleSE(exp) metadata <- metadata[colnames(exp), , drop=FALSE] trait <- handle_trait_type(metadata, continuous_trait) modtraitcor <- cor(as.matrix(MEs), trait, use = "p", method=cor_method) nSamples <- ncol(exp) modtraitpvalue <- WGCNA::corPvalueStudent(modtraitcor, nSamples) modtraitsymbol <- pval2symbol(modtraitpvalue) modtrait_long <- reshape2::melt(modtraitcor) pvals_long <- reshape2::melt(modtraitpvalue) combined_long <- merge(modtrait_long, pvals_long, by=c("Var1", "Var2")) colnames(combined_long) <- c("ME", "trait", "cor", "pvalue") combined_long$ME <- as.character(combined_long$ME) combined_long$trait <- as.character(combined_long$trait) textMatrix <- paste(signif(modtraitcor, 2), modtraitsymbol, sep = "") dim(textMatrix) <- dim(modtraitcor) if(transpose) { modtraitcor <- t(modtraitcor) textMatrix <- t(textMatrix) yLabels <- colnames(trait) xLabels <- names(MEs) xSymbols <- names(MEs) ySymbols <- NULL xColorLabels <- TRUE par(mar = c(5, 5, 1, 1)) } else { par(mar = c(6, 8.5, 3, 3)) yLabels <- names(MEs) xLabels <- colnames(trait) xColorLabels <- FALSE xSymbols <- NULL ySymbols <- names(MEs) } cols <- colorRampPalette(rev(RColorBrewer::brewer.pal(10, palette)))(100) hm <- WGCNA::labeledHeatmap(Matrix = modtraitcor, yLabels = yLabels, xLabels = xLabels, ySymbols = ySymbols, xSymbols = xSymbols, colorLabels=FALSE, colors = cols, textMatrix = textMatrix, setStdMargins = FALSE, cex.text = cex.text, cex.lab.x = cex.lab.x, cex.lab.y = cex.lab.y, zlim = c(-1,1), cex.main=1, main = paste("Module-trait relationships")) return(combined_long) } #' Calculate gene significance for a given group of genes #' #' @param exp A gene expression data frame with genes in row names and #' samples in column names or a `SummarizedExperiment` object. #' @param metadata A data frame containing sample names in row names and #' sample annotation in the first column. Ignored if `exp` is #' a `SummarizedExperiment` object, since the function will extract colData. #' @param genes Character vector of genes to be correlated with traits. #' If not given, all genes in `exp` will be considered. #' @param alpha Significance level. Default is 0.05. #' @param min_cor Minimum correlation coefficient. Default is 0.2. #' @param use_abs Logical indicating whether to filter by correlation using #' absolute value or not. If TRUE, a \code{min_cor} of say 0.2 would keep all #' correlations above 0.2 and below -0.2. Default is TRUE. #' @param palette RColorBrewer's color palette to use. Default is "RdYlBu", #' a palette ranging from blue to red. #' @param show_rownames Logical indicating whether to show row names or not. #' Default is FALSE. #' @param continuous_trait Logical indicating if trait is a continuous variable. #' Default is FALSE. #' #' @return A heatmap of gene significance (GS) and a list containing: \itemize{ #' \item{filtered_corandp}{Filtered matrix of correlation and p-values} #' \item{raw_GS}{Raw matrix of gene significances} #' } #' @seealso #' \code{\link[WGCNA]{corPvalueStudent}} #' \code{\link[RColorBrewer]{RColorBrewer}} #' @rdname gene_significance #' @export #' @importFrom reshape2 melt #' @importFrom WGCNA corPvalueStudent #' @importFrom ComplexHeatmap pheatmap #' @importFrom RColorBrewer brewer.pal #' @importFrom SummarizedExperiment colData #' @examples #' data(filt.se) #' gs <- gene_significance(filt.se) gene_significance <- function(exp, metadata, genes=NULL, alpha = 0.05, min_cor = 0.2, use_abs = TRUE, palette="RdYlBu", show_rownames=FALSE, continuous_trait=FALSE) { if(is(exp, "SummarizedExperiment")) { metadata <- as.data.frame(SummarizedExperiment::colData(exp)) } exp <- handleSE(exp) final_exp <- exp[, rownames(metadata)] if(!is.null(genes)) { final_exp <- final_exp[genes, , drop=FALSE] } trait <- handle_trait_type(metadata, continuous_trait) GS <- cor(as.matrix(t(final_exp)), trait, use = "p") # Filter by correlation coefficient and p-value melt.cor <- reshape2::melt(GS) nSamples <- ncol(final_exp) GS.pvalue <- WGCNA::corPvalueStudent(GS, nSamples) melt.pvalue <- reshape2::melt(GS.pvalue) corandp <- merge(melt.cor, melt.pvalue, by = c("Var1", "Var2")) corandp$Var1 <- as.character(corandp$Var1) corandp$Var2 <- as.character(corandp$Var2) colnames(corandp) <- c("gene", "trait", "cor", "pvalue") if(use_abs) { corandp <- corandp[corandp$pvalue < alpha & abs(corandp$cor) > min_cor, ] } else { corandp <- corandp[corandp$pvalue < alpha & corandp$cor > min_cor, ] } cols <- colorRampPalette(rev(RColorBrewer::brewer.pal(10, palette)))(100) p <- ComplexHeatmap::pheatmap(as.matrix(GS), border_color = NA, color=cols, show_rownames=show_rownames, main="Gene-trait correlations") resultlist <- list(filtered_corandp = corandp, raw_GS = GS) return(resultlist) } #' Get GCN hubs #' #' @param exp A gene expression data frame with genes in row names and #' samples in column names or a `SummarizedExperiment` object. #' @param net List object returned by \code{exp2gcn}. #' #' @return Data frame containing gene IDs, modules and #' intramodular connectivity of all hubs. #' @author Fabricio Almeida-Silva #' @seealso #' \code{\link[WGCNA]{signedKME}} #' @rdname get_hubs_gcn #' @export #' @importFrom WGCNA signedKME #' @examples #' data(filt.se) #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") #' hubs <- get_hubs_gcn(filt.se, gcn) get_hubs_gcn <- function(exp, net) { exp <- handleSE(exp) cor_method <- net$params$cor_method genes_modules <- net$genes_and_modules MEs <- net$MEs kIN <- net$kIN[, 2, drop=FALSE] # Add kIN info genes_modulesk <- merge(genes_modules, kIN, by.x="Genes", by.y="row.names") # Calculate kME if(cor_method == "spearman") { MM <- WGCNA::signedKME(t(exp), MEs, outputColumnName="", corOptions = "use = 'p', method = 'spearman'") } else if(cor_method == "pearson") { MM <- WGCNA::signedKME(t(exp), MEs, outputColumnName="") } else if(cor_method == "biweight") { MM <- WGCNA::signedKME(t(exp), MEs, corFcn="bicor", outputColumnName="", corOptions = "maxPOutliers = 0.1") } else { stop("Invalid correlation method. Pick one of 'spearman' or 'pearson'.") } # Add kME info kme_info <- merge(genes_modulesk, MM, by.x="Genes", by.y="row.names") # Keep top 10% genes with the highest degree for each module genes_mod_list <- split(kme_info, kme_info[,2]) genes_mod_list$grey <- NULL top_10 <- lapply(genes_mod_list, function(x) { indices <- round(nrow(x) * 0.1) return(x[order(x$kWithin, decreasing=TRUE), ][seq_len(indices), ]) }) # Pick genes from the top 10% degree with kME above 0.8 hubs <- Reduce(rbind, lapply(top_10, function(x) { cols <- c("Genes", "Modules", "kWithin", unique(x$Modules)) h <- x[, cols] h <- h[h[,4] > 0.8, c(1,2,3)] })) rownames(hubs) <- seq_len(nrow(hubs)) colnames(hubs) <- c("Gene", "Module", "kWithin") return(hubs) } #' Helper function to perform Fisher's Exact Test with parallel computing #' #' @param genes Character vector containing genes on which enrichment will be tested. #' @param reference Character vector containing genes to be used as background. #' @param genesets List of functional annotation categories. #' (e.g., GO, pathway, etc.) with their associated genes. #' @param adj Multiple testing correction method. #' @return Results of Fisher's Exact Test in a data frame with TermID, #' number of associated genes, number of genes in reference set, #' P-value and adjusted P-value. #' @importFrom stats p.adjust fisher.test #' @importFrom BiocParallel bplapply #' @noRd par_enrich <- function(genes, reference, genesets, adj = "BH") { reference <- reference[!reference %in% genes] tab <- BiocParallel::bplapply(seq_along(genesets), function(i) { RinSet <- sum(reference %in% genesets[[i]]) RninSet <- length(reference) - RinSet GinSet <- sum(genes %in% genesets[[i]]) GninSet <- length(genes) - GinSet fmat <- matrix(c(GinSet, RinSet, GninSet, RninSet), nrow = 2, ncol = 2, byrow = FALSE) colnames(fmat) <- c("inSet", "ninSet") rownames(fmat) <- c("genes", "reference") fish <- stats::fisher.test(fmat, alternative = "greater") pval <- fish$p.value inSet <- RinSet + GinSet res <- c(GinSet, inSet, pval) return(res) }) rtab <- do.call(rbind, tab) rtab <- data.frame(as.vector(names(genesets)), rtab) rtab <- rtab[order(rtab[, 4]), ] colnames(rtab) <- c("TermID", "genes", "all", "pval") padj <- stats::p.adjust(rtab$pval, method = adj) tab.out <- data.frame(rtab, padj) return(tab.out) } #' Perform enrichment analysis for a set of genes #' #' @param genes Character vector containing genes for overrepresentation analysis. #' @param background_genes Character vector of genes to be used as background #' for the Fisher's Exact Test. #' @param annotation Annotation data frame with genes in the first column and #' functional annotation in the other columns. This data frame can be exported #' from Biomart or similar databases. #' @param column Column or columns of \code{annotation} to be used for enrichment. #' Both character or numeric values with column indices can be used. #' If users want to supply more than one column, input a character or #' numeric vector. Default: all columns from \code{annotation}. #' @param correction Multiple testing correction method. One of "holm", #' "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" or "none". #' Default is "BH". #' @param p P-value threshold. P-values below this threshold will be #' considered significant. Default is 0.05. #' @param bp_param BiocParallel back-end to be used. #' Default: BiocParallel::SerialParam() #' @return Data frame containing significant terms, p-values and associated genes. #' @author Fabricio Almeida-Silva #' @rdname enrichment_analysis #' @export #' @importFrom BiocParallel bplapply #' @examples #' \donttest{ #' data(filt.se) #' data(zma.interpro) #' genes <- rownames(filt.se)[1:50] #' background_genes <- rownames(filt.se) #' annotation <- zma.interpro #' # Using p = 1 to show all results #' enrich <- enrichment_analysis(genes, background_genes, annotation, p = 1) #' } enrichment_analysis <- function(genes, background_genes, annotation, column = NULL, correction = "BH", p = 0.05, bp_param = BiocParallel::SerialParam()) { ## Get a dataframe of expressed genes and their annotations gene_column <- colnames(annotation)[1] # Handle missing values annotation[is.na(annotation)] <- "Unannotated" if(is.null(column)) { #all columns? gene_annotation <- annotation } else { if(is.numeric(column)) { # Input is column indices gene_annotation <- annotation[, c(1, column)] } else { #Input is column names gene_annotation <- annotation[, c(paste(gene_column), column)] } } background <- gene_annotation[gene_annotation[,1] %in% background_genes, ] if(ncol(background) == 2) { #only one annotation category # Named list of expressed genes and their annotations annotation_list <- split(background[,1], background[,2]) # Perform the enrichment analysis enrich.table <- par_enrich(genes, background_genes, annotation_list, adj = correction) signif_enrich <- as.data.frame(enrich.table[enrich.table$padj <= p, ], stringsAsFactors = FALSE) if(nrow(signif_enrich) > 0) { signif_terms_genes <- annotation_list[as.character(signif_enrich[,1])] signif_terms_genes <- lapply(signif_terms_genes, function(x) unique(x[x %in% genes])) # Save final result with a column containing gene IDs of associated genes signif_enrich$GeneID <- unlist(lapply(signif_terms_genes, paste, collapse = ",")) df_signif_enrich <- signif_enrich } else { df_signif_enrich <- NULL } } else { # more than 1 annotation category annotation_list <- lapply(seq_along(background)[-1], function(x) { return(split(background[,1], background[,x])) }) # Remove unannotated genes annotation_list_final <- lapply(annotation_list, function(x) { return(x[names(x) != "Unannotated"]) }) # Perform the enrichment analysis signif_enrich <- BiocParallel::bplapply(annotation_list_final, function(x) { enrich.table <- par_enrich(genes, background_genes, x, adj=correction) sig_enrich <- as.data.frame(enrich.table[enrich.table$padj < p, ], stringsAsFactors=FALSE) return(sig_enrich) }, BPPARAM = bp_param) # Remove empty data frames from list signif_enrich <- signif_enrich[vapply(signif_enrich, nrow, numeric(1)) > 0] if(length(signif_enrich) > 0) { # Create a data frame containing annotations and the annotation class annot_correspondence <- Reduce(rbind, lapply(seq_along(background)[-1], function(x) { annot_correspondence <- data.frame(TermID = background[,x], Category = names(background)[x], stringsAsFactors = FALSE) annot_correspondence <- annot_correspondence[!duplicated(annot_correspondence[,c(1,2)]),] return(annot_correspondence) })) # Add column containing the annotation class list_signif_enrich <- lapply(signif_enrich, function(x) { merge(x, annot_correspondence) }) # Reduce list of data frames to a single data frame df_signif_enrich <- Reduce(rbind, list_signif_enrich) # Get genes associated to each term annotation_list_concat <- unlist(annotation_list, recursive = FALSE) # create list from list of lists signif_terms_genes <- annotation_list_concat[as.character(df_signif_enrich[,1])] signif_terms_genes <- lapply(signif_terms_genes, function(x) unique(x[x %in% genes])) # Get final table with enriched terms and a column containing the associated genes df_signif_enrich$GeneID <- unlist(lapply(signif_terms_genes, paste, collapse = ",")) } else { df_signif_enrich <- NULL } } return(df_signif_enrich) } #' Perform enrichment analysis for coexpression network modules #' #' @param net List object returned by \code{exp2gcn}. #' @param background_genes Character vector of genes to be used as background #' for the Fisher's Exact Test. #' @param annotation Annotation data frame with genes in the first column #' and functional annotation in the other columns. This data frame can #' be exported from Biomart or similar databases. #' @param column Column or columns of \code{annotation} to be used for enrichment. #' Both character or numeric values with column indices can be used. #' If users want to supply more than one column, input a character or #' numeric vector. Default: all columns from \code{annotation}. #' @param correction Multiple testing correction method. One of "holm", #' "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" or "none". #' Default is "BH". #' @param p P-value threshold. P-values below this threshold will be considered #' significant. Default is 0.05. #' @param bp_param BiocParallel back-end to be used. #' Default: BiocParallel::SerialParam() #' @return A data frame containing enriched terms, p-values, gene IDs #' and module names. #' @author Fabricio Almeida-Silva #' @rdname module_enrichment #' @importFrom BiocParallel bplapply SerialParam #' @export #' @examples #' \donttest{ #' data(filt.se) #' data(zma.interpro) #' background <- rownames(filt.se) #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") #' mod_enrich <- module_enrichment(gcn, background, zma.interpro, p=1) #' } module_enrichment <- function(net=NULL, background_genes, annotation, column = NULL, correction = "BH", p = 0.05, bp_param = BiocParallel::SerialParam()) { # Divide modules in different data frames of a list genes.modules <- net$genes_and_modules list.gmodules <- split(genes.modules, genes.modules$Modules) list.gmodules <- list.gmodules[names(list.gmodules) != "grey"] enrichment_allmodules <- BiocParallel::bplapply(seq_along(list.gmodules), function(x) { message("Enrichment analysis for module ", names(list.gmodules)[x], "...") l <- enrichment_analysis(genes = as.character(list.gmodules[[x]][,1]), background_genes = background_genes, annotation = annotation, column = column, correction = correction, p = p) return(l) }, BPPARAM = bp_param) names(enrichment_allmodules) <- names(list.gmodules) # Remove NULL elements from list enrichment_filtered <- enrichment_allmodules[!vapply(enrichment_allmodules, is.null, logical(1))] if(length(enrichment_filtered) == 0) { message("None of the modules had significant enrichment.") enrichment_final <- NULL } else { # Add module name to each data frame enrichment_modnames <- lapply(seq_along(enrichment_filtered), function(x) { return(cbind(enrichment_filtered[[x]], Module=names(enrichment_filtered)[x])) }) # Reduce list of data frames to a single data frame enrichment_final <- Reduce(rbind, enrichment_modnames) } return(enrichment_final) } #' Get 1st-order neighbors of a given gene or group of genes #' #' @param genes Character vector containing genes from which #' direct neighbors will be extracted. #' @param net List object returned by \code{exp2gcn}. #' @param cor_threshold Correlation threshold to filter connections. #' As a weighted network is a fully connected graph, a cutoff must be selected. #' Default is 0.7. #' #' @return List containing 1st-order neighbors for each input gene. #' @seealso \code{exp2gcn} \code{SFT_fit} #' @author Fabricio Almeida-Silva #' @export #' @rdname get_neighbors #' @examples #' data(filt.se) #' genes <- rownames(filt.se)[1:10] #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") #' neighbors <- get_neighbors(genes, gcn) get_neighbors <- function(genes, net, cor_threshold = 0.7) { net_type <- net$params$net_type power <- net$params$SFTpower edges <- net$correlation_matrix edges <- cormat_to_edgelist(edges) colnames(edges) <- c("Gene1", "Gene2", "Weight") filt_edges <- edges[edges$Gene1 %in% genes | edges$Gene2 %in% genes, ] if(net_type == "signed") { filt_edges <- filt_edges[abs(filt_edges$Weight) >= cor_threshold, ] } else { filt_edges <- filt_edges[filt_edges$Weight >= cor_threshold, ] } result_list <- lapply(genes, function(x) { y <- filt_edges[rowSums(filt_edges == x) > 0, ] y <- c(as.character(y$Gene1), as.character(y$Gene2)) y <- unique(y[y != x]) return(y) }) names(result_list) <- genes return(result_list) } #' Get edge list from an adjacency matrix for a group of genes #' #' @param net List object returned by \code{exp2gcn}. #' @param genes Character vector containing a subset of genes from which #' edges will be extracted. It can be ignored if the user wants to extract #' an edge list for a given module instead of individual genes. #' @param module Character with module name from which edges will be extracted. #' To include 2 or more modules, input the names in a character vector. #' @param filter Logical indicating whether to filter the edge list or not. #' @param method Method to filter spurious correlations. One of "Zscore", #' "optimalSFT", "pvalue" or "min_cor". See details for more information #' on the methods. Default: 'optimalSFT' #' @param r_optimal_test Numeric vector with the correlation thresholds #' to be tested for optimal scale-free topology fit. Only valid #' if \code{method} equals "optimalSFT". Default: seq(0.4, 0.9, by = 0.1) #' @param Zcutoff Minimum Z-score threshold. Only valid if #' \code{method} equals "Zscore". Default: 1.96 #' @param pvalue_cutoff Maximum P-value threshold. Only valid #' if \code{method} equals "pvalue". Default: 0.05 #' @param rcutoff Minimum correlation threshold. Only valid #' if \code{method} equals "min_cor". Default: 0.7 #' @param nSamples Number of samples in the data set from which #' the correlation matrix was calculated. Only required #' if \code{method} equals "pvalue". #' @param check_SFT Logical indicating whether to test if the resulting network #' is close to a scale-free topology or not. Default: FALSE. #' @param bp_param BiocParallel back-end to be used. #' Default: BiocParallel::SerialParam() #' #' @return Data frame with edge list for the input genes. #' @details The default method ("optimalSFT") will create several different #' edge lists by filtering the original correlation matrix by the thresholds #' specified in \code{r_optimal_test}. Then, it will calculate a scale-free #' topology fit index for each of the possible networks and return the network #' that best fits the scale-free topology. #' The method "Zscore" will apply a Fisher Z-transformation for the correlation #' coefficients and remove the Z-scores below the threshold specified #' in \code{Zcutoff}. #' The method "pvalue" will calculate Student asymptotic p-value for the #' correlations and remove correlations whose p-values are above the threshold #' specified in \code{pvalue_cutoff}. #' The method "min_cor" will remove correlations below the minimum correlation #' threshold specified in \code{rcutoff}. #' @seealso #' \code{\link[WGCNA]{scaleFreeFitIndex}},\code{\link[WGCNA]{corPvalueStudent}} #' \code{\link[igraph]{fit_power_law}} #' @seealso \code{SFT_fit} #' @seealso \code{exp2gcn}. #' @author Fabricio Almeida-Silva #' @rdname get_edge_list #' @export #' @importFrom WGCNA scaleFreeFitIndex corPvalueStudent #' @importFrom ggpubr ggline #' @importFrom ggplot2 theme element_text #' @importFrom BiocParallel bplapply SerialParam #' @examples #' data(filt.se) #' gcn <- exp2gcn(filt.se, SFTpower = 18, cor_method = "pearson") #' genes <- rownames(filt.se)[1:50] #' edges <- get_edge_list(gcn, genes=genes, filter = FALSE) get_edge_list <- function(net, genes = NULL, module = NULL, filter = FALSE, method = "optimalSFT", r_optimal_test = seq(0.4, 0.9, by=0.1), Zcutoff = 1.96, pvalue_cutoff = 0.05, rcutoff = 0.7, nSamples = NULL, check_SFT = FALSE, bp_param = BiocParallel::SerialParam()) { # Define objects containing correlation matrix and data frame of genes and modules cor_matrix <- net$correlation_matrix # Should we extract genes in a module? if(!is.null(module)) { genes_modules <- net$genes_and_modules keep <- genes_modules[genes_modules$Modules %in% module, 1] cor_matrix <- cor_matrix[keep, keep] } # Should we extract a user-defined gene set? if(!is.null(genes)) { cor_matrix <- cor_matrix[genes, genes] } # Should we filter the matrix? if(filter) { # Create edge list from correlation matrix edges <- cormat_to_edgelist(cor_matrix) colnames(edges) <- c("Gene1", "Gene2", "Weight") if(method == "Zscore") { # Apply Fisher-Z transformation to correlation values edgesZ <- edges edgesZ$Weight <- 0.5 * log((1+edges$Weight) / (1-edges$Weight)) edgelist <- edgesZ[edgesZ$Weight >= Zcutoff, ] } else if(method == "optimalSFT") { cutoff <- r_optimal_test # Create list of 2 elements: edge lists, each with a different correlation threshold, and degree list_mat <- replicate(length(cutoff), cor_matrix, simplify = FALSE) list_cormat_filtered <- BiocParallel::bplapply(seq_along(list_mat), function(x) { # Convert r values below threshold to NA and set diagonals to 0 matrix <- list_mat[[x]] matrix[matrix < cutoff[x] ] <- NA diag(matrix) <- 0 # Calculate degree degree <- rowSums(matrix, na.rm=TRUE) # Convert lower triangle and diagonals to NA matrix[lower.tri(matrix, diag=TRUE)] <- NA # Convert symmetrical matrix to edge list (Gene1, Gene2, Weight) matrix <- na.omit(data.frame(as.table(matrix), stringsAsFactors=FALSE)) result <- list(matrix=matrix, degree=degree) return(result) }, BPPARAM = bp_param) degree_list <- lapply(list_cormat_filtered, function(x) return(x[[2]])) # Calculate scale-free topology sft.rsquared <- unlist(lapply(degree_list, function(x) WGCNA::scaleFreeFitIndex(x)$Rsquared.SFT)) max.index <- which.max(sft.rsquared) # Plot scale-free topology fit for r values plot.data <- data.frame(x=cutoff, y=sft.rsquared, stringsAsFactors = FALSE) plot <- ggpubr::ggline(plot.data, x = "x", y = "y", size=2, color="firebrick", xlab = "Correlation (r) values", ylab = expression(paste("Scale-free topology fit - ", R^{2})), title = "Scale-free topology fit for given r values", font.title = c(13, "bold")) + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) print(plot) optimalr <- cutoff[max.index] message("The correlation threshold that best fits the scale-free topology is ", optimalr) edgelist <- list_cormat_filtered[[max.index]][[1]] } else if(method == "pvalue") { if(is.null(nSamples)) { stop("Please, specify the number of samples used to calculate the correlation matrix") } # Calculate Student asymptotic p-value for given correlations cor.pvalue <- WGCNA::corPvalueStudent(edges$Weight, nSamples) # Create a data frame of correlations and p-values corandp <- edges corandp$pvalue <- cor.pvalue # Create a final edge list containing only significant correlations edgelist <- corandp[corandp$pvalue < pvalue_cutoff, c(1,2,3)] } else if(method == "min_cor") { edgelist <- edges[edges$Weight >= rcutoff, ] } else{ stop("Please, specify a valid filtering method.") } } else { # Create edge list from correlation matrix without filtering edgelist <- cormat_to_edgelist(cor_matrix) colnames(edgelist) <- c("Gene1", "Gene2", "Weight") } # Check scale-free topology fit if(check_SFT) { test <- check_SFT(edgelist, net_type="gcn") } return(edgelist) }