# FUNCTION # - arguments: collection,2 genesets (1 per phenotype), top correlated genesets # - returns: data.frame which contains the pairs between the phenotype_genesets # and their top corralated genesets after filtering. Results are sorted in # descending correlation order # NOTES # - Collection: only accepting pathprint,MSigDB_H,MSigDB_C2_CP,MSigDB_C5_GO_BP # - Phenotype_genesets: must belong to the collection and be spelled exactly # right! # - Do we need a search feature that allows the user to search for genesets? # - If either of the above does not hold, the function specifies which geneset/s # do not belong to the collection and also states their phenotype # - The initial data.frame is filtered by minimum absolute correlation and # maximum p-value utils::globalVariables(c("pathprint.Hs.gs","pheatmap", "pathCor_pathprint_v1.2.3_dframe", "pathCor_CPv5.1_dframe", "gobp_gs_v5.1", "PathCor", "p.value", "Pathway.A", "Pathway.B")) #' Discover correlation relationships among multiple pathways/gene sets #' identified by GSEA (gene set enrichment analysis). All the input #' pathways/gene sets should come from the same collection. MSigDB H hallmark #' gene sets, MSigDB C2 Canonical pathways, MSigDB C5 GO BP gene sets, #' and Pathprint are treated as four separate collections. #' #' @param collection pathway's collection #' @param phenotype_0_genesets genesets/pathways of the first group of pathways #' @param phenotype_1_genesets genesets/pathways of the second group of pathways #' @param adj_overlap whether the correlation coefficients are adjusted for gene #' overlap #' @param top most correlated genesets/pathways #' @param min_abs_corr minimum absolute correlation #' @param max_pval maximum p-value #' #' @return pcxn object #' @export #' #' @examples #' \dontrun{ #' pcxn_analyze("pathprint",c("ABC transporters (KEGG)", #' "ACE Inhibitor Pathway (Wikipathways)","AR down reg. targets (Netpath)"), #' adj_overlap = TRUE, c("DNA Repair (Reactome)"), 10, 0.05, 0.05) #' } pcxn_analyze <- function(collection = c("pathprint","MSigDB_H","MSigDB_C2_CP", "MSigDB_C5_GO_BP"), phenotype_0_genesets = NULL,phenotype_1_genesets = NULL, adj_overlap = FALSE, top = 10, min_abs_corr = 0.05, max_pval = 0.05 ) { if(missing(phenotype_0_genesets) && missing(phenotype_1_genesets)) stop( "Please insert phenotype_0_genesets and/or phenotype_1_genesets arguments") pathprint.Hs.gs.rda <- NULL pathprint.Hs.gs <- NULL h_gs_v5.1 <- NULL cp_gs_v5.1 <- NULL gobp_gs_v5.1.rda <- NULL pathCor_Hv5.1_dframe <- NULL pathCor_GOBPv5.1_dframe <- NULL pathCor_CPv5.1_dframe.rda <- NULL pathCor_Hv5.1_unadjusted_dframe <- NULL pathCor_GOBPv5.1_unadjusted_dframe <- NULL pathCor_CPv5.1_unadjusted_dframe.rda <- NULL # loading the datasets, when the package is complete use data(...) correct_genesets_flag_1 <- TRUE correct_genesets_flag_2 <- TRUE acceptable_collections = c("pathprint","MSigDB_H","MSigDB_C2_CP", "MSigDB_C5_GO_BP") if ( collection == "pathprint" ) { if ( adj_overlap ) { data(pathCor_pathprint_v1.2.3_dframe, envir = environment()) matrix <- pathCor_pathprint_v1.2.3_dframe } else { data(pathCor_pathprint_v1.2.3_unadjusted_dframe, envir = environment()) matrix <- pathCor_pathprint_v1.2.3_unadjusted_dframe } data(pathprint.Hs.gs, envir = environment()) names <- names(pathprint.Hs.gs) } if ( collection == "MSigDB_H" ) { if ( adj_overlap ) { data(pathCor_Hv5.1_dframe, envir = environment()) matrix <- pathCor_Hv5.1_dframe } else { data(pathCor_Hv5.1_unadjusted_dframe, envir = environment()) matrix <- pathCor_Hv5.1_unadjusted_dframe } data(h_gs_v5.1, envir = environment()) names <- names(h_gs_v5.1) } if ( collection == "MSigDB_C2_CP" ) { if ( adj_overlap ) { data(pathCor_CPv5.1_dframe, envir = environment()) matrix <- pathCor_CPv5.1_dframe } else { data(pathCor_CPv5.1_unadjusted_dframe, envir = environment()) matrix <- pathCor_CPv5.1_unadjusted_dframe } data(cp_gs_v5.1, envir = environment()) names <- names(cp_gs_v5.1) } if ( collection == "MSigDB_C5_GO_BP" ) { if ( adj_overlap ) { data(pathCor_GOBPv5.1_dframe, envir = environment()) matrix <- pathCor_GOBPv5.1_dframe } else { data(pathCor_GOBPv5.1_unadjusted_dframe, envir = environment()) matrix <- pathCor_GOBPv5.1_unadjusted_dframe } data(gobp_gs_v5.1.rda, envir = environment()) names <- names(gobp_gs_v5.1) } # Checking if phenotype_genesets are correctly spelled commons_0 <- intersect(phenotype_0_genesets, names) difs_0 <- c() ph_0 <- c() if ( length(commons_0) != length(phenotype_0_genesets)) { correct_genesets_flag_1 <- FALSE difs_0 <- setdiff(phenotype_0_genesets,commons_0) ph_0 <- rep(0, length(difs_0)) } commons_1 <- intersect(phenotype_1_genesets, names) difs_1 <- c() ph_1 <- c() if ( length(commons_1) != length(phenotype_1_genesets)) { correct_genesets_flag_2 <- FALSE difs_1 <- setdiff(phenotype_1_genesets,commons_1) ph_1 <- rep(1, length(difs_1)) } difs <- c(difs_0,difs_1) phs <- c(ph_0,ph_1) # stop if wrong collection inserted or either phenotype genesets do not # match the collection if(!(collection %in% acceptable_collections)) stop( "Invalid collection name, please choose between: pathprint, MSigDB_H, MSigDB_C2_CP or MSigDB_C5_GO_BP") if((correct_genesets_flag_1 == FALSE) || (correct_genesets_flag_2 == FALSE)) stop( paste("Phenotype ",phs," geneset \"",difs, "\" does not belong to the ",collection , " collection\n ", sep = "")) # find un-used genesets and create the placeholder data.frame # for their correlation scores used_genesets <- c(phenotype_0_genesets,phenotype_1_genesets) unused_genesets <- setdiff(names,used_genesets) candidate_genesets <- data.frame(unused_genesets) candidate_genesets$score <- rep(0,nrow(candidate_genesets)) # step 1: Filtering on absolute correlation (>=) and p-value (<=) step1_matrix <- subset(matrix, abs(PathCor) >= min_abs_corr & p.value <= max_pval) # generate score for each un-used geneset and find the top correlated for(i in 1:length(unused_genesets)) { temp_cor <- subset(step1_matrix,(Pathway.A == unused_genesets[i] | Pathway.B == unused_genesets[i])) temp_cor2 <- subset(temp_cor, (Pathway.A %in% used_genesets | Pathway.B %in% used_genesets)) correlation_vector <- temp_cor2[3] if (dim(correlation_vector)[1] != 0) { # transpose vector correlation_vector <- unname(t(correlation_vector)) max_position <- which.max(abs(as.numeric( unlist(correlation_vector)))) max <- correlation_vector[max_position] candidate_genesets[i,2] <- max } } candidate_genesets<- candidate_genesets[ with(candidate_genesets, order(-abs(candidate_genesets$score))),] top_correlated_genesets <- candidate_genesets[,"unused_genesets"][1:top] # create matrix with geneset groups info <- list(phenotype_0_genesets, phenotype_1_genesets, as.vector(top_correlated_genesets)) names(info) <- c("phenotype_0_genesets", "phenotype_1_genesets", "top_correlated_genesets") # step 2: Find the possible pairs between phenotype_0_genesets, # phenotype_1_genesets and correlated genesets. interesting_genesets <- c(as.list(used_genesets), as.vector(top_correlated_genesets)) step2_matrix <- subset(step1_matrix, Pathway.A %in% interesting_genesets & Pathway.B %in% interesting_genesets) if(length(phenotype_1_genesets) == 0) print(paste("Successful analyzing: Based on phenotype 0 [", paste(phenotype_0_genesets, collapse=', ' ),"] and ", top, " top correlated genesets, ", dim(step2_matrix)[1], " correlation pairs were found.", sep ="")) else print(paste("Successful analyzing: Based on phenotype 0 [", paste(phenotype_0_genesets, collapse=', ' ), "], phenotype 1 [", paste(phenotype_1_genesets, collapse=', ' ),"] and ", top, " top correlated genesets, ", dim(step2_matrix)[1], " correlation pairs were found.", sep ="")) po = new("pcxn",type = "pcxn_analyze", data = as.matrix(step2_matrix), geneset_groups = as.list(info)) return(po) }