#' An S4 class to represent enrichment #' #' @slot ID factor #' @slot Term factor #' @slot geneList factor #' @slot ncGeneList factor #' @slot pvalue factor #' @slot pAdj factor #' @slot GeneRatio factor #' @slot BckRatio factor #' #' #' #' @export setClass( Class = "NoRCE", slots = c( ID = "character", Term = "character", geneList = "list", pvalue = "numeric", pAdj = "numeric", GeneRatio = "character", BckRatio = "character", ncGeneList = "list" ) ) #' Perform enrichment analysis of the given genes #' #' @param genes Set of input genes. Supported format HUGO. #' @param org_assembly Genome assembly of interest for the analysis. Possible #' assemblies are "mm10" for mouse, "dre10" for zebrafish, "rn6" for rat, #' "dm6" for fruit fly, "ce11" for worm, "sc3" for yeast, "hg19" and #' "hg38" for human #' @param GOtype Hierarchical category of the GO ontology. Possible values are #' "BP"(default), "CC", "MF". #' @param pCut Threshold value for the pvalue. Default value is 0.05 #' @param pAdjCut Cutoff value for the adjusted p-values using one of given #' method. Default value is 0.05. #' @param pAdjust Methods of the adjusted p-values. Possible methods are #' "bonferroni", "holm", "BH"(default) #' @param min Minimum number of gene that are required for enrichment. By #' default, it is set to 5 #' @param enrichTest Types of enrichment methods to perform enrichment #' analysis. Possible values are "hyper"(default), "binom", "fisher", #' "chi". #' @param backG The set of genes that tested against to the input #' (background gene) #' @param backGType Type of the background gene. If miRNA gene set is used for #' background gene, backGType should be set to the 'mirna' #' #' @return GO enrichment results #' #' @examples #' subsetGene <- breastmRNA[1:30,] #' breastEnr <- goEnrichment(genes = subsetGene, #' org_assembly = 'hg19', #' GOtype = 'MF', #' min = 2) #' #' @importFrom stats chisq.test cor cor.test fisher.test na.omit p.adjust #' @importFrom stats pbinom phyper reorder setNames var #' #' @export goEnrichment <- function(genes, org_assembly = c("hg19", "hg38", "mm10", "dre10", "rn6", "dm6", "ce11", "sc3"), GOtype = c("BP", "CC", "MF"), pCut = 0.05, pAdjCut = 0.05, pAdjust = c("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"), min = 5, backG = '', backGType = 'pc_gene', enrichTest = c("hyper", "binom", "fisher", "chi")) { if (missing(org_assembly)) { message("Genome assembly version is missing.") assembly(org_assembly) } if (missing(genes)) { message("Genes are missing. Expected input: FOXP2 SOX4 HOXC6") } # annot <- unique(annGO(genes, GOtype, org_assembly)) goData <- annGO(genes, GOtype, org_assembly) annot <- goData[[2]] uniqueGO <- annot$GOID[!duplicated(annot$GOID)] gofreq <- as.data.frame(table(annot$GOID)) notGene <- getBackGenes( backgroundGene = backG, all = goData[[1]], GOtype = GOtype, gofreq = gofreq, org_assembly = org_assembly, type = backGType ) if (nrow(gofreq) > nrow(notGene)) { gofreq <- gofreq[gofreq$Var1 %in% notGene$Var1,] } freq <- merge(gofreq, notGene, by = "Var1") found <- freq$Freq.x ifelse(backG == '', geneSize <- length(unique(goData[[1]]$Gene)), geneSize <- length(unique(backG))) M <- freq$Freq.y n <- rep(length(unique(annot$Gene)), length(M)) # if (enrichTest == "binom") { # pvalues <- 2 * (1 - pbinom(found, n, pCut)) # } # if (enrichTest == "fisher") { # pvalues <- # fisher.test(matrix(c( # found, (M - found), (n - found), (geneSize - M - n + found) # ), 2, 2), alternative = 'greater')$p.value # } # if (enrichTest == "chi") { # pvalues <- # chisq.test(matrix(c( # found, (M - found), (n - found), (geneSize - M - n + found) # )))$p.value # } # else { # pvalues <- phyper(found - 1, M, geneSize - M, n, lower.tail = FALSE) # } ifelse( enrichTest == "binom", pvalues <- 2 * (1 - pbinom(found, n, pCut)), ifelse( enrichTest == "fisher", pvalues <- fisher.test(matrix(c( found, (M - found), (n - found), (geneSize - M - n + found) ), 2, 2), alternative = 'greater')$p.value, ifelse( enrichTest == "chi", pvalues <- chisq.test(matrix(c( found, (M - found), (n - found), (geneSize - M - n + found) )))$p.value, pvalues <- phyper(found - 1, M, geneSize - M, n, lower.tail = FALSE) ) ) ) pAdjust1 <- p.adjust(pvalues, method = pAdjust) GeneRatio <- apply(data.frame(found, n), 1, function(x) file.path(x[1], x[2])) BgRatio <- apply(data.frame(M, geneSize), 1, function(x) file.path(x[1], x[2])) enrich <- which (pvalues <= pCut & pAdjust1 <= pAdjCut & found >= min) goT <- as.character(freq$Var1[enrich]) gratio <- GeneRatio[enrich] bgratio <- BgRatio[enrich] padj <- pAdjust1[enrich] pval <- pvalues[enrich] r <- annot[annot$GOID %in% goT, seq_len(2)] enric <- list() for (i in seq_along(goT)) { if (length(which(goT[i] == r$GOID)) > 0) { enric <- c(enric, setNames(list(as.character(r[which(goT[i] == r$GOID), ]$Gene)), paste(goT[i]))) } } goTe <- as.character(annot[match(goT, annot[, 'GOID']),]$GOTerm) return( methods::new( "NoRCE", ID = goT, Term = goTe, geneList = enric, pvalue = pval, pAdj = padj, GeneRatio = gratio, BckRatio = bgratio ) ) } getBackGenes <- function(backgroundGene, all, GOtype = c("BP", "CC", "MF"), gofreq, org_assembly = c("hg19", "hg38", "mm10", "dre10", "rn6", "dm6", "ce11", "sc3"), type = 'pc_gene') { if (backgroundGene == '') { bckfreq <- as.data.frame(table(all$GOID)) } else{ backgroundGene = as.data.frame(backgroundGene) colnames(backgroundGene) = 'bg' if (type == 'mirna') { a <- as.data.frame(gsub(paste(c("-3p", "-5p"), collapse = "|"), "", backgroundGene$bg)) colnames(a) <- 'gene' geneTargetLoc <- convertGeneID( genetype = "mirna", genelist = a$gene, org_assembly = org_assembly ) backgroundGene <- getUCSC(geneTargetLoc, 10000, 10000, org_assembly) colnames(backgroundGene) = 'bg' } annot <- unique(annGO(backgroundGene$bg, GOtype, org_assembly)) uniqueGO <- annot$GOID[!duplicated(annot$GOID)] bckfreq <- as.data.frame(table(annot$GOID)) } bb <- bckfreq[bckfreq$Var1 %in% gofreq$Var1,] return(bb) }