#' Gene frequencies #' #' Creates a data frame of VDJ gene counts and frequencies. #' #' @param productive.nt A list of one or more data frames of productive sequences #' generated by the LymphoSeq function productiveSeq where the parameter #' aggregate is set to "nucleotide". #' @param locus A character vector indicating which VDJ genes to include in the #' output. Available options include "VDJ", "DJ", "VJ", "DJ", "V", "D", or "J". #' @param family A Boolean value indicating whether or not family names instead #' of gene names are used. If TRUE, then family names are used and if FALSE, #' gene names are used. #' @return Returns a data frame with the sample names, VDJ gene name, count, and #' \% frequency of the V, D, or J genes (each gene frequency should add to #' 100\% for each sample). #' @examples #' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") #' #' file.list <- readImmunoSeq(path = file.path) #' #' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide") #' #' geneFreq(productive.nt, locus = "VDJ", family = FALSE) #' #' # Create a heat map of V gene usage #' vfamilies <- geneFreq(productive.nt, locus = "V", family = TRUE) #' #' require(reshape) #' #' vfamilies <- reshape::cast(vfamilies, familyName ~ samples, value = "frequencyGene", sum) #' #' rownames(vfamilies) <- as.character(vfamilies$familyName) #' #' vfamilies$familyName <- NULL #' #' RedBlue <- grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(11, "RdBu")))(256) #' #' require(pheatmap) #' #' pheatmap::pheatmap(vfamilies, color = RedBlue, scale = "row") #' #' # Create a word cloud of V gene usage #' vgenes <- geneFreq(productive.nt, locus = "V", family = FALSE) #' #' require(wordcloud) #' #' wordcloud::wordcloud(words = vgenes[vgenes$samples == "TCRB_Day83_Unsorted", "geneName"], #' freq = vgenes[vgenes$samples == "TCRB_Day83_Unsorted", "frequencyGene"], #' colors = RedBlue) #' #' # Create a cumulative frequency bar plot of V gene usage #' vgenes <- geneFreq(productive.nt, locus = "V", family = FALSE) #' #' require(ggplot2) #' #' ggplot2::ggplot(vgenes, aes(x = samples, y = frequencyGene, fill = geneName)) + #' geom_bar(stat = "identity") + #' theme_minimal() + #' scale_y_continuous(expand = c(0, 0)) + #' guides(fill = guide_legend(ncol = 3)) + #' labs(y = "Frequency (%)", x = "", fill = "") + #' theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) #' @export #' @importFrom stats aggregate geneFreq <- function(productive.nt, locus = "VDJ", family = FALSE) { gene.names <- data.frame() if (family == FALSE) { i <- 1 for (i in 1:length(productive.nt)) { file <- productive.nt[[i]] file[file == ""] <- "unresolved" name <- names(productive.nt) v <- data.frame() d <- data.frame() j <- data.frame() if (grepl("V|v", locus)) { v <- aggregate(count ~ vGeneName, data = file, FUN = sum) v <- data.frame(samples = name[i], geneName = v$vGeneName, count = v$count, frequencyGene = v$count/sum(v$count) * 100) } if (grepl("D|d", locus)) { d <- aggregate(count ~ dGeneName, data = file, FUN = sum) d <- data.frame(samples = name[i], geneName = d$dGeneName, count = d$count, frequencyGene = d$count/sum(d$count) * 100) } if (grepl("J|j", locus)) { j <- aggregate(count ~ jGeneName, data = file, FUN = sum) j <- data.frame(samples = name[i], geneName = j$jGeneName, count = j$count, frequencyGene = j$count/sum(j$count) * 100) } gene.names <- rbind(v, d, j, gene.names) } } else { i <- 1 for (i in 1:length(productive.nt)) { file <- productive.nt[[i]] file[file == ""] <- "unresolved" name <- names(productive.nt) v <- data.frame() d <- data.frame() j <- data.frame() if (grepl("V|v", locus)) { v <- aggregate(count ~ vFamilyName, data = file, FUN = sum) v <- data.frame(samples = name[i], familyName = v$vFamilyName, count = v$count, frequencyGene = v$count/sum(v$count) * 100) } if (grepl("D|d", locus)) { d <- aggregate(count ~ dFamilyName, data = file, FUN = sum) d <- data.frame(samples = name[i], familyName = d$dFamilyName, count = d$count, frequencyGene = d$count/sum(d$count) * 100) } if (grepl("J|j", locus)) { j <- aggregate(count ~ jFamilyName, data = file, FUN = sum) j <- data.frame(samples = name[i], familyName = j$jFamilyName, count = j$count, frequencyGene = j$count/sum(j$count) * 100) } gene.names <- rbind(v, d, j, gene.names) } } return(gene.names) }