#' Clonality
#' 
#' Creates a data frame giving the total number of sequences, number of unique 
#' productive sequences, number of genomes, entropy, clonality, Gini 
#' coefficient, and the frequency (\%) of the top productive sequences in a list 
#' of sample data frames.
#' 
#' @param file.list A list of data frames consisting of antigen receptor 
#' sequencing imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", 
#' "frequencyCount", and "estimatedNumberGenomes" are required columns.  Note 
#' that the function is not intended to be run using a productive sequence list 
#' generated by the function productiveSeq.
#' @return Returns a data frame giving the total number of sequences, number of 
#' unique productive sequences, number of genomes, entropy, clonality, 
#' Gini coefficient, and the frequency (\%) of the top productive sequence in each sample.
#' @details Clonality is derived from the Shannon entropy, which is calculated 
#' from the frequencies of all productive sequences divided by the logarithm of 
#' the total number of unique productive sequences.  This normalized entropy 
#' value is then inverted (1 - normalized entropy) to produce the clonality 
#' metric.  
#' 
#' The Gini coefficient is an alternative metric used to calculate repertoire 
#' diversity and is derived from the Lorenz curve.  The Lorenz curve is drawn 
#' such that x-axis represents the cumulative percentage of unique sequences and 
#' the y-axis represents the cumulative percentage of reads.  A line passing 
#' through the origin with a slope of 1 reflects equal frequencies of all clones.  
#' The Gini coefficient is the ratio of the area between the line of equality 
#' and the observed Lorenz curve over the total area under the line of equality.  
#' Both Gini coefficient and clonality are reported on a scale from 0 to 1 where 
#' 0 indicates all sequences have the same frequency and 1 indicates the 
#' repertoire is dominated by a single sequence.
#' @examples
#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
#' 
#' file.list <- readImmunoSeq(path = file.path)
#' 
#' clonality(file.list = file.list)
#' @seealso \code{\link{lorenzCurve}}
#' @export
#' @importFrom ineq Gini
clonality <- function(file.list) {
    table <- data.frame(samples = names(file.list))
    i <- 1
    for (i in 1:length(file.list)) {
        file <- file.list[[i]]
        total.reads <- length(file$nucleotide)
        total.genomes <- sum(file$estimatedNumberGenomes)
        total.count <- sum(file$count)
        productive <- file[!grepl("\\*", file$aminoAcid) & file$aminoAcid != "", ]
        frequency <- productive$count/sum(productive$count)
        entropy <- -sum(frequency * log2(frequency))
        unique.productive <- length(productive$nucleotide)
        clonality <- 1 - (entropy/log2(unique.productive))
        table$totalSequences[i] <- total.reads
        table$uniqueProductiveSequences[i] <- unique.productive
        table$totalGenomes[i] <- total.genomes
        table$totalCount[i] <- total.count
        table$entropy[i] <- entropy
        table$clonality[i] <- clonality
        table$giniCoefficient[i] <- ineq::Gini(frequency)
        table$topProductiveSequence[i] <- max(frequency) * 100
    }
    table$samples <- as.character(table$samples)
    table <- table[order(nchar(table$samples), table$samples, decreasing = FALSE), ]
    rownames(table) <- NULL
    return(table)
}