From: davidcoffey <dcoffey@fredhutch.org>
gitsvnid: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/LymphoSeq@128435 bc3139a867e503109ffcced21a209358
1  1 
deleted file mode 100644 
...  ... 
@@ 1,74 +0,0 @@ 
1 
#' Clonality 

2 
#' 

3 
#' Creates a data frame giving the total number of sequences, number of unique 

4 
#' productive sequences, number of genomes, entropy, clonality, Gini 

5 
#' coefficient, and the frequency (\%) of the top productive sequences in a list 

6 
#' of sample data frames. 

7 
#' 

8 
#' @param file.list A list of data frames consisting of antigen receptor 

9 
#' sequencing imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", 

10 
#' and "frequencyCount" are required columns. "estimatedNumberGenomes" is optional. 

11 
#' Note that clonality is usually calculated from productive nucleotide sequences. 

12 
#' Therefore, it is not recommended to run this function using a productive sequence 

13 
#' list aggregated by amino acids. 

14 
#' @return Returns a data frame giving the total number of sequences, number of 

15 
#' unique productive sequences, number of genomes, clonality, Gini coefficient, 

16 
#' and the frequency (\%) of the top productive sequence in each sample. 

17 
#' @details Clonality is derived from the Shannon entropy, which is calculated 

18 
#' from the frequencies of all productive sequences divided by the logarithm of 

19 
#' the total number of unique productive sequences. This normalized entropy 

20 
#' value is then inverted (1  normalized entropy) to produce the clonality 

21 
#' metric. 

22 
#' 

23 
#' The Gini coefficient is an alternative metric used to calculate repertoire 

24 
#' diversity and is derived from the Lorenz curve. The Lorenz curve is drawn 

25 
#' such that xaxis represents the cumulative percentage of unique sequences and 

26 
#' the yaxis represents the cumulative percentage of reads. A line passing 

27 
#' through the origin with a slope of 1 reflects equal frequencies of all clones. 

28 
#' The Gini coefficient is the ratio of the area between the line of equality 

29 
#' and the observed Lorenz curve over the total area under the line of equality. 

30 
#' Both Gini coefficient and clonality are reported on a scale from 0 to 1 where 

31 
#' 0 indicates all sequences have the same frequency and 1 indicates the 

32 
#' repertoire is dominated by a single sequence. 

33 
#' @examples 

34 
#' file.path < system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") 

35 
#' 

36 
#' file.list < readImmunoSeq(path = file.path) 

37 
#' 

38 
#' clonality(file.list = file.list) 

39 
#' @seealso \code{\link{lorenzCurve}} 

40 
#' @export 

41 
#' @importFrom ineq Gini 

42 
clonality < function(file.list) { 

43 
 table < data.frame(samples = names(file.list)) 

44 
 i < 1 

45 
 for (i in 1:length(file.list)) { 

46 
 file < file.list[[i]] 

47 
 total.reads < nrow(file) 

48 
 file$estimatedNumberGenomes < suppressWarnings(as.integer(file$estimatedNumberGenomes)) 

49 
 total.genomes < sum(file$estimatedNumberGenomes) 

50 
 total.count < sum(file$count) 

51 
 productive < file[!grepl("\\*", file$aminoAcid) & file$aminoAcid != "", ] 

52 
 frequency < productive$count/sum(productive$count) 

53 
 entropy < sum(frequency * log2(frequency), na.rm = TRUE) 

54 
 unique.productive < nrow(productive) 

55 
 clonality < 1  round(entropy/log2(unique.productive), digits = 6) 

56 
 table$totalSequences[i] < total.reads 

57 
 table$uniqueProductiveSequences[i] < unique.productive 

58 
 table$totalGenomes[i] < total.genomes 

59 
 table$totalCount[i] < total.count 

60 
 table$clonality[i] < clonality 

61 
 table$giniCoefficient[i] < ineq::Gini(frequency) 

62 
 table$topProductiveSequence[i] < max(frequency) * 100 

63 
 if (any(grepl("estimatedNumberGenomes", colnames(file)))) { 

64 
 file$estimatedNumberGenomes < suppressWarnings(as.integer(file$estimatedNumberGenomes)) 

65 
 total.genomes < sum(file$estimatedNumberGenomes) 

66 
 table$totalGenomes[i] < ifelse(total.genomes == 0, NA, total.genomes) 

67 
 } else { 

68 
 table$totalGenomes[i] < NA 

69 
 } 

70 
 } 

71 
 table < table[order(table$topProductiveSequence, decreasing = TRUE), ] 

72 
 rownames(table) < NULL 

73 
 return(table) 

74 
} 

75  0 
\ No newline at end of file 