From: davidcoffey <dcoffey@fredhutch.org>
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/LymphoSeq@128438 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,74 +0,0 @@ |
1 |
-#' Clonality |
|
2 |
-#' |
|
3 |
-#' Creates a data frame giving the total number of sequences, number of unique |
|
4 |
-#' productive sequences, number of genomes, entropy, clonality, Gini |
|
5 |
-#' coefficient, and the frequency (\%) of the top productive sequences in a list |
|
6 |
-#' of sample data frames. |
|
7 |
-#' |
|
8 |
-#' @param file.list A list of data frames consisting of antigen receptor |
|
9 |
-#' sequencing imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", |
|
10 |
-#' and "frequencyCount" are required columns. "estimatedNumberGenomes" is optional. |
|
11 |
-#' Note that clonality is usually calculated from productive nucleotide sequences. |
|
12 |
-#' Therefore, it is not recommended to run this function using a productive sequence |
|
13 |
-#' list aggregated by amino acids. |
|
14 |
-#' @return Returns a data frame giving the total number of sequences, number of |
|
15 |
-#' unique productive sequences, number of genomes, clonality, Gini coefficient, |
|
16 |
-#' and the frequency (\%) of the top productive sequence in each sample. |
|
17 |
-#' @details Clonality is derived from the Shannon entropy, which is calculated |
|
18 |
-#' from the frequencies of all productive sequences divided by the logarithm of |
|
19 |
-#' the total number of unique productive sequences. This normalized entropy |
|
20 |
-#' value is then inverted (1 - normalized entropy) to produce the clonality |
|
21 |
-#' metric. |
|
22 |
-#' |
|
23 |
-#' The Gini coefficient is an alternative metric used to calculate repertoire |
|
24 |
-#' diversity and is derived from the Lorenz curve. The Lorenz curve is drawn |
|
25 |
-#' such that x-axis represents the cumulative percentage of unique sequences and |
|
26 |
-#' the y-axis represents the cumulative percentage of reads. A line passing |
|
27 |
-#' through the origin with a slope of 1 reflects equal frequencies of all clones. |
|
28 |
-#' The Gini coefficient is the ratio of the area between the line of equality |
|
29 |
-#' and the observed Lorenz curve over the total area under the line of equality. |
|
30 |
-#' Both Gini coefficient and clonality are reported on a scale from 0 to 1 where |
|
31 |
-#' 0 indicates all sequences have the same frequency and 1 indicates the |
|
32 |
-#' repertoire is dominated by a single sequence. |
|
33 |
-#' @examples |
|
34 |
-#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
35 |
-#' |
|
36 |
-#' file.list <- readImmunoSeq(path = file.path) |
|
37 |
-#' |
|
38 |
-#' clonality(file.list = file.list) |
|
39 |
-#' @seealso \code{\link{lorenzCurve}} |
|
40 |
-#' @export |
|
41 |
-#' @importFrom ineq Gini |
|
42 |
-clonality <- function(file.list) { |
|
43 |
- table <- data.frame(samples = names(file.list)) |
|
44 |
- i <- 1 |
|
45 |
- for (i in 1:length(file.list)) { |
|
46 |
- file <- file.list[[i]] |
|
47 |
- total.reads <- nrow(file) |
|
48 |
- file$estimatedNumberGenomes <- suppressWarnings(as.integer(file$estimatedNumberGenomes)) |
|
49 |
- total.genomes <- sum(file$estimatedNumberGenomes) |
|
50 |
- total.count <- sum(file$count) |
|
51 |
- productive <- file[!grepl("\\*", file$aminoAcid) & file$aminoAcid != "", ] |
|
52 |
- frequency <- productive$count/sum(productive$count) |
|
53 |
- entropy <- -sum(frequency * log2(frequency), na.rm = TRUE) |
|
54 |
- unique.productive <- nrow(productive) |
|
55 |
- clonality <- 1 - round(entropy/log2(unique.productive), digits = 6) |
|
56 |
- table$totalSequences[i] <- total.reads |
|
57 |
- table$uniqueProductiveSequences[i] <- unique.productive |
|
58 |
- table$totalGenomes[i] <- total.genomes |
|
59 |
- table$totalCount[i] <- total.count |
|
60 |
- table$clonality[i] <- clonality |
|
61 |
- table$giniCoefficient[i] <- ineq::Gini(frequency) |
|
62 |
- table$topProductiveSequence[i] <- max(frequency) * 100 |
|
63 |
- if (any(grepl("estimatedNumberGenomes", colnames(file)))) { |
|
64 |
- file$estimatedNumberGenomes <- suppressWarnings(as.integer(file$estimatedNumberGenomes)) |
|
65 |
- total.genomes <- sum(file$estimatedNumberGenomes) |
|
66 |
- table$totalGenomes[i] <- ifelse(total.genomes == 0, NA, total.genomes) |
|
67 |
- } else { |
|
68 |
- table$totalGenomes[i] <- NA |
|
69 |
- } |
|
70 |
- } |
|
71 |
- table <- table[order(table$topProductiveSequence, decreasing = TRUE), ] |
|
72 |
- rownames(table) <- NULL |
|
73 |
- return(table) |
|
74 |
-} |
|
75 | 0 |
\ No newline at end of file |