From: davidcoffey <dcoffey@fredhutch.org>
git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/LymphoSeq@128429 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
deleted file mode 100644 |
... | ... |
@@ -1,134 +0,0 @@ |
1 |
-#' Align mutliple sequences |
|
2 |
-#' |
|
3 |
-#' Perform multiple sequence alignment using one of three methods and output results to the |
|
4 |
-<<<<<<< HEAD |
|
5 |
-#' console or as a pdf file. One may perform the alignment of all amino acid or nucleotide |
|
6 |
-#' sequences in a single sample. Alternatively, one may search for a given sequence |
|
7 |
-======= |
|
8 |
-#' consule or as a pdf file. One may perform the alignment of all amino acid or nucleotide |
|
9 |
-#' sequences in a single sample Alternatively, one may search for a given sequence |
|
10 |
->>>>>>> Added ability to print alignment to consule |
|
11 |
-#' within a list of samples using an edit distance threshold. |
|
12 |
-#' |
|
13 |
-#' @param list A list of data frames consisting of antigen receptor sequences |
|
14 |
-#' imported by the LymphoSeq function readImmunoSeq. |
|
15 |
-#' @param sample A character vector indicating the name of the sample in the productive |
|
16 |
-#' sequence list. |
|
17 |
-#' @param sequence A character vector of one ore more amino acid or nucleotide |
|
18 |
-#' CDR3 sequences to search. |
|
19 |
-#' @param editDistance An integer giving the minimum edit distance that the |
|
20 |
-#' sequence must be less than or equal to. See details below. |
|
21 |
-#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
22 |
-#' should be aligned. If "aminoAcid" is specified, then run productiveSeqs first. |
|
23 |
-#' @param method A character vector indicating the multiple sequence alignment method to |
|
24 |
-#' be used. Refer to the Bioconductor msa package for more details. Options incude |
|
25 |
-#' "ClustalW", "ClustalOmega", and "Muscle". |
|
26 |
-#' @param output A character vector indicating where the multiple sequence alignemnt should be |
|
27 |
-<<<<<<< HEAD |
|
28 |
-#' printed. Options include "console" or "pdf". If "pdf" is selected, the file is saved to |
|
29 |
-======= |
|
30 |
-#' printed. Options include "consule" or "pdf". If "pdf" is selected, the file is saved to |
|
31 |
->>>>>>> Added ability to print alignment to consule |
|
32 |
-#' the working directory. For "pdf" to work, Texshade must be installed. Refer to the |
|
33 |
-#' Bioconductor package msa installation instructions for more details. |
|
34 |
-#' @details Edit distance is a way of quantifying how dissimilar two sequences |
|
35 |
-#' are to one another by counting the minimum number of operations required to |
|
36 |
-#' transform one sequence into the other. For example, an edit distance of 0 |
|
37 |
-#' means the sequences are identical and an edit distance of 1 indicates that |
|
38 |
-#' the sequences different by a single amino acid or nucleotide. |
|
39 |
-#' |
|
40 |
-<<<<<<< HEAD |
|
41 |
-#' @return Performs a multiple sequence alignemnt and prints to the console or saves a pdf to |
|
42 |
-======= |
|
43 |
-#' @return Performs a multiple sequence alignemnt and prints to the consule or saves a pdf to |
|
44 |
->>>>>>> Added ability to print alignment to consule |
|
45 |
-#' the working directory. |
|
46 |
-#' @seealso If having trouble saving pdf files, refer to Biconductor package msa for |
|
47 |
-#' installation instructions |
|
48 |
-#' \url{http://bioconductor.org/packages/release/bioc/vignettes/msa/inst/doc/msa.pdf} |
|
49 |
-#' @examples |
|
50 |
-#' file.path <- system.file("extdata", "IGH_sequencing", package = "LymphoSeq") |
|
51 |
-#' |
|
52 |
-#' file.list <- readImmunoSeq(path = file.path) |
|
53 |
-#' |
|
54 |
-#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide") |
|
55 |
-#' |
|
56 |
-#' alignSeq(list = productive.nt, sample = "IGH_MVQ92552A_BL", type = "nucleotide", |
|
57 |
-<<<<<<< HEAD |
|
58 |
-#' method = "ClustalW", output = "console") |
|
59 |
-======= |
|
60 |
-#' method = "ClustalW", output = "consule") |
|
61 |
->>>>>>> Added ability to print alignment to consule |
|
62 |
-#' @export |
|
63 |
-#' @importFrom Biostrings DNAStringSet |
|
64 |
-#' @importFrom Biostrings AAStringSet |
|
65 |
-#' @import msa |
|
66 |
-<<<<<<< HEAD |
|
67 |
-alignSeq = function(list, sample = NULL, sequence = NULL, editDistance = 15, output = "console", type = "nucleotide", method = "ClustalOmega") { |
|
68 |
-======= |
|
69 |
-alignSeq = function(list, sample = NULL, sequence = NULL, editDistance = 15, output = "consule", type = "nucleotide", method = "ClustalOmega") { |
|
70 |
->>>>>>> Added ability to print alignment to consule |
|
71 |
- if(!is.null(sequence) & is.null(sample)){ |
|
72 |
- file <- searchSeq(list = list, sequence = sequence, editDistance = editDistance, type = type, match = "partial") |
|
73 |
- if(is.null(file)){ |
|
74 |
- stop("There are no sequences to be aligned", call. = FALSE) |
|
75 |
- } |
|
76 |
- } |
|
77 |
- if(!is.null(sequence) & !is.null(sample)){ |
|
78 |
- file <- searchSeq(list = list[sample], sequence = sequence, editDistance = editDistance, type = type, match = "partial") |
|
79 |
- if(is.null(file)){ |
|
80 |
- stop("There are no sequences to be aligned", call. = FALSE) |
|
81 |
- } |
|
82 |
- } |
|
83 |
- if(is.null(sequence)){ |
|
84 |
- file <- list[[sample]] |
|
85 |
- } |
|
86 |
- if(nrow(file) > 150){ |
|
87 |
- file <- file[1:150,] |
|
88 |
- message("Only the first 150 sequences will be aligned") |
|
89 |
- } |
|
90 |
- file[file == ""] <- "unresolved" |
|
91 |
- if(type == "nucleotide"){ |
|
92 |
- names(file)[which(names(file) == "foundSequnece")] <- "nucleotide" |
|
93 |
- file <- file[nchar(file$nucleotide) > 3, ] |
|
94 |
- string <- Biostrings::DNAStringSet(file$nucleotide) |
|
95 |
- } |
|
96 |
- if(type == "aminoAcid"){ |
|
97 |
- names(file)[which(names(file) == "foundSequnece")] <- "aminoAcid" |
|
98 |
- file <- file[nchar(file$aminoAcid) > 3, ] |
|
99 |
- string <- Biostrings::AAStringSet(file$aminoAcid) |
|
100 |
- } |
|
101 |
- if(nrow(file) < 3){ |
|
102 |
- stop("There are less than 3 sequences to be aligned", call. = FALSE) |
|
103 |
- } |
|
104 |
- if(!is.null(sequence)){ |
|
105 |
- names(string) <- paste(file$sample) |
|
106 |
- } else { |
|
107 |
- names(string) <- paste(file$vFamilyName, file$dFamilyName, file$jFamilyName, file$count) |
|
108 |
- } |
|
109 |
- names(string) <- gsub(names(string), pattern = "IGH|IGL|IGK|TCRB|TCRA", replacement = "") |
|
110 |
- names(string) <- gsub(names(string), pattern = "unresolved", replacement = "UNR") |
|
111 |
- alignment <- msa::msa(string, method = method) |
|
112 |
-<<<<<<< HEAD |
|
113 |
- if(output == "console"){ |
|
114 |
-======= |
|
115 |
- if(output == "consule"){ |
|
116 |
->>>>>>> Added ability to print alignment to consule |
|
117 |
- print(alignment, show = "complete") |
|
118 |
- } |
|
119 |
- if(output == "pdf"){ |
|
120 |
- if(!is.null(sample)){ |
|
121 |
- file.name <- paste(names(list[sample]), ".pdf", sep = "") |
|
122 |
- } else { |
|
123 |
- file.name <- "Sequence_aligment.pdf" |
|
124 |
- } |
|
125 |
- msa::msaPrettyPrint(alignment, output = "pdf", file = file.name, |
|
126 |
- showNumbering = "right", showNames = "left", showConsensus = "top", |
|
127 |
- shadingMode = "similar", shadingColors = "blues", |
|
128 |
- paperWidth = ncol(alignment)*0.1 + 5, paperHeight = nrow(alignment)*0.1 + 5, |
|
129 |
- showLogo = "bottom", showLogoScale = "right", logoColors = "rasmol", |
|
130 |
- psFonts = TRUE, showLegend = TRUE, askForOverwrite = FALSE, |
|
131 |
- furtherCode=c("\\defconsensus{.}{lower}{upper}","\\showruler{1}{top}")) |
|
132 |
- message(paste(file.name, "saved to", getwd())) |
|
133 |
- } |
|
134 |
-} |
135 | 0 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,74 @@ |
1 |
+#' Clonality |
|
2 |
+#' |
|
3 |
+#' Creates a data frame giving the total number of sequences, number of unique |
|
4 |
+#' productive sequences, number of genomes, entropy, clonality, Gini |
|
5 |
+#' coefficient, and the frequency (\%) of the top productive sequences in a list |
|
6 |
+#' of sample data frames. |
|
7 |
+#' |
|
8 |
+#' @param file.list A list of data frames consisting of antigen receptor |
|
9 |
+#' sequencing imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", |
|
10 |
+#' and "frequencyCount" are required columns. "estimatedNumberGenomes" is optional. |
|
11 |
+#' Note that clonality is usually calculated from productive nucleotide sequences. |
|
12 |
+#' Therefore, it is not recommended to run this function using a productive sequence |
|
13 |
+#' list aggregated by amino acids. |
|
14 |
+#' @return Returns a data frame giving the total number of sequences, number of |
|
15 |
+#' unique productive sequences, number of genomes, clonality, Gini coefficient, |
|
16 |
+#' and the frequency (\%) of the top productive sequence in each sample. |
|
17 |
+#' @details Clonality is derived from the Shannon entropy, which is calculated |
|
18 |
+#' from the frequencies of all productive sequences divided by the logarithm of |
|
19 |
+#' the total number of unique productive sequences. This normalized entropy |
|
20 |
+#' value is then inverted (1 - normalized entropy) to produce the clonality |
|
21 |
+#' metric. |
|
22 |
+#' |
|
23 |
+#' The Gini coefficient is an alternative metric used to calculate repertoire |
|
24 |
+#' diversity and is derived from the Lorenz curve. The Lorenz curve is drawn |
|
25 |
+#' such that x-axis represents the cumulative percentage of unique sequences and |
|
26 |
+#' the y-axis represents the cumulative percentage of reads. A line passing |
|
27 |
+#' through the origin with a slope of 1 reflects equal frequencies of all clones. |
|
28 |
+#' The Gini coefficient is the ratio of the area between the line of equality |
|
29 |
+#' and the observed Lorenz curve over the total area under the line of equality. |
|
30 |
+#' Both Gini coefficient and clonality are reported on a scale from 0 to 1 where |
|
31 |
+#' 0 indicates all sequences have the same frequency and 1 indicates the |
|
32 |
+#' repertoire is dominated by a single sequence. |
|
33 |
+#' @examples |
|
34 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
35 |
+#' |
|
36 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
37 |
+#' |
|
38 |
+#' clonality(file.list = file.list) |
|
39 |
+#' @seealso \code{\link{lorenzCurve}} |
|
40 |
+#' @export |
|
41 |
+#' @importFrom ineq Gini |
|
42 |
+clonality <- function(file.list) { |
|
43 |
+ table <- data.frame(samples = names(file.list)) |
|
44 |
+ i <- 1 |
|
45 |
+ for (i in 1:length(file.list)) { |
|
46 |
+ file <- file.list[[i]] |
|
47 |
+ total.reads <- nrow(file) |
|
48 |
+ file$estimatedNumberGenomes <- suppressWarnings(as.integer(file$estimatedNumberGenomes)) |
|
49 |
+ total.genomes <- sum(file$estimatedNumberGenomes) |
|
50 |
+ total.count <- sum(file$count) |
|
51 |
+ productive <- file[!grepl("\\*", file$aminoAcid) & file$aminoAcid != "", ] |
|
52 |
+ frequency <- productive$count/sum(productive$count) |
|
53 |
+ entropy <- -sum(frequency * log2(frequency), na.rm = TRUE) |
|
54 |
+ unique.productive <- nrow(productive) |
|
55 |
+ clonality <- 1 - round(entropy/log2(unique.productive), digits = 6) |
|
56 |
+ table$totalSequences[i] <- total.reads |
|
57 |
+ table$uniqueProductiveSequences[i] <- unique.productive |
|
58 |
+ table$totalGenomes[i] <- total.genomes |
|
59 |
+ table$totalCount[i] <- total.count |
|
60 |
+ table$clonality[i] <- clonality |
|
61 |
+ table$giniCoefficient[i] <- ineq::Gini(frequency) |
|
62 |
+ table$topProductiveSequence[i] <- max(frequency) * 100 |
|
63 |
+ if (any(grepl("estimatedNumberGenomes", colnames(file)))) { |
|
64 |
+ file$estimatedNumberGenomes <- suppressWarnings(as.integer(file$estimatedNumberGenomes)) |
|
65 |
+ total.genomes <- sum(file$estimatedNumberGenomes) |
|
66 |
+ table$totalGenomes[i] <- ifelse(total.genomes == 0, NA, total.genomes) |
|
67 |
+ } else { |
|
68 |
+ table$totalGenomes[i] <- NA |
|
69 |
+ } |
|
70 |
+ } |
|
71 |
+ table <- table[order(table$topProductiveSequence, decreasing = TRUE), ] |
|
72 |
+ rownames(table) <- NULL |
|
73 |
+ return(table) |
|
74 |
+} |
|
0 | 75 |
\ No newline at end of file |