Browse code

Fix merge conflicts

From: davidcoffey <dcoffey@fredhutch.org>

git-svn-id: https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/LymphoSeq@128429 bc3139a8-67e5-0310-9ffc-ced21a209358

d.coffey authored on 10/04/2017 01:09:47
Showing 2 changed files

1 1
deleted file mode 100644
... ...
@@ -1,134 +0,0 @@
1
-#' Align mutliple sequences
2
-#' 
3
-#' Perform multiple sequence alignment using one of three methods and output results to the 
4
-<<<<<<< HEAD
5
-#' console or as a pdf file.  One may perform the alignment of all amino acid or nucleotide
6
-#' sequences in a single sample.  Alternatively, one may search for a given sequence 
7
-=======
8
-#' consule or as a pdf file.  One may perform the alignment of all amino acid or nucleotide
9
-#' sequences in a single sample  Alternatively, one may search for a given sequence 
10
->>>>>>> Added ability to print alignment to consule
11
-#' within a list of samples using an edit distance threshold.
12
-#' 
13
-#' @param list A list of data frames consisting of antigen receptor sequences 
14
-#' imported by the LymphoSeq function readImmunoSeq.
15
-#' @param sample A character vector indicating the name of the sample in the productive
16
-#' sequence list. 
17
-#' @param sequence A character vector of one ore more amino acid or nucleotide 
18
-#' CDR3 sequences to search.
19
-#' @param editDistance An integer giving the minimum edit distance that the 
20
-#' sequence must be less than or equal to.  See details below.
21
-#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences
22
-#' should be aligned.  If "aminoAcid" is specified, then run productiveSeqs first.
23
-#' @param method A character vector indicating the multiple sequence alignment method to 
24
-#' be used.  Refer to the Bioconductor msa package for more details.  Options incude 
25
-#' "ClustalW", "ClustalOmega", and "Muscle".
26
-#' @param output A character vector indicating where the multiple sequence alignemnt should be
27
-<<<<<<< HEAD
28
-#' printed.  Options include "console" or "pdf".  If "pdf" is selected, the file is saved to
29
-=======
30
-#' printed.  Options include "consule" or "pdf".  If "pdf" is selected, the file is saved to
31
->>>>>>> Added ability to print alignment to consule
32
-#' the working directory.  For "pdf" to work, Texshade must be installed.  Refer to the 
33
-#' Bioconductor package msa installation instructions for more details.
34
-#' @details Edit distance is a way of quantifying how dissimilar two sequences 
35
-#' are to one another by counting the minimum number of operations required to 
36
-#' transform one sequence into the other.  For example, an edit distance of 0 
37
-#' means the sequences are identical and an edit distance of 1 indicates that 
38
-#' the sequences different by a single amino acid or nucleotide.
39
-#' 
40
-<<<<<<< HEAD
41
-#' @return Performs a multiple sequence alignemnt and prints to the console or saves a pdf to 
42
-=======
43
-#' @return Performs a multiple sequence alignemnt and prints to the consule or saves a pdf to 
44
->>>>>>> Added ability to print alignment to consule
45
-#' the working directory.
46
-#' @seealso If having trouble saving pdf files, refer to Biconductor package msa for
47
-#' installation instructions
48
-#' \url{http://bioconductor.org/packages/release/bioc/vignettes/msa/inst/doc/msa.pdf}
49
-#' @examples
50
-#' file.path <- system.file("extdata", "IGH_sequencing", package = "LymphoSeq")
51
-#' 
52
-#' file.list <- readImmunoSeq(path = file.path)
53
-#' 
54
-#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
55
-#' 
56
-#' alignSeq(list = productive.nt, sample = "IGH_MVQ92552A_BL", type = "nucleotide", 
57
-<<<<<<< HEAD
58
-#'          method = "ClustalW", output = "console")
59
-=======
60
-#'          method = "ClustalW", output = "consule")
61
->>>>>>> Added ability to print alignment to consule
62
-#' @export
63
-#' @importFrom Biostrings DNAStringSet
64
-#' @importFrom Biostrings AAStringSet
65
-#' @import msa
66
-<<<<<<< HEAD
67
-alignSeq = function(list, sample = NULL, sequence = NULL, editDistance = 15, output = "console", type = "nucleotide", method = "ClustalOmega") {
68
-=======
69
-alignSeq = function(list, sample = NULL, sequence = NULL, editDistance = 15, output = "consule", type = "nucleotide", method = "ClustalOmega") {
70
->>>>>>> Added ability to print alignment to consule
71
-    if(!is.null(sequence) & is.null(sample)){
72
-        file <- searchSeq(list = list, sequence = sequence, editDistance = editDistance, type = type, match = "partial")
73
-        if(is.null(file)){
74
-            stop("There are no sequences to be aligned", call. = FALSE)
75
-        }
76
-    }
77
-    if(!is.null(sequence) & !is.null(sample)){
78
-        file <- searchSeq(list = list[sample], sequence = sequence, editDistance = editDistance, type = type, match = "partial")
79
-        if(is.null(file)){
80
-            stop("There are no sequences to be aligned", call. = FALSE)
81
-        }
82
-    }
83
-    if(is.null(sequence)){
84
-        file <- list[[sample]] 
85
-    }
86
-    if(nrow(file) > 150){
87
-        file <- file[1:150,]
88
-        message("Only the first 150 sequences will be aligned")
89
-    }
90
-    file[file == ""] <- "unresolved"
91
-    if(type == "nucleotide"){
92
-        names(file)[which(names(file) == "foundSequnece")] <- "nucleotide"
93
-        file <- file[nchar(file$nucleotide) > 3, ]
94
-        string <- Biostrings::DNAStringSet(file$nucleotide)
95
-    }
96
-    if(type == "aminoAcid"){
97
-        names(file)[which(names(file) == "foundSequnece")] <- "aminoAcid"
98
-        file <- file[nchar(file$aminoAcid) > 3, ]
99
-        string <- Biostrings::AAStringSet(file$aminoAcid)
100
-    }
101
-    if(nrow(file) < 3){
102
-        stop("There are less than 3 sequences to be aligned", call. = FALSE)
103
-    }
104
-    if(!is.null(sequence)){
105
-        names(string) <- paste(file$sample)
106
-    } else {
107
-        names(string) <- paste(file$vFamilyName, file$dFamilyName, file$jFamilyName, file$count)
108
-    }
109
-        names(string) <- gsub(names(string), pattern = "IGH|IGL|IGK|TCRB|TCRA", replacement = "")
110
-        names(string) <- gsub(names(string), pattern = "unresolved", replacement = "UNR")
111
-        alignment <- msa::msa(string, method = method)
112
-<<<<<<< HEAD
113
-        if(output == "console"){
114
-=======
115
-        if(output == "consule"){
116
->>>>>>> Added ability to print alignment to consule
117
-            print(alignment, show = "complete")
118
-        }
119
-        if(output == "pdf"){
120
-            if(!is.null(sample)){
121
-                file.name <- paste(names(list[sample]), ".pdf", sep = "")
122
-            } else {
123
-                file.name <- "Sequence_aligment.pdf"
124
-            }
125
-            msa::msaPrettyPrint(alignment, output = "pdf", file = file.name, 
126
-                                showNumbering = "right", showNames = "left", showConsensus = "top",  
127
-                                shadingMode = "similar", shadingColors = "blues",
128
-                                paperWidth = ncol(alignment)*0.1 + 5, paperHeight = nrow(alignment)*0.1 + 5,
129
-                                showLogo = "bottom", showLogoScale = "right", logoColors = "rasmol",
130
-                                psFonts = TRUE, showLegend = TRUE, askForOverwrite = FALSE,
131
-                                furtherCode=c("\\defconsensus{.}{lower}{upper}","\\showruler{1}{top}"))
132
-            message(paste(file.name, "saved to", getwd()))
133
-        }
134
-}
135 0
new file mode 100644
... ...
@@ -0,0 +1,74 @@
1
+#' Clonality
2
+#' 
3
+#' Creates a data frame giving the total number of sequences, number of unique 
4
+#' productive sequences, number of genomes, entropy, clonality, Gini 
5
+#' coefficient, and the frequency (\%) of the top productive sequences in a list 
6
+#' of sample data frames.
7
+#' 
8
+#' @param file.list A list of data frames consisting of antigen receptor 
9
+#' sequencing imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", 
10
+#' and "frequencyCount" are required columns.  "estimatedNumberGenomes" is optional.  
11
+#' Note that clonality is usually calculated from productive nucleotide sequences.  
12
+#' Therefore, it is not recommended to run this function using a productive sequence
13
+#' list aggregated by amino acids.
14
+#' @return Returns a data frame giving the total number of sequences, number of 
15
+#' unique productive sequences, number of genomes, clonality, Gini coefficient, 
16
+#' and the frequency (\%) of the top productive sequence in each sample.
17
+#' @details Clonality is derived from the Shannon entropy, which is calculated 
18
+#' from the frequencies of all productive sequences divided by the logarithm of 
19
+#' the total number of unique productive sequences.  This normalized entropy 
20
+#' value is then inverted (1 - normalized entropy) to produce the clonality 
21
+#' metric.  
22
+#' 
23
+#' The Gini coefficient is an alternative metric used to calculate repertoire 
24
+#' diversity and is derived from the Lorenz curve.  The Lorenz curve is drawn 
25
+#' such that x-axis represents the cumulative percentage of unique sequences and 
26
+#' the y-axis represents the cumulative percentage of reads.  A line passing 
27
+#' through the origin with a slope of 1 reflects equal frequencies of all clones.  
28
+#' The Gini coefficient is the ratio of the area between the line of equality 
29
+#' and the observed Lorenz curve over the total area under the line of equality.  
30
+#' Both Gini coefficient and clonality are reported on a scale from 0 to 1 where 
31
+#' 0 indicates all sequences have the same frequency and 1 indicates the 
32
+#' repertoire is dominated by a single sequence.
33
+#' @examples
34
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
35
+#' 
36
+#' file.list <- readImmunoSeq(path = file.path)
37
+#' 
38
+#' clonality(file.list = file.list)
39
+#' @seealso \code{\link{lorenzCurve}}
40
+#' @export
41
+#' @importFrom ineq Gini
42
+clonality <- function(file.list) {
43
+    table <- data.frame(samples = names(file.list))
44
+    i <- 1
45
+    for (i in 1:length(file.list)) {
46
+        file <- file.list[[i]]
47
+        total.reads <- nrow(file)
48
+        file$estimatedNumberGenomes <- suppressWarnings(as.integer(file$estimatedNumberGenomes))
49
+        total.genomes <- sum(file$estimatedNumberGenomes)
50
+        total.count <- sum(file$count)
51
+        productive <- file[!grepl("\\*", file$aminoAcid) & file$aminoAcid != "", ]
52
+        frequency <- productive$count/sum(productive$count)
53
+        entropy <- -sum(frequency * log2(frequency), na.rm = TRUE)
54
+        unique.productive <- nrow(productive)
55
+        clonality <- 1 - round(entropy/log2(unique.productive), digits = 6)
56
+        table$totalSequences[i] <- total.reads
57
+        table$uniqueProductiveSequences[i] <- unique.productive
58
+        table$totalGenomes[i] <- total.genomes
59
+        table$totalCount[i] <- total.count
60
+        table$clonality[i] <- clonality
61
+        table$giniCoefficient[i] <- ineq::Gini(frequency)
62
+        table$topProductiveSequence[i] <- max(frequency) * 100
63
+        if (any(grepl("estimatedNumberGenomes", colnames(file)))) {
64
+            file$estimatedNumberGenomes <- suppressWarnings(as.integer(file$estimatedNumberGenomes))
65
+            total.genomes <- sum(file$estimatedNumberGenomes)
66
+            table$totalGenomes[i] <- ifelse(total.genomes == 0, NA, total.genomes)
67
+        } else {
68
+            table$totalGenomes[i] <- NA
69
+        }
70
+    }
71
+    table <- table[order(table$topProductiveSequence, decreasing = TRUE), ]
72
+    rownames(table) <- NULL
73
+    return(table)
74
+} 
0 75
\ No newline at end of file