git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/LymphoSeq@115899 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,20 @@ |
1 |
+Package: LymphoSeq |
|
2 |
+Title: Analyze high-throughput sequencing of T and B cell receptors |
|
3 |
+Version: 0.99.3 |
|
4 |
+Author: David Coffey <dcoffey@fredhutch.org> |
|
5 |
+Maintainer: David Coffey <dcoffey@fredhutch.org> |
|
6 |
+Description: This R package analyzes high-throughput sequencing of T |
|
7 |
+ and B cell receptor complementarity determining region 3 (CDR3) |
|
8 |
+ sequences generated by Adaptive Biotechnologies' ImmunoSEQ |
|
9 |
+ assay. Its input comes from tab-separated value (.tsv) files |
|
10 |
+ exported from the ImmunoSEQ analyzer. |
|
11 |
+Depends: R (>= 3.2), LymphoSeqDB |
|
12 |
+Imports: data.table, plyr, dplyr, reshape, VennDiagram, ggplot2, ineq, |
|
13 |
+ RColorBrewer, circlize, grid, utils, stats |
|
14 |
+License: Artistic-2.0 |
|
15 |
+LazyData: true |
|
16 |
+RoxygenNote: 5.0.1 |
|
17 |
+Suggests: knitr, pheatmap, wordcloud |
|
18 |
+VignetteBuilder: knitr |
|
19 |
+biocViews: Software, Technology, Sequencing, TargetedResequencing |
|
20 |
+NeedsCompilation: no |
0 | 21 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,47 @@ |
1 |
+# Generated by roxygen2: do not edit by hand |
|
2 |
+ |
|
3 |
+export(bhattacharyyaCoefficient) |
|
4 |
+export(bhattacharyyaMatrix) |
|
5 |
+export(chordDiagramVDJ) |
|
6 |
+export(clonality) |
|
7 |
+export(cloneTrack) |
|
8 |
+export(commonSeqs) |
|
9 |
+export(commonSeqsPlot) |
|
10 |
+export(commonSeqsVenn) |
|
11 |
+export(geneFreq) |
|
12 |
+export(lorenzCurve) |
|
13 |
+export(mergeFiles) |
|
14 |
+export(pairwisePlot) |
|
15 |
+export(productive) |
|
16 |
+export(productiveSeq) |
|
17 |
+export(readImmunoSeq) |
|
18 |
+export(removeSeq) |
|
19 |
+export(searchPublished) |
|
20 |
+export(searchSeq) |
|
21 |
+export(seqMatrix) |
|
22 |
+export(similarityMatrix) |
|
23 |
+export(similarityScore) |
|
24 |
+export(topFreq) |
|
25 |
+export(topSeqs) |
|
26 |
+export(topSeqsPlot) |
|
27 |
+export(uniqueSeqs) |
|
28 |
+import(LymphoSeqDB) |
|
29 |
+import(ggplot2) |
|
30 |
+importFrom(RColorBrewer,brewer.pal) |
|
31 |
+importFrom(VennDiagram,draw.pairwise.venn) |
|
32 |
+importFrom(VennDiagram,draw.triple.venn) |
|
33 |
+importFrom(circlize,chordDiagram) |
|
34 |
+importFrom(circlize,colorRamp2) |
|
35 |
+importFrom(data.table,fread) |
|
36 |
+importFrom(dplyr,group_by) |
|
37 |
+importFrom(dplyr,summarise) |
|
38 |
+importFrom(grid,grid.draw) |
|
39 |
+importFrom(grid,grid.newpage) |
|
40 |
+importFrom(ineq,Gini) |
|
41 |
+importFrom(ineq,Lc) |
|
42 |
+importFrom(plyr,ldply) |
|
43 |
+importFrom(plyr,llply) |
|
44 |
+importFrom(reshape,melt.data.frame) |
|
45 |
+importFrom(stats,aggregate) |
|
46 |
+importFrom(stats,na.omit) |
|
47 |
+importFrom(utils,adist) |
0 | 48 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,32 @@ |
1 |
+#' Bhattacharyya coefficient |
|
2 |
+#' |
|
3 |
+#' Calculates the Bhattacharyya coefficient of two samples. |
|
4 |
+#' |
|
5 |
+#' @param sample1 A data frame consisting of frequencies of antigen receptor |
|
6 |
+#' sequences. "frequencyCount" is a required column. |
|
7 |
+#' @param sample2 A data frame consisting of frequencies of antigen receptor |
|
8 |
+#' sequences. "frequencyCount" is a required column. |
|
9 |
+#' @return Returns the Bhattacharyya coefficient, a measure of the amount of |
|
10 |
+#' overlap between two samples. The value ranges from 0 to 1 where 1 indicates |
|
11 |
+#' the sequence frequencies are identical in the two samples and 0 indicates no |
|
12 |
+#' shared frequencies. |
|
13 |
+#' @examples |
|
14 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
15 |
+#' |
|
16 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
17 |
+#' |
|
18 |
+#' productive.aa <- productiveSeq(file.list, aggregate = "aminoAcid") |
|
19 |
+#' |
|
20 |
+#' bhattacharyyaCoefficient(productive.aa[["TCRB_Day32_Unsorted"]], |
|
21 |
+#' productive.aa[["TCRB_Day83_Unsorted"]]) |
|
22 |
+#' @seealso \code{\link{bhattacharyyaMatrix}} |
|
23 |
+#' @export |
|
24 |
+bhattacharyyaCoefficient <- function(sample1, sample2) { |
|
25 |
+ p <- sample1[which(sample1[, "aminoAcid"] %in% sample2[, "aminoAcid"]), ] |
|
26 |
+ p$frequencyCount <- p$frequencyCount/100 |
|
27 |
+ q <- sample2[which(sample2[, "aminoAcid"] %in% sample1[, "aminoAcid"]), ] |
|
28 |
+ q$frequencyCount <- q$frequencyCount/100 |
|
29 |
+ s <- p[, "frequencyCount"] * q[, "frequencyCount"] |
|
30 |
+ bc <- sum(sqrt(s)) |
|
31 |
+ return(bc) |
|
32 |
+} |
|
0 | 33 |
\ No newline at end of file |
1 | 34 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,41 @@ |
1 |
+#' Bhattacharyya matrix |
|
2 |
+#' |
|
3 |
+#' Calculates the Bhattacharyya coefficient of all pairwise comparison from a |
|
4 |
+#' list of data frames. |
|
5 |
+#' |
|
6 |
+#' @param productive.seqs A list data frames of productive sequences generated |
|
7 |
+#' by the LymphoSeq function productiveSeq. "frequencyCount" and "aminoAcid" |
|
8 |
+#' are a required columns. |
|
9 |
+#' @return A data frame of Bhattacharyya coefficients calculated from all |
|
10 |
+#' pairwise comparisons from a list of sample data frames. The Bhattacharyya |
|
11 |
+#' coefficient is a measure of the amount of overlap between two samples. The |
|
12 |
+#' value ranges from 0 to 1 where 1 indicates the sequence frequencies are |
|
13 |
+#' identical in the two samples and 0 indicates no shared frequencies. |
|
14 |
+#' @examples |
|
15 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
16 |
+#' |
|
17 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
18 |
+#' |
|
19 |
+#' productive.aa <- productiveSeq(file.list, aggregate = "aminoAcid") |
|
20 |
+#' |
|
21 |
+#' bhattacharyyaMatrix(productive.seqs = productive.aa) |
|
22 |
+#' @seealso \code{\link{pairwisePlot}} for plotting results as a heat map. |
|
23 |
+#' @export |
|
24 |
+bhattacharyyaMatrix <- function(productive.seqs) { |
|
25 |
+ l <- length(productive.seqs) |
|
26 |
+ m <- matrix(nrow = l, ncol = l) |
|
27 |
+ rownames(m) <- names(productive.seqs) |
|
28 |
+ colnames(m) <- names(productive.seqs) |
|
29 |
+ for (i in 1:l) { |
|
30 |
+ for (j in 1:i) { |
|
31 |
+ m[i, j] <- bhattacharyyaCoefficient(productive.seqs[[i]], productive.seqs[[j]]) |
|
32 |
+ m[j, i] <- m[i, j] |
|
33 |
+ } |
|
34 |
+ } |
|
35 |
+ m <- m[, colnames(m)[order(nchar(colnames(m)), colnames(m), |
|
36 |
+ decreasing = TRUE)]] |
|
37 |
+ m <- m[colnames(m)[order(nchar(colnames(m)), colnames(m), |
|
38 |
+ decreasing = TRUE)], ] |
|
39 |
+ d <- as.data.frame(m) |
|
40 |
+ return(d) |
|
41 |
+} |
|
0 | 42 |
\ No newline at end of file |
1 | 43 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,70 @@ |
1 |
+#' Chord diagram of VJ or DJ gene associations |
|
2 |
+#' |
|
3 |
+#' Creates a chord diagram showing VJ or DJ gene associations from one or more |
|
4 |
+#' samples. |
|
5 |
+#' |
|
6 |
+#' @param sample A data frame consisting of frequencies of antigen receptor |
|
7 |
+#' sequences. "vFamilyName", "jFamilyName", and if applicable, "dFamilyName" |
|
8 |
+#' are a required columns. Using output from the LymphoSeq function topSeqs is |
|
9 |
+#' recommended. |
|
10 |
+#' @param association A character vector of gene familes to associate. Options |
|
11 |
+#' include "VJ" or "DJ". |
|
12 |
+#' @param colors A character vector of 2 colors corresponding to the V/D and J |
|
13 |
+#' gene colors respectively. |
|
14 |
+#' @details The size of the ribbons connecting VJ or DJ genes correspond to the |
|
15 |
+#' number of samples or number of sequences that make up that recombination |
|
16 |
+#' event. The thicker the ribbon, the higher the frequency of the recombination. |
|
17 |
+#' @return Returns a chord diagram showing VJ or DJ gene associations from one or |
|
18 |
+#' more samples. |
|
19 |
+#' @seealso \code{\link{topSeqs}} |
|
20 |
+#' @examples |
|
21 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
22 |
+#' |
|
23 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
24 |
+#' |
|
25 |
+#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide") |
|
26 |
+#' |
|
27 |
+#' top.seqs <- topSeqs(productive.seqs = productive.nt, top = 1) |
|
28 |
+#' |
|
29 |
+#' chordDiagramVDJ(sample = top.seqs, association = "VJ", colors = c("red", "blue")) |
|
30 |
+#' |
|
31 |
+#' # Remove "TCRB" from gene family name |
|
32 |
+#' top.seqs <- as.data.frame(apply(top.seqs, 2, function(x) gsub("TCRB", "", x))) |
|
33 |
+#' |
|
34 |
+#' chordDiagramVDJ(sample = top.seqs, association = "VJ", colors = c("red", "blue")) |
|
35 |
+#' @export |
|
36 |
+#' @importFrom circlize colorRamp2 chordDiagram |
|
37 |
+chordDiagramVDJ <- function(sample, association = "VJ", colors = c("red", "blue")) { |
|
38 |
+ if (association == "VJ") { |
|
39 |
+ if (!all(c("vFamilyName", "jFamilyName") %in% colnames(sample))) { |
|
40 |
+ stop("The source data frame does not contain the required columns 'vFamilyName' and 'jFamilyName'.") |
|
41 |
+ } |
|
42 |
+ vj <- sample[, c("vFamilyName", "jFamilyName")] |
|
43 |
+ vj[is.na(vj) | vj == ""] <- "unresolved" |
|
44 |
+ vj$vFamilyName <- as.character(vj$vFamilyName) |
|
45 |
+ vj$jFamilyName <- as.character(vj$jFamilyName) |
|
46 |
+ table <- table(vj$vFamilyName, vj$jFamilyName) |
|
47 |
+ matrix <- as.matrix(as.data.frame.matrix(table)) |
|
48 |
+ ribbon.color <- circlize::colorRamp2(range(matrix), c("grey", "black")) |
|
49 |
+ circlize::chordDiagram(matrix, |
|
50 |
+ annotationTrack = c("grid", "name"), |
|
51 |
+ grid.col = c(rep(colors[1], dim(matrix)[1]), rep(colors[2], dim(matrix)[2])), |
|
52 |
+ col = ribbon.color) |
|
53 |
+ } |
|
54 |
+ if (association == "DJ") { |
|
55 |
+ if (!all(c("dFamilyName", "jFamilyName") %in% colnames(sample))) { |
|
56 |
+ stop("The source data frame does not contain the required columns 'dFamilyName' and 'jFamilyName'.") |
|
57 |
+ } |
|
58 |
+ dj <- sample[, c("vFamilyName", "jFamilyName")] |
|
59 |
+ dj[is.na(dj) | dj == ""] <- "unresolved" |
|
60 |
+ dj$dFamilyName <- as.character(dj$dFamilyName) |
|
61 |
+ dj$jFamilyName <- as.character(dj$jFamilyName) |
|
62 |
+ table <- table(dj$dFamilyName, dj$jFamilyName) |
|
63 |
+ matrix <- as.matrix(as.data.frame.matrix(table)) |
|
64 |
+ ribbon.color <- circlize::colorRamp2(range(matrix), c("grey", "black")) |
|
65 |
+ circlize::chordDiagram(matrix, |
|
66 |
+ annotationTrack = c("grid", "name"), |
|
67 |
+ grid.col = c(rep(colors[1], dim(matrix)[1]), rep(colors[2], dim(matrix)[2])), |
|
68 |
+ col = ribbon.color) |
|
69 |
+ } |
|
70 |
+} |
|
0 | 71 |
\ No newline at end of file |
1 | 72 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,67 @@ |
1 |
+#' Clonality |
|
2 |
+#' |
|
3 |
+#' Creates a data frame giving the total number of sequences, number of unique |
|
4 |
+#' productive sequences, number of genomes, entropy, clonality, Gini |
|
5 |
+#' coefficient, and the frequency (\%) of the top productive sequences in a list |
|
6 |
+#' of sample data frames. |
|
7 |
+#' |
|
8 |
+#' @param file.list A list of data frames consisting of antigen receptor |
|
9 |
+#' sequencing imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", |
|
10 |
+#' "frequencyCount", and "estimatedNumberGenomes" are required columns. Note |
|
11 |
+#' that the function is not intended to be run using a productive sequence list |
|
12 |
+#' generated by the function productiveSeq. |
|
13 |
+#' @return Returns a data frame giving the total number of sequences, number of |
|
14 |
+#' unique productive sequences, number of genomes, entropy, clonality, |
|
15 |
+#' Gini coefficient, and the frequency (\%) of the top productive sequence in each sample. |
|
16 |
+#' @details Clonality is derived from the Shannon entropy, which is calculated |
|
17 |
+#' from the frequencies of all productive sequences divided by the logarithm of |
|
18 |
+#' the total number of unique productive sequences. This normalized entropy |
|
19 |
+#' value is then inverted (1 - normalized entropy) to produce the clonality |
|
20 |
+#' metric. |
|
21 |
+#' |
|
22 |
+#' The Gini coefficient is an alternative metric used to calculate repertoire |
|
23 |
+#' diversity and is derived from the Lorenz curve. The Lorenz curve is drawn |
|
24 |
+#' such that x-axis represents the cumulative percentage of unique sequences and |
|
25 |
+#' the y-axis represents the cumulative percentage of reads. A line passing |
|
26 |
+#' through the origin with a slope of 1 reflects equal frequencies of all clones. |
|
27 |
+#' The Gini coefficient is the ratio of the area between the line of equality |
|
28 |
+#' and the observed Lorenz curve over the total area under the line of equality. |
|
29 |
+#' Both Gini coefficient and clonality are reported on a scale from 0 to 1 where |
|
30 |
+#' 0 indicates all sequences have the same frequency and 1 indicates the |
|
31 |
+#' repertoire is dominated by a single sequence. |
|
32 |
+#' @examples |
|
33 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
34 |
+#' |
|
35 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
36 |
+#' |
|
37 |
+#' clonality(file.list = file.list) |
|
38 |
+#' @seealso \code{\link{lorenzCurve}} |
|
39 |
+#' @export |
|
40 |
+#' @importFrom ineq Gini |
|
41 |
+clonality <- function(file.list) { |
|
42 |
+ table <- data.frame(samples = names(file.list)) |
|
43 |
+ i <- 1 |
|
44 |
+ for (i in 1:length(file.list)) { |
|
45 |
+ file <- file.list[[i]] |
|
46 |
+ total.reads <- length(file$nucleotide) |
|
47 |
+ total.genomes <- sum(file$estimatedNumberGenomes) |
|
48 |
+ total.count <- sum(file$count) |
|
49 |
+ productive <- file[!grepl("\\*", file$aminoAcid) & file$aminoAcid != "", ] |
|
50 |
+ frequency <- productive$count/sum(productive$count) |
|
51 |
+ entropy <- -sum(frequency * log2(frequency)) |
|
52 |
+ unique.productive <- length(productive$nucleotide) |
|
53 |
+ clonality <- 1 - (entropy/log2(unique.productive)) |
|
54 |
+ table$totalSequences[i] <- total.reads |
|
55 |
+ table$uniqueProductiveSequences[i] <- unique.productive |
|
56 |
+ table$totalGenomes[i] <- total.genomes |
|
57 |
+ table$totalCount[i] <- total.count |
|
58 |
+ table$entropy[i] <- entropy |
|
59 |
+ table$clonality[i] <- clonality |
|
60 |
+ table$giniCoefficient[i] <- ineq::Gini(frequency) |
|
61 |
+ table$topProductiveSequence[i] <- max(frequency) * 100 |
|
62 |
+ } |
|
63 |
+ table$samples <- as.character(table$samples) |
|
64 |
+ table <- table[order(nchar(table$samples), table$samples, decreasing = FALSE), ] |
|
65 |
+ rownames(table) <- NULL |
|
66 |
+ return(table) |
|
67 |
+} |
|
0 | 68 |
\ No newline at end of file |
1 | 69 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,158 @@ |
1 |
+#' Clone tracking plot |
|
2 |
+#' |
|
3 |
+#' Creates line plot tracking amino acid frequencies across multiple samples |
|
4 |
+#' |
|
5 |
+#' @param sequence.matrix A sequence matrix generated from the LymphoSeq |
|
6 |
+#' function seqMatrix. |
|
7 |
+#' @param map An optional character vector of one or more sample names contained |
|
8 |
+#' in the productive.aa list. If the same sequence appears in multiple mapped |
|
9 |
+#' samples, then it will be assigned the label of the first listed sample only. |
|
10 |
+#' @param productive.aa A list of data frames of productive amino acid sequences |
|
11 |
+#' containing the samples to be mapped. This parameter is only required if |
|
12 |
+#' sequence mapping is performed. |
|
13 |
+#' @param label An optional character vector of one or more labels used to |
|
14 |
+#' annotate the mapped sequences. The order of the labels must match the order |
|
15 |
+#' of the samples listed in map. |
|
16 |
+#' @param track An optional character vector of one or more amino acid sequences |
|
17 |
+#' to track. |
|
18 |
+#' @param unassigned A Boolean value indicating whether or not to draw the lines |
|
19 |
+#' of sequences not being mapped or tracked. If TRUE then the unassigned |
|
20 |
+#' sequences are drawn. If FALSE, then the unassigned sequences are not drawn. |
|
21 |
+#' @return Returns a line plot showing the amino acid frequencies across |
|
22 |
+#' multiple samples in the sequence matrix where each line represents one |
|
23 |
+#' unique sequence. |
|
24 |
+#' @details The plot is made using the package ggplot2 and can be reformatted |
|
25 |
+#' using ggplot2 functions. See examples below. |
|
26 |
+#' @seealso An excellent resource for examples on how to reformat a ggplot can |
|
27 |
+#' be found in the R Graphics Cookbook online (\url{http://www.cookbook-r.com/Graphs/}). |
|
28 |
+#' @examples |
|
29 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
30 |
+#' |
|
31 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
32 |
+#' |
|
33 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
34 |
+#' |
|
35 |
+#' top.freq <- topFreq(productive.aa = productive.aa, percent = 0.1) |
|
36 |
+#' |
|
37 |
+#' sequence.matrix <- seqMatrix(productive.aa = productive.aa, sequences = top.freq$aminoAcid) |
|
38 |
+#' |
|
39 |
+#' # Track clones without mapping or tracking specific sequences |
|
40 |
+#' cloneTrack(sequence.matrix = sequence.matrix) |
|
41 |
+#' |
|
42 |
+#' # Track top 20 clones mapping to the CD4 and CD8 samples |
|
43 |
+#' cloneTrack(sequence.matrix = sequence.matrix, productive.aa = productive.aa, |
|
44 |
+#' map = c("TCRB_Day949_CD4", "TCRB_Day949_CD8"), label = c("CD4", "CD8"), |
|
45 |
+#' track = top.freq$aminoAcid[1:20], unassigned = TRUE) |
|
46 |
+#' |
|
47 |
+#' # Track the top 10 clones from top.freq |
|
48 |
+#' cloneTrack(sequence.matrix = sequence.matrix, productive.aa = productive.aa, |
|
49 |
+#' track = top.freq$aminoAcid[1:10], unassigned = FALSE) |
|
50 |
+#' |
|
51 |
+#' # Track clones mapping to the CD4 and CD8 samples while ignoring all others |
|
52 |
+#' cloneTrack(sequence.matrix = sequence.matrix, productive.aa = productive.aa, |
|
53 |
+#' map = c("TCRB_Day949_CD4", "TCRB_Day949_CD8"), label = c("CD4", "CD8"), |
|
54 |
+#' unassigned = FALSE) |
|
55 |
+#' |
|
56 |
+#' # Track clones mapping to the CD4 and CD8 samples and track 2 specific sequences |
|
57 |
+#' cloneTrack(sequence.matrix = sequence.matrix, productive.aa = productive.aa, |
|
58 |
+#' map = c("TCRB_Day949_CD4", "TCRB_Day949_CD8"), label = c("CD4", "CD8"), |
|
59 |
+#' track = c("CASSPPTGERDTQYF", "CASSQDRTGQYGYTF"), unassigned = FALSE) |
|
60 |
+#' |
|
61 |
+#' # Reorder the x axis, change the axis labels, convert to log scale, and add title |
|
62 |
+#' x.limits <- c("TCRB_Day0_Unsorted", "TCRB_Day32_Unsorted", |
|
63 |
+#' "TCRB_Day83_Unsorted", "TCRB_Day949_Unsorted", "TCRB_Day1320_Unsorted") |
|
64 |
+#' |
|
65 |
+#' sequence.matrix <- sequence.matrix[ ,c("aminoAcid", x.limits)] |
|
66 |
+#' |
|
67 |
+#' clone.track <- cloneTrack(sequence.matrix = sequence.matrix, |
|
68 |
+#' productive.aa = productive.aa, track = top.freq$aminoAcid[1:10], unassigned = FALSE) |
|
69 |
+#' |
|
70 |
+#' x.labels <- c("Day 0", "Day 32", "Day 83", "Day 949", "Day 1320") |
|
71 |
+#' |
|
72 |
+#' clone.track + |
|
73 |
+#' ggplot2::scale_x_discrete(expand = c(0,0), labels = x.labels) + |
|
74 |
+#' ggplot2::scale_y_log10() + ggplot2::annotation_logticks(sides = "l") + |
|
75 |
+#' ggplot2::ggtitle("Figure Title") |
|
76 |
+#' @export |
|
77 |
+#' @import ggplot2 |
|
78 |
+#' @importFrom RColorBrewer brewer.pal |
|
79 |
+#' @importFrom reshape melt.data.frame |
|
80 |
+cloneTrack <- function(sequence.matrix, map = "none", productive.aa, |
|
81 |
+ label = "none", track = "none", unassigned = TRUE) { |
|
82 |
+ if (any(map != "none")) { |
|
83 |
+ sequence.matrix$numberSamples <- NULL |
|
84 |
+ sequence.melt <- reshape::melt.data.frame(sequence.matrix, |
|
85 |
+ id.vars = "aminoAcid") |
|
86 |
+ sequence.melt$map <- rep("Unassigned", length(sequence.matrix$aminoAcid)) |
|
87 |
+ if (any(track != "none")) { |
|
88 |
+ i <- 1 |
|
89 |
+ for (i in 1:length(track)) { |
|
90 |
+ sequence.melt$map <- ifelse(sequence.melt$aminoAcid == track[i], |
|
91 |
+ track[i], sequence.melt$map) |
|
92 |
+ } |
|
93 |
+ } |
|
94 |
+ j <- 1 |
|
95 |
+ for (j in 1:length(map)) { |
|
96 |
+ file <- productive.aa[[map[j]]] |
|
97 |
+ aminoAcid <- as.character(file$aminoAcid) |
|
98 |
+ sequence.melt$map <- ifelse(sequence.melt$aminoAcid %in% aminoAcid & |
|
99 |
+ sequence.melt$map == "Unassigned", |
|
100 |
+ label[j], sequence.melt$map) |
|
101 |
+ } |
|
102 |
+ if (unassigned == FALSE) { |
|
103 |
+ sequence.melt <- sequence.melt[sequence.melt$map != "Unassigned", ] |
|
104 |
+ } |
|
105 |
+ getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1")) |
|
106 |
+ plot <- ggplot2::ggplot(sequence.melt, |
|
107 |
+ aes_string(x = "variable", |
|
108 |
+ y = "value", |
|
109 |
+ group = "aminoAcid", |
|
110 |
+ color = "map")) + |
|
111 |
+ geom_line() + |
|
112 |
+ theme_minimal() + |
|
113 |
+ scale_x_discrete(expand = c(0, 0)) + |
|
114 |
+ scale_y_continuous(expand = c(0, 0)) + |
|
115 |
+ labs(x = "", y = "Frequency (%)", color = "") + |
|
116 |
+ scale_colour_manual(values = c(getPalette(length(label) + |
|
117 |
+ length(track) + 1))) + |
|
118 |
+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) |
|
119 |
+ return(plot) |
|
120 |
+ } else { |
|
121 |
+ sequence.matrix$numberSamples <- NULL |
|
122 |
+ sequence.melt <- reshape::melt.data.frame(sequence.matrix, id.vars = "aminoAcid") |
|
123 |
+ if (any(track != "none")) { |
|
124 |
+ sequence.melt$map <- rep("Unassigned", length(sequence.matrix$aminoAcid)) |
|
125 |
+ k <- 1 |
|
126 |
+ for (k in 1:length(track)) { |
|
127 |
+ sequence.melt$map <- ifelse(sequence.melt$aminoAcid == track[k], |
|
128 |
+ track[k], sequence.melt$map) |
|
129 |
+ } |
|
130 |
+ if (unassigned == FALSE) { |
|
131 |
+ sequence.melt <- sequence.melt[sequence.melt$map != "Unassigned", |
|
132 |
+ ] |
|
133 |
+ } |
|
134 |
+ getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1")) |
|
135 |
+ plot <- ggplot2::ggplot(sequence.melt, aes_string(x = "variable", |
|
136 |
+ y = "value", |
|
137 |
+ group = "aminoAcid", |
|
138 |
+ color = "map")) + |
|
139 |
+ geom_line() + theme_minimal() + |
|
140 |
+ scale_x_discrete(expand = c(0,0)) + |
|
141 |
+ scale_y_continuous(expand = c(0, 0)) + |
|
142 |
+ labs(x = "", y = "Frequency (%)", color = "") + |
|
143 |
+ scale_colour_manual(values = getPalette(length(track) + 1)) + |
|
144 |
+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) |
|
145 |
+ return(plot) |
|
146 |
+ } |
|
147 |
+ plot <- ggplot2::ggplot(sequence.melt, aes_string(x = "variable", |
|
148 |
+ y = "value", |
|
149 |
+ group = "aminoAcid")) + |
|
150 |
+ geom_line() + |
|
151 |
+ theme_minimal() + |
|
152 |
+ scale_x_discrete(expand = c(0, 0)) + |
|
153 |
+ scale_y_continuous(expand = c(0, 0)) + |
|
154 |
+ labs(x = "", y = "Frequency (%)") + |
|
155 |
+ theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) |
|
156 |
+ return(plot) |
|
157 |
+ } |
|
158 |
+} |
|
0 | 159 |
\ No newline at end of file |
1 | 160 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,32 @@ |
1 |
+#' Common sequences in two or more samples |
|
2 |
+#' |
|
3 |
+#' Creates a data frame of the common sequences in two or more samples, reporting |
|
4 |
+#' their frequencies in each. |
|
5 |
+#' |
|
6 |
+#' @param samples A character vector of two or more sample names in |
|
7 |
+#' productive.aa. |
|
8 |
+#' @param productive.aa A list of productive amino acid sequences generated |
|
9 |
+#' by the LymphoSeq function productiveSeq where aggregate = "aminoAcid". |
|
10 |
+#' @return Returns a data frame of the common sequences between two or more files |
|
11 |
+#' displaying their frequencies in each. |
|
12 |
+#' @seealso \code{\link{commonSeqsVenn}} |
|
13 |
+#' @examples |
|
14 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
15 |
+#' |
|
16 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
17 |
+#' |
|
18 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
19 |
+#' |
|
20 |
+#' commonSeqs(samples = c("TCRB_Day0_Unsorted", "TCRB_Day32_Unsorted"), |
|
21 |
+#' productive.aa = productive.aa) |
|
22 |
+#' @export |
|
23 |
+#' @importFrom plyr llply |
|
24 |
+#' @import ggplot2 |
|
25 |
+commonSeqs <- function(samples, productive.aa) { |
|
26 |
+ aminoAcid <- plyr::llply(productive.aa[samples], |
|
27 |
+ function(x) x[, "aminoAcid"]) |
|
28 |
+ common <- Reduce(intersect, aminoAcid) |
|
29 |
+ seq.matrix <- seqMatrix(productive.aa[samples], common) |
|
30 |
+ seq.matrix$numberSamples <- NULL |
|
31 |
+ return(seq.matrix) |
|
32 |
+} |
|
0 | 33 |
\ No newline at end of file |
1 | 34 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,59 @@ |
1 |
+#' Common sequences plot |
|
2 |
+#' |
|
3 |
+#' Creates a scatter plot of just the sequences in common between two samples. |
|
4 |
+#' |
|
5 |
+#' @param sample1 A name of a sample in a list of data frames generated by the |
|
6 |
+#' LymphoSeq function productiveSeq. |
|
7 |
+#' @param sample2 A name of a sample in a list of data frames generated by the |
|
8 |
+#' LymphoSeq function productiveSeq. |
|
9 |
+#' @param productive.aa A list of data frames of productive amino acid sequences |
|
10 |
+#' produced by the LymphoSeq function productiveSeq containing the |
|
11 |
+#' samples to be compared. |
|
12 |
+#' @return Returns a frequency scatter plot of two samples showing only the |
|
13 |
+#' shared sequences. |
|
14 |
+#' @details The plot is made using the package ggplot2 and can be reformatted |
|
15 |
+#' using ggplot2 functions. See examples below. |
|
16 |
+#' @seealso An excellent resource for examples on how to reformat a ggplot can |
|
17 |
+#' be found in the R Graphics Cookbook online (\url{http://www.cookbook-r.com/Graphs/}). |
|
18 |
+#' @examples |
|
19 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
20 |
+#' |
|
21 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
22 |
+#' |
|
23 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
24 |
+#' |
|
25 |
+#' commonSeqsPlot("TCRB_Day32_Unsorted", "TCRB_Day83_Unsorted", |
|
26 |
+#' productive.aa = productive.aa) |
|
27 |
+#' |
|
28 |
+#' # Change the X and Y axises to log-10 scale |
|
29 |
+#' commonSeqsPlot("TCRB_Day32_Unsorted", "TCRB_Day83_Unsorted", |
|
30 |
+#' productive.aa = productive.aa) + |
|
31 |
+#' ggplot2::scale_x_log10() + |
|
32 |
+#' ggplot2::scale_y_log10() + |
|
33 |
+#' ggplot2::annotation_logticks(sides = "bl") |
|
34 |
+#' @export |
|
35 |
+#' @import ggplot2 |
|
36 |
+commonSeqsPlot <- function(sample1, sample2, productive.aa) { |
|
37 |
+ if(any(unlist(lapply(productive.aa, function(x) |
|
38 |
+ x[, "aminoAcid"] == "" | |
|
39 |
+ grepl("\\*", x[, "aminoAcid"]) | |
|
40 |
+ duplicated(x[, "aminoAcid"]))))){ |
|
41 |
+ stop("Your list contains unproductive sequences or has not been aggreated for productive amino acid sequences. Remove unproductive sequences first using the function productiveSeq with the aggregate parameter set to 'aminoAcid'.", call. = FALSE) |
|
42 |
+ } |
|
43 |
+ a <- productive.aa[[sample1]] |
|
44 |
+ b <- productive.aa[[sample2]] |
|
45 |
+ common <- intersect(a$aminoAcid, b$aminoAcid) |
|
46 |
+ common.seq <- data.frame(aminoAcid = common) |
|
47 |
+ common.a <- a[a$aminoAcid %in% common, ] |
|
48 |
+ common.b <- b[b$aminoAcid %in% common, ] |
|
49 |
+ common.seq$freq.a <- common.a$freq |
|
50 |
+ common.seq$freq.b <- common.b$freq |
|
51 |
+ common.seq$difference <- common.a$freq - common.b$freq |
|
52 |
+ common.seq$log.fold.change <- log(common.a$freq/common.b$freq) |
|
53 |
+ common.seq <- common.seq[order(common.seq$log.fold.change, decreasing = TRUE), ] |
|
54 |
+ plot <- ggplot2::ggplot(data = common.seq, aes_string("freq.a", "freq.b")) + |
|
55 |
+ geom_point(size = 3) + |
|
56 |
+ theme_minimal() + |
|
57 |
+ labs(x = paste(sample1, "frequency (%)"), y = paste(sample2, "frequency (%)")) |
|
58 |
+ return(plot) |
|
59 |
+} |
|
0 | 60 |
\ No newline at end of file |
1 | 61 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,84 @@ |
1 |
+#' Common sequences Venn diagram |
|
2 |
+#' |
|
3 |
+#' Creates a Venn diagram comparing the number of common sequences in two or |
|
4 |
+#' three samples. |
|
5 |
+#' |
|
6 |
+#' @param samples A character vector of two or three names of samples in |
|
7 |
+#' productive.seqs to compare. |
|
8 |
+#' @param productive.seqs A list of productive amino acid sequences generated |
|
9 |
+#' by the LymphoSeq function productiveSeq. |
|
10 |
+#' @return Returns a a Venn diagram of the number of common sequences between |
|
11 |
+#' two or three samples. |
|
12 |
+#' @seealso \code{\link{commonSeqs}} |
|
13 |
+#' @examples |
|
14 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
15 |
+#' |
|
16 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
17 |
+#' |
|
18 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
19 |
+#' |
|
20 |
+#' # Plot a triple Venn diagram |
|
21 |
+#' commonSeqsVenn(samples = c("TCRB_Day0_Unsorted", |
|
22 |
+#' "TCRB_Day32_Unsorted", "TCRB_Day83_Unsorted"), |
|
23 |
+#' productive.seqs = productive.aa) |
|
24 |
+#' |
|
25 |
+#' # Plot a double Venn diagram |
|
26 |
+#' commonSeqsVenn(samples = c("TCRB_Day0_Unsorted", |
|
27 |
+#' "TCRB_Day32_Unsorted"), productive.seqs = productive.aa) |
|
28 |
+#' |
|
29 |
+#' # Save Venn diagram as a .png file to working directory |
|
30 |
+#' png(filename = "Venn diagram.png", res = 300, units = "in", height = 5, width = 5) |
|
31 |
+#' |
|
32 |
+#' commonSeqsVenn(samples = c("TCRB_Day0_Unsorted", "TCRB_Day32_Unsorted"), |
|
33 |
+#' productive.seqs = productive.aa) |
|
34 |
+#' |
|
35 |
+#' dev.off() |
|
36 |
+#' @export |
|
37 |
+#' @importFrom VennDiagram draw.pairwise.venn draw.triple.venn |
|
38 |
+#' @importFrom grid grid.newpage grid.draw |
|
39 |
+commonSeqsVenn <- function(samples, productive.seqs) { |
|
40 |
+ if (length(samples) > 3 | length(samples) < 2) { |
|
41 |
+ stop("Please enter 2 or 3 samples.") |
|
42 |
+ } |
|
43 |
+ if (length(samples) == 2) { |
|
44 |
+ a <- productive.seqs[[samples[1]]] |
|
45 |
+ b <- productive.seqs[[samples[2]]] |
|
46 |
+ grid::grid.newpage() |
|
47 |
+ venn <- VennDiagram::draw.pairwise.venn(area1 = length(a$aminoAcid), |
|
48 |
+ area2 = length(b$aminoAcid), |
|
49 |
+ cross.area = length(intersect(a$aminoAcid, b$aminoAcid)), |
|
50 |
+ category = c(samples[1], samples[2]), |
|
51 |
+ cat.fontfamily = rep("sans", 2), |
|
52 |
+ fontfamily = rep("sans", 3), |
|
53 |
+ fill = c("#3288bd", "#d53e4f"), |
|
54 |
+ cat.pos = c(0, 0), |
|
55 |
+ cat.dist = rep(0.025, 2), |
|
56 |
+ cex = 1, |
|
57 |
+ cat.cex = 0.7, |
|
58 |
+ lwd = rep(2, 2)) |
|
59 |
+ grid::grid.draw(venn) |
|
60 |
+ } |
|
61 |
+ if (length(samples) == 3) { |
|
62 |
+ a <- productive.seqs[[samples[1]]] |
|
63 |
+ b <- productive.seqs[[samples[2]]] |
|
64 |
+ c <- productive.seqs[[samples[3]]] |
|
65 |
+ grid::grid.newpage() |
|
66 |
+ venn <- VennDiagram::draw.triple.venn(area1 = length(a$aminoAcid), |
|
67 |
+ area2 = length(b$aminoAcid), |
|
68 |
+ area3 = length(c$aminoAcid), |
|
69 |
+ n12 = length(intersect(a$aminoAcid, b$aminoAcid)), |
|
70 |
+ n23 = length(intersect(b$aminoAcid, c$aminoAcid)), |
|
71 |
+ n13 = length(intersect(a$aminoAcid, c$aminoAcid)), |
|
72 |
+ n123 = length(Reduce(intersect, list(a$aminoAcid, b$aminoAcid, c$aminoAcid))), |
|
73 |
+ category = c(samples[1], samples[2], samples[3]), |
|
74 |
+ cat.fontfamily = rep("sans", 3), |
|
75 |
+ fontfamily = rep("sans", 7), |
|
76 |
+ fill = c("#3288bd", "#abdda4", "#d53e4f"), |
|
77 |
+ cat.pos = c(0, 0, 180), |
|
78 |
+ cat.dist = rep(0.025, 3), |
|
79 |
+ cex = 1, |
|
80 |
+ cat.cex = 0.7, |
|
81 |
+ lwd = rep(2, 3)) |
|
82 |
+ grid::grid.draw(venn) |
|
83 |
+ } |
|
84 |
+} |
|
0 | 85 |
\ No newline at end of file |
1 | 86 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,127 @@ |
1 |
+#' Gene frequencies |
|
2 |
+#' |
|
3 |
+#' Creates a data frame of VDJ gene counts and frequencies. |
|
4 |
+#' |
|
5 |
+#' @param productive.nt A list of one or more data frames of productive sequences |
|
6 |
+#' generated by the LymphoSeq function productiveSeq where the parameter |
|
7 |
+#' aggregate is set to "nucleotide". |
|
8 |
+#' @param locus A character vector indicating which VDJ genes to include in the |
|
9 |
+#' output. Available options include "VDJ", "DJ", "VJ", "DJ", "V", "D", or "J". |
|
10 |
+#' @param family A Boolean value indicating whether or not family names instead |
|
11 |
+#' of gene names are used. If TRUE, then family names are used and if FALSE, |
|
12 |
+#' gene names are used. |
|
13 |
+#' @return Returns a data frame with the sample names, VDJ gene name, count, and |
|
14 |
+#' \% frequency of the V, D, or J genes (each gene frequency should add to |
|
15 |
+#' 100\% for each sample). |
|
16 |
+#' @examples |
|
17 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
18 |
+#' |
|
19 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
20 |
+#' |
|
21 |
+#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide") |
|
22 |
+#' |
|
23 |
+#' geneFreq(productive.nt, locus = "VDJ", family = FALSE) |
|
24 |
+#' |
|
25 |
+#' # Create a heat map of V gene usage |
|
26 |
+#' vfamilies <- geneFreq(productive.nt, locus = "V", family = TRUE) |
|
27 |
+#' |
|
28 |
+#' require(reshape) |
|
29 |
+#' |
|
30 |
+#' vfamilies <- reshape::cast(vfamilies, familyName ~ samples, value = "frequencyGene", sum) |
|
31 |
+#' |
|
32 |
+#' rownames(vfamilies) <- as.character(vfamilies$familyName) |
|
33 |
+#' |
|
34 |
+#' vfamilies$familyName <- NULL |
|
35 |
+#' |
|
36 |
+#' RedBlue <- grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(11, "RdBu")))(256) |
|
37 |
+#' |
|
38 |
+#' require(pheatmap) |
|
39 |
+#' |
|
40 |
+#' pheatmap::pheatmap(vfamilies, color = RedBlue, scale = "row") |
|
41 |
+#' |
|
42 |
+#' # Create a word cloud of V gene usage |
|
43 |
+#' vgenes <- geneFreq(productive.nt, locus = "V", family = FALSE) |
|
44 |
+#' |
|
45 |
+#' require(wordcloud) |
|
46 |
+#' |
|
47 |
+#' wordcloud::wordcloud(words = vgenes[vgenes$samples == "TCRB_Day83_Unsorted", "geneName"], |
|
48 |
+#' freq = vgenes[vgenes$samples == "TCRB_Day83_Unsorted", "frequencyGene"], |
|
49 |
+#' colors = RedBlue) |
|
50 |
+#' |
|
51 |
+#' # Create a cumulative frequency bar plot of V gene usage |
|
52 |
+#' vgenes <- geneFreq(productive.nt, locus = "V", family = FALSE) |
|
53 |
+#' |
|
54 |
+#' require(ggplot2) |
|
55 |
+#' |
|
56 |
+#' ggplot2::ggplot(vgenes, aes(x = samples, y = frequencyGene, fill = geneName)) + |
|
57 |
+#' geom_bar(stat = "identity") + |
|
58 |
+#' theme_minimal() + |
|
59 |
+#' scale_y_continuous(expand = c(0, 0)) + |
|
60 |
+#' guides(fill = guide_legend(ncol = 3)) + |
|
61 |
+#' labs(y = "Frequency (%)", x = "", fill = "") + |
|
62 |
+#' theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) |
|
63 |
+#' @export |
|
64 |
+#' @importFrom stats aggregate |
|
65 |
+geneFreq <- function(productive.nt, locus = "VDJ", family = FALSE) { |
|
66 |
+ gene.names <- data.frame() |
|
67 |
+ if (family == FALSE) { |
|
68 |
+ i <- 1 |
|
69 |
+ for (i in 1:length(productive.nt)) { |
|
70 |
+ file <- productive.nt[[i]] |
|
71 |
+ file[file == ""] <- "unresolved" |
|
72 |
+ name <- names(productive.nt) |
|
73 |
+ v <- data.frame() |
|
74 |
+ d <- data.frame() |
|
75 |
+ j <- data.frame() |
|
76 |
+ if (grepl("V|v", locus)) { |
|
77 |
+ v <- aggregate(count ~ vGeneName, data = file, FUN = sum) |
|
78 |
+ v <- data.frame(samples = name[i], geneName = v$vGeneName, |
|
79 |
+ count = v$count, |
|
80 |
+ frequencyGene = v$count/sum(v$count) * 100) |
|
81 |
+ } |
|
82 |
+ if (grepl("D|d", locus)) { |
|
83 |
+ d <- aggregate(count ~ dGeneName, data = file, FUN = sum) |
|
84 |
+ d <- data.frame(samples = name[i], geneName = d$dGeneName, |
|
85 |
+ count = d$count, |
|
86 |
+ frequencyGene = d$count/sum(d$count) * 100) |
|
87 |
+ } |
|
88 |
+ if (grepl("J|j", locus)) { |
|
89 |
+ j <- aggregate(count ~ jGeneName, data = file, FUN = sum) |
|
90 |
+ j <- data.frame(samples = name[i], geneName = j$jGeneName, |
|
91 |
+ count = j$count, |
|
92 |
+ frequencyGene = j$count/sum(j$count) * 100) |
|
93 |
+ } |
|
94 |
+ gene.names <- rbind(v, d, j, gene.names) |
|
95 |
+ } |
|
96 |
+ } else { |
|
97 |
+ i <- 1 |
|
98 |
+ for (i in 1:length(productive.nt)) { |
|
99 |
+ file <- productive.nt[[i]] |
|
100 |
+ file[file == ""] <- "unresolved" |
|
101 |
+ name <- names(productive.nt) |
|
102 |
+ v <- data.frame() |
|
103 |
+ d <- data.frame() |
|
104 |
+ j <- data.frame() |
|
105 |
+ if (grepl("V|v", locus)) { |
|
106 |
+ v <- aggregate(count ~ vFamilyName, data = file, FUN = sum) |
|
107 |
+ v <- data.frame(samples = name[i], familyName = v$vFamilyName, |
|
108 |
+ count = v$count, |
|
109 |
+ frequencyGene = v$count/sum(v$count) * 100) |
|
110 |
+ } |
|
111 |
+ if (grepl("D|d", locus)) { |
|
112 |
+ d <- aggregate(count ~ dFamilyName, data = file, FUN = sum) |
|
113 |
+ d <- data.frame(samples = name[i], familyName = d$dFamilyName, |
|
114 |
+ count = d$count, |
|
115 |
+ frequencyGene = d$count/sum(d$count) * 100) |
|
116 |
+ } |
|
117 |
+ if (grepl("J|j", locus)) { |
|
118 |
+ j <- aggregate(count ~ jFamilyName, data = file, FUN = sum) |
|
119 |
+ j <- data.frame(samples = name[i], familyName = j$jFamilyName, |
|
120 |
+ count = j$count, |
|
121 |
+ frequencyGene = j$count/sum(j$count) * 100) |
|
122 |
+ } |
|
123 |
+ gene.names <- rbind(v, d, j, gene.names) |
|
124 |
+ } |
|
125 |
+ } |
|
126 |
+ return(gene.names) |
|
127 |
+} |
|
0 | 128 |
\ No newline at end of file |
1 | 129 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,72 @@ |
1 |
+#' Lorenz curve |
|
2 |
+#' |
|
3 |
+#' Plots a Lorenz curve derived from the frequency of the amino acid sequences. |
|
4 |
+#' |
|
5 |
+#' @param samples A character vector of sample names in list. |
|
6 |
+#' @param list A list data frames generated using the LymphoSeq function readImmunoSeq |
|
7 |
+#' or productiveSeq. "frequencyCount" is a required column. |
|
8 |
+#' @return Returns a Lorenz curve. |
|
9 |
+#' @details The Gini coefficient is an alternative metric used to calculate |
|
10 |
+#' repertoire diversity and is derived from the Lorenz curve. The Lorenz curve |
|
11 |
+#' is drawn such that x-axis represents the cumulative percentage of unique |
|
12 |
+#' sequences and the y-axis represents the cumulative percentage of reads. A |
|
13 |
+#' line passing through the origin with a slope of 1 reflects equal frequencies |
|
14 |
+#' of all sequences. The Gini coefficient is the ratio of the area between the |
|
15 |
+#' line of equality and the observed Lorenz curve over the total area under the |
|
16 |
+#' line of equality. |
|
17 |
+#' |
|
18 |
+#' The plot is made using the package ggplot2 and can be reformatted |
|
19 |
+#' using ggplot2 functions. See examples below. |
|
20 |
+#' @seealso An excellent resource for examples on how to reformat a ggplot can |
|
21 |
+#' be found in the R Graphics Cookbook online (\url{http://www.cookbook-r.com/Graphs/}). |
|
22 |
+#' @examples |
|
23 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
24 |
+#' |
|
25 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
26 |
+#' |
|
27 |
+#' lorenzCurve(samples = names(file.list), list = file.list) |
|
28 |
+#' |
|
29 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
30 |
+#' |
|
31 |
+#' lorenzCurve(samples = names(productive.aa), list = productive.aa) |
|
32 |
+#' |
|
33 |
+#' # Change the legend labels, line colors, and add a title |
|
34 |
+#' samples <- c("TCRB_Day0_Unsorted", "TCRB_Day32_Unsorted", |
|
35 |
+#' "TCRB_Day83_Unsorted", "TCRB_Day949_Unsorted", "TCRB_Day1320_Unsorted") |
|
36 |
+#' |
|
37 |
+#' lorenz.curve <- lorenzCurve(samples = samples, list = productive.aa) |
|
38 |
+#' |
|
39 |
+#' labels <- c("Day 0", "Day 32", "Day 83", "Day 949", "Day 1320") |
|
40 |
+#' |
|
41 |
+#' colors <- c("navyblue", "red", "darkgreen", "orange", "purple") |
|
42 |
+#' |
|
43 |
+#' lorenz.curve + ggplot2::scale_color_manual(name = "Samples", breaks = samples, |
|
44 |
+#' labels = labels, values = colors) + ggplot2::ggtitle("Figure Title") |
|
45 |
+#' @export |
|
46 |
+#' @import ggplot2 |
|
47 |
+#' @importFrom RColorBrewer brewer.pal |
|
48 |
+#' @importFrom ineq Lc |
|
49 |
+lorenzCurve <- function(samples, list) { |
|
50 |
+ lorenz <- data.frame() |
|
51 |
+ i <- 1 |
|
52 |
+ for (i in 1:length(samples)) { |
|
53 |
+ sample <- samples[i] |
|
54 |
+ file <- list[[sample]] |
|
55 |
+ lc <- ineq::Lc(file$frequencyCount) |
|
56 |
+ lcdf <- data.frame(L = lc$L, p = lc$p) |
|
57 |
+ lcdf$sample <- rep(sample, nrow(lcdf)) |
|
58 |
+ lorenz <- rbind(lcdf, lorenz) |
|
59 |
+ } |
|
60 |
+ getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1")) |
|
61 |
+ plot <- ggplot2::ggplot(lorenz, aes_string(x = "p", y = "L", color = "sample")) + |
|
62 |
+ geom_line(size = 1) + |
|
63 |
+ theme_minimal() + |
|
64 |
+ scale_color_manual(values = getPalette(length(samples) + 1)) + |
|
65 |
+ scale_y_continuous(expand = c(0, 0)) + |
|
66 |
+ scale_x_continuous(expand = c(0,0)) + |
|
67 |
+ geom_abline(intercept = 0, slope = 1, color = "grey", linetype = 2) + |
|
68 |
+ coord_fixed() + |
|
69 |
+ labs(x = "Cumulative percentage of unique sequences", |
|
70 |
+ y = "Cumulative percentage of reads", color = "") |
|
71 |
+ return(plot) |
|
72 |
+} |
0 | 73 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,49 @@ |
1 |
+#' Merge files |
|
2 |
+#' |
|
3 |
+#' Merges two or more sample data frames into a single data frame and aggregates |
|
4 |
+#' count, frequencyCount, and estimatedNumberGenomes. |
|
5 |
+#' |
|
6 |
+#' @param samples A character vector of the names of the samples that are to be |
|
7 |
+#' merged in file.list. |
|
8 |
+#' @param file.list A list of data frames imported using the LymphoSeq function |
|
9 |
+#' readImmunoSeq that contain the sample names that are to be merged. "aminoAcid", |
|
10 |
+#' "nucleotide", "count" and "frequencyCount" are required columns. |
|
11 |
+#' @return Returns a data frame of the merged samples. The values of count, |
|
12 |
+#' frequencyCount, and estimatedNumberGenomes are aggregated. That is, the sum |
|
13 |
+#' of count and estimatedNumberGenomes columns of the merged data frame should |
|
14 |
+#' equal the sum of the columns from the unmerged samples. Likewise, the |
|
15 |
+#' frequencyCount of the merged data frame should add up to 100\%. All other |
|
16 |
+#' columns in the unmerged data frames are included in the merge data frame. |
|
17 |
+#' @examples |
|
18 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
19 |
+#' |
|
20 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
21 |
+#' |
|
22 |
+#' TCRB_Day949_Merged <- mergeFiles(samples = c("TCRB_Day949_CD4", |
|
23 |
+#' "TCRB_Day949_CD8"), file.list) |
|
24 |
+#' |
|
25 |
+#' # To combine the merged data frames with file.list |
|
26 |
+#' file.list <- c(list("TCRB_Day949_Merged" = TCRB_Day949_Merged), file.list) |
|
27 |
+#' @export |
|
28 |
+#' @importFrom stats aggregate |
|
29 |
+mergeFiles <- function(samples, file.list) { |
|
30 |
+ merged <- data.frame() |
|
31 |
+ i <- 1 |
|
32 |
+ for (i in 1:length(samples)) { |
|
33 |
+ file <- file.list[[as.character(samples[i])]] |
|
34 |
+ merged <- rbind(merged, file) |
|
35 |
+ } |
|
36 |
+ count <- aggregate(count ~ nucleotide, data = merged, FUN = sum) |
|
37 |
+ if (any(grepl("estimatedNumberGenomes", colnames(merged)))) { |
|
38 |
+ merged$estimatedNumberGenomes[is.na(merged$estimatedNumberGenomes)] <- 0 |
|
39 |
+ estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ nucleotide, |
|
40 |
+ data = merged, FUN = sum) |
|
41 |
+ } |
|
42 |
+ merged$count <- NULL |
|
43 |
+ merged$estimatedNumberGenomes <- NULL |
|
44 |
+ merged <- merged[!duplicated(merged$nucleotide), ] |
|
45 |
+ merged <- Reduce(function(x, y) |
|
46 |
+ merge(x, y, by = "nucleotide"), list(count, estimatedNumberGenomes, merged)) |
|
47 |
+ merged$frequencyCount <- (merged$count/sum(merged$count)) * 100 |
|
48 |
+ return(merged) |
|
49 |
+} |
|
0 | 50 |
\ No newline at end of file |
1 | 51 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,57 @@ |
1 |
+#' Pairwise comparison plot |
|
2 |
+#' |
|
3 |
+#' Creates a heat map from a similarity or Bhattacharyya matrix. |
|
4 |
+#' |
|
5 |
+#' @param matrix A similarity or Bhattacharyya matrix produced by the LymphoSeq |
|
6 |
+#' functions similarityMatrix or bhattacharyyaMatrix. |
|
7 |
+#' @return A pairwise comparison heat map. |
|
8 |
+#' @details The plot is made using the package ggplot2 and can be reformatted |
|
9 |
+#' using ggplot2 functions. See examples below. |
|
10 |
+#' @seealso An excellent resource for examples on how to reformat a ggplot can |
|
11 |
+#' be found in the R Graphics Cookbook online (\url{http://www.cookbook-r.com/Graphs/}). |
|
12 |
+#' The functions to create the similarity or Bhattacharyya matrix can be found |
|
13 |
+#' here: \code{\link{similarityMatrix}} and \code{\link{bhattacharyyaMatrix}} |
|
14 |
+#' @examples |
|
15 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
16 |
+#' |
|
17 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
18 |
+#' |
|
19 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
20 |
+#' |
|
21 |
+#' similarity.matrix <- similarityMatrix(productive.seqs = productive.aa) |
|
22 |
+#' |
|
23 |
+#' pairwisePlot(matrix = similarity.matrix) |
|
24 |
+#' |
|
25 |
+#' bhattacharyya.matrix <- bhattacharyyaMatrix(productive.seqs = productive.aa) |
|
26 |
+#' |
|
27 |
+#' pairwisePlot(matrix = bhattacharyya.matrix) |
|
28 |
+#' |
|
29 |
+#' # Change plot color, title legend, and add title |
|
30 |
+#' pairwisePlot(matrix = similarity.matrix) + |
|
31 |
+#' ggplot2::scale_fill_gradient(low = "#deebf7", high = "#3182bd") + |
|
32 |
+#' ggplot2::labs(fill = "Similarity score") + ggplot2::ggtitle("Figure Title") |
|
33 |
+#' @export |
|
34 |
+#' @import ggplot2 |
|
35 |
+#' @importFrom reshape melt.data.frame |
|
36 |
+#' @importFrom stats na.omit |
|
37 |
+pairwisePlot <- function(matrix) { |
|
38 |
+ i <- 1 |
|
39 |
+ l <- length(matrix) |
|
40 |
+ p <- l - 1 |
|
41 |
+ for (i in 1:p) { |
|
42 |
+ j <- i + 1 |
|
43 |
+ matrix[i, j:l] <- NA |
|
44 |
+ } |
|
45 |
+ matrix$Samples <- rownames(matrix) |
|
46 |
+ melt <- reshape::melt.data.frame(matrix, id.vars = "Samples") |
|
47 |
+ melt <- na.omit(melt) |
|
48 |
+ melt$variable <- factor(melt$variable, levels = rownames(matrix)) |
|
49 |
+ melt$Samples <- factor(melt$Samples, levels = rev(rownames(matrix))) |
|
50 |
+ plot <- ggplot(data = melt, aes_string("Samples", "variable", fill = "value")) + |
|
51 |
+ geom_tile() + |
|
52 |
+ scale_fill_gradient(low = "#fee8c8", high = "#e34a33") + |
|
53 |
+ theme_classic() + |
|
54 |
+ labs(x = "", y = "", fill = "") + |
|
55 |
+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) |
|
56 |
+ return(plot) |
|
57 |
+} |
|
0 | 58 |
\ No newline at end of file |
1 | 59 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,68 @@ |
1 |
+#' Productive sequences |
|
2 |
+#' |
|
3 |
+#' Remove unproductive CDR3 sequences from a single data frame. |
|
4 |
+#' |
|
5 |
+#' @param sample A data frame consisting of antigen receptor sequencing data. |
|
6 |
+#' "aminoAcid", "count", and "frequencyCount" are required columns. |
|
7 |
+#' @param aggregate Indicates whether the values of "count", "frequencyCount", |
|
8 |
+#' and "esimatedNumberGenomes" should be aggregated by amino acid or nucleotide |
|
9 |
+#' sequence. Acceptable values are "aminoAcid" or "nucleotide". If "aminoAcid" |
|
10 |
+#' is selected, then the resulting data frame will have columns corresponding to |
|
11 |
+#' "aminoAcid", "count", "frequnecyCount", and "estimatedNumberGenomes" (if this |
|
12 |
+#' column is available). If "nucleotide" is selected then all columns in the |
|
13 |
+#' original data frame will be present in the outputted data frame. The difference in |
|
14 |
+#' output is due to the fact that the same amino acid CDR3 sequence may be |
|
15 |
+#' encoded by multiple unique nucleotide sequences with differing V, D, and J |
|
16 |
+#' genes. |
|
17 |
+#' @return Returns a data frame of productive amino acid sequences with recomputed |
|
18 |
+#' values for "count", "frequencyCount", and "esimatedNumberGenomes". A |
|
19 |
+#' productive sequences is defined as a sequence that is in frame and does not |
|
20 |
+#' have an early stop codon. |
|
21 |
+#' @examples |
|
22 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
23 |
+#' |
|
24 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
25 |
+#' |
|
26 |
+#' productive <- productive(sample = file.list[["TCRB_Day32_Unsorted"]], aggregate = "aminoAcid") |
|
27 |
+#' @seealso \code{\link{productiveSeq}} |
|
28 |
+#' @export |
|
29 |
+#' @importFrom stats aggregate |
|
30 |
+productive <- function(sample, aggregate = "aminoAcid") { |
|
31 |
+ productive.seqs <- sample[which(sample[, "aminoAcid"] != "" & |
|
32 |
+ !grepl("\\*", sample[, "aminoAcid"])), ] |
|
33 |
+ if (any(grepl("estimatedNumberGenomes", colnames(productive.seqs)))) { |
|
34 |
+ if (aggregate == "aminoAcid") { |
|
35 |
+ count <- aggregate(count ~ aminoAcid, data = productive.seqs, FUN = sum) |
|
36 |
+ productive.seqs$estimatedNumberGenomes[is.na(productive.seqs$estimatedNumberGenomes)] <- 0 |
|
37 |
+ estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ aminoAcid, |
|
38 |
+ data = productive.seqs, FUN = sum) |
|
39 |
+ merged <- merge(count, estimatedNumberGenomes, by = "aminoAcid") |
|
40 |
+ merged$frequencyCount <- merged$count/sum(count$count) * 100 |
|
41 |
+ } |
|
42 |
+ if (aggregate == "nucleotide") { |
|
43 |
+ count <- aggregate(count ~ nucleotide, data = productive.seqs, FUN = sum) |
|
44 |
+ productive.seqs$estimatedNumberGenomes[is.na(productive.seqs$estimatedNumberGenomes)] <- 0 |
|
45 |
+ estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ nucleotide, |
|
46 |
+ data = productive.seqs, FUN = sum) |
|
47 |
+ productive.seqs$count <- NULL |
|
48 |
+ productive.seqs$estimatedNumberGenomes <- NULL |
|
49 |
+ merged <- Reduce(function(x, y) merge(x, y, by = "nucleotide"), |
|
50 |
+ list(productive.seqs, count, estimatedNumberGenomes)) |
|
51 |
+ merged$frequencyCount <- merged$count/sum(count$count) * 100 |
|
52 |
+ } |
|
53 |
+ } else { |
|
54 |
+ if (aggregate == "aminoAcid") { |
|
55 |
+ merged <- aggregate(count ~ aminoAcid, data = productive.seqs, FUN = sum) |
|
56 |
+ merged$frequencyCount <- merged$count/sum(merged$count) * 100 |
|
57 |
+ } |
|
58 |
+ if (aggregate == "nucleotide") { |
|
59 |
+ count <- aggregate(count ~ nucleotide, data = productive.seqs, FUN = sum) |
|
60 |
+ productive.seqs$count <- NULL |
|
61 |
+ merged <- merge(productive.seqs, count, by = "nucleotide") |
|
62 |
+ merged$frequencyCount <- merged$count/sum(merged$count) * 100 |
|
63 |
+ } |
|
64 |
+ } |
|
65 |
+ merged <- merged[order(merged$frequencyCount, decreasing = TRUE), intersect(names(sample), names(merged))] |
|
66 |
+ rownames(merged) <- NULL |
|
67 |
+ return(merged) |
|
68 |
+} |
|
0 | 69 |
\ No newline at end of file |
1 | 70 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,65 @@ |
1 |
+#' Productive sequences |
|
2 |
+#' |
|
3 |
+#' Remove unproductive CDR3 sequences from a list of data frames. |
|
4 |
+#' |
|
5 |
+#' @param file.list A list of data frames consisting antigen receptor sequencing |
|
6 |
+#' data imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", and |
|
7 |
+#' "frequencyCount" are required columns. |
|
8 |
+#' @param aggregate Indicates whether the values of "count", "frequencyCount", |
|
9 |
+#' and "esimatedNumberGenomes" should be aggregated by amino acid or nucleotide |
|
10 |
+#' sequence. Acceptable values are "aminoAcid" or "nucleotide". If "aminoAcid" |
|
11 |
+#' is selected, then resulting data frame will have columns corresponding to |
|
12 |
+#' aminoAcid, count, frequencyCount, and estimatedNumberGenomes (if this |
|
13 |
+#' column is available). If "nucleotide" is selected then all columns in the |
|
14 |
+#' original list will be present in the outputted list. The difference in |
|
15 |
+#' output is due to the fact that the same amino acid CDR3 sequence may be |
|
16 |
+#' encoded by multiple unique nucleotide sequences with differing V, D, and J |
|
17 |
+#' genes. |
|
18 |
+#' @param prevalence A Boolean value indicating if a new column should be added |
|
19 |
+#' to each of the data frames giving the prevalence of each CDR3 amino acid |
|
20 |
+#' sequence in 55 healthy donor peripheral blood samples. TRUE means the column |
|
21 |
+#' is added and FALSE means it is not. Values range from 0 to 100\% where |
|
22 |
+#' 100\% means the sequence appeared in the blood of all 55 individuals. The |
|
23 |
+#' prevalenceTRB database is located in a separate package called LymphoSeqDB |
|
24 |
+#' that should be loaded automatically. |
|
25 |
+#' @return Returns a list of data frames of productive amino acid sequences with |
|
26 |
+#' recomputed values for "count", "frequencyCount", and |
|
27 |
+#' "esimatedNumberGenomes". A productive sequences is defined as a sequences |
|
28 |
+#' that is in frame and does not have an early stop codon. |
|
29 |
+#' @examples |
|
30 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
31 |
+#' |
|
32 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
33 |
+#' |
|
34 |
+#' productive.nt <- productiveSeq(file.list = file.list, |
|
35 |
+#' aggregate = "nucleotide", prevalence = FALSE) |
|
36 |
+#' |
|
37 |
+#' productive.aa <- productiveSeq(file.list = file.list, |
|
38 |
+#' aggregate = "aminoAcid", prevalence = TRUE) |
|
39 |
+#' @seealso Refer to the LymphoSeqDB package for details regarding the |
|
40 |
+#' prevalenceTRB database. |
|
41 |
+#' @export |
|
42 |
+#' @importFrom plyr llply |
|
43 |
+#' @import LymphoSeqDB |
|
44 |
+productiveSeq <- function(file.list, aggregate = "aminoAcid", prevalence = FALSE) { |
|
45 |
+ productive.list <- plyr::llply(file.list, function(x) |
|
46 |
+ productive(x, aggregate), .progress = "text") |
|
47 |
+ if (prevalence == TRUE) { |
|
48 |
+ if (aggregate != "aminoAcid") { |
|
49 |
+ stop("In order to add prevalence to your list of data frames, aggregate must be equal 'aminoAcid'.", call. = FALSE) |
|
50 |
+ } |
|
51 |
+ cat("Adding prevalence column...") |
|
52 |
+ i <- 1 |
|
53 |
+ productive.list.prevalence <- list() |
|
54 |
+ for(i in 1:length(productive.list)) { |
|
55 |
+ file <- productive.list[[i]] |
|
56 |
+ file$prevalence <- LymphoSeqDB::prevalenceTRB[match(file$aminoAcid, LymphoSeqDB::prevalenceTRB$aminoAcid), "prevalence"] |
|
57 |
+ file$prevalence[is.na(file$prevalence)] <- 0 |
|
58 |
+ productive.list.prevalence <- c(productive.list.prevalence, list(file)) |
|
59 |
+ } |
|
60 |
+ names(productive.list.prevalence) <- names(productive.list) |
|
61 |
+ return(productive.list.prevalence) |
|
62 |
+ } else { |
|
63 |
+ return(productive.list) |
|
64 |
+ } |
|
65 |
+} |
|
0 | 66 |
\ No newline at end of file |
1 | 67 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,89 @@ |
1 |
+#' Read ImmunoSeq files |
|
2 |
+#' |
|
3 |
+#' Imports tab-separated value (.tsv) files exported by the Adaptive |
|
4 |
+#' Biotechnologies ImmunoSEQ analyzer and stores them as a list of data frames. |
|
5 |
+#' |
|
6 |
+#' @param path Path to the directory containing tab-delimited files. Only |
|
7 |
+#' files with the extension .tsv are imported. The names of the data frames are |
|
8 |
+#' the same as names of the files. |
|
9 |
+#' @param columns Column names from the tab-delimited files that you desire to |
|
10 |
+#' import, all others will be disregarded. May use "all" to import all columns. |
|
11 |
+#' A warning may be called if columns contain no data or must be coereced to a |
|
12 |
+#' different class. Usually this warning can be ignored. |
|
13 |
+#' @param recursive A Boolean value indicating whether tab-delimited files in |
|
14 |
+#' subdirectories should be imported. If TRUE, then all files in the parent as |
|
15 |
+#' well as the subdirectory are imported. If FALSE, then only files in the |
|
16 |
+#' parent directory are imported. |
|
17 |
+#' @details May import tab-delimited files containing antigen receptor |
|
18 |
+#' sequencing from other sources (e.g. miTCR or miXCR) as long as the column |
|
19 |
+#' names are the same as used by ImmunoSEQ files. Available column headings in |
|
20 |
+#' ImmunoSEQ files are: |
|
21 |
+#' "nucleotide", "aminoAcid", "count", "count (templates)", "count (reads)", |
|
22 |
+#' "frequencyCount", "frequencyCount (\%)", "cdr3Length", "vMaxResolved", |
|
23 |
+#' "vFamilyName", "vGeneName", "vGeneAllele", "vFamilyTies", "vGeneNameTies", |
|
24 |
+#' "vGeneAlleleTies", "dMaxResolved", "dFamilyName", "dGeneName", "dGeneAllele", |
|
25 |
+#' "dFamilyTies", "dGeneNameTies", "dGeneAlleleTies", "jMaxResolved", |
|
26 |
+#' "jFamilyName", "jGeneName", "jGeneAllele", "jFamilyTies", "jGeneNameTies", |
|
27 |
+#' "jGeneAlleleTies", "vDeletion", "d5Deletion", "d3Deletion", "jDeletion", |
|
28 |
+#' "n2Insertion", "n1Insertion", "vIndex", "n2Index", "dIndex", "n1Index", |
|
29 |
+#' "jIndex", "estimatedNumberGenomes", "sequenceStatus", "cloneResolved", |
|
30 |
+#' "vOrphon", "dOrphon", "jOrphon", "vFunction", "dFunction", "jFunction", |
|
31 |
+#' "fractionNucleated". |
|
32 |
+#' |
|
33 |
+#' IMPORTANT: be aware that Adaptive has changed the |
|
34 |
+#' column names of their files over time and if the headings of your files are |
|
35 |
+#' inconsistent, then specify column = "all" or include all variations of the |
|
36 |
+#' headings you want to important. For example, column = c("count", |
|
37 |
+#' "count (templates)", "count (reads)"). Also be aware that the "count" |
|
38 |
+#' column previously reported the number of sequencing reads in earlier |
|
39 |
+#' versions of ImmunoSEQ but now is equivalent to the |
|
40 |
+#' "estimatedNumberGenomes" column. |
|
41 |
+#' @return Returns a list of data frames. The names of each data frame are |
|
42 |
+#' assigned according to the original ImmunoSEQ file names. |
|
43 |
+#' @examples |
|
44 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
45 |
+#' |
|
46 |
+#' file.list <- readImmunoSeq(path = file.path, |
|
47 |
+#' columns = c("aminoAcid", "nucleotide", "count", |
|
48 |
+#' "count (templates)", "count (reads)", |
|
49 |
+#' "frequencyCount", "frequencyCount (%)", |
|
50 |
+#' "estimatedNumberGenomes"), |
|
51 |
+#' recursive = FALSE) |
|
52 |
+#' @export |
|
53 |
+#' @importFrom data.table fread |
|
54 |
+#' @importFrom plyr llply |
|
55 |
+readImmunoSeq <- function(path, columns = c("aminoAcid", "nucleotide", "count", |
|
56 |
+ "count (templates)", "count (reads)", |
|
57 |
+ "frequencyCount", "frequencyCount (%)", |
|
58 |
+ "estimatedNumberGenomes", "vFamilyName", |
|
59 |
+ "dFamilyName", "jFamilyName","vGeneName", |
|
60 |
+ "dGeneName", "jGeneName"), |
|
61 |
+ recursive = FALSE) { |
|
62 |
+ file.paths <- list.files(path, full.names = TRUE, all.files = FALSE, |
|
63 |
+ recursive = recursive, pattern = ".tsv", |
|
64 |
+ include.dirs = FALSE) |
|
65 |
+ if (any(columns == "all")) { |
|
66 |
+ file.list <- plyr::llply(file.paths, data.table::fread, |
|
67 |
+ data.table = FALSE, .progress = "text") |
|
68 |
+ } else { |
|
69 |
+ file.list <- plyr::llply(file.paths, data.table::fread, select = columns, |
|
70 |
+ data.table = FALSE, .progress = "text") |
|
71 |
+ if(length(unique(plyr::llply(file.list, ncol))) > 1){ |
|
72 |
+ warning("One or more of the files you are trying to import do not contain all the columns you specified.", call. = FALSE) |
|
73 |
+ } |
|
74 |
+ } |
|
75 |
+ file.names <- gsub(".tsv", "", basename(file.paths)) |
|
76 |
+ names(file.list) <- file.names |
|
77 |
+ i <- 1 |
|
78 |
+ for (i in 1:length(file.list)) { |
|
79 |
+ colnames(file.list[[i]]) <- ifelse(grepl("frequencyCount*", |
|
80 |
+ colnames(file.list[[i]])), |
|
81 |
+ "frequencyCount", |
|
82 |
+ colnames(file.list[[i]])) |
|
83 |
+ colnames(file.list[[i]]) <- ifelse(grepl("count*", |
|
84 |
+ colnames(file.list[[i]])), |
|
85 |
+ "count", |
|
86 |
+ colnames(file.list[[i]])) |
|
87 |
+ } |
|
88 |
+ return(file.list) |
|
89 |
+} |
|
0 | 90 |
\ No newline at end of file |
1 | 91 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,37 @@ |
1 |
+#' Remove sequence |
|
2 |
+#' |
|
3 |
+#' Removes an amino acid sequence and associated data from all instances within |
|
4 |
+#' a list of data frames and then recomputes the frequencyCount. |
|
5 |
+#' |
|
6 |
+#' @param file.list A list of data frames imported using the LymphoSeq function |
|
7 |
+#' readImmunoSeq. "aminoAcid", "count", and "frequencyCount" are required columns. |
|
8 |
+#' @param sequence A character vector of one or more amino acid sequences to |
|
9 |
+#' remove from the list of data frames. |
|
10 |
+#' @return Returns a list of data frames like the one imported except all rows |
|
11 |
+#' with the specified amino acid sequence are removed. The frequencyCount is |
|
12 |
+#' recalculated. |
|
13 |
+#' @examples |
|
14 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
15 |
+#' |
|
16 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
17 |
+#' |
|
18 |
+#' searchSeq(list = file.list, sequence = "CASSDLIGNGKLFF") |
|
19 |
+#' |
|
20 |
+#' cleansed <- removeSeq(file.list = file.list, sequence = "CASSDLIGNGKLFF") |
|
21 |
+#' |
|
22 |
+#' searchSeq(list = cleansed, sequence = "CASSDLIGNGKLFF") |
|
23 |
+#' @export |
|
24 |
+#' @importFrom plyr llply |
|
25 |
+removeSeq <- function(file.list, sequence) { |
|
26 |
+ cleansed <- plyr::llply(file.list, function(x) |
|
27 |
+ x[which(!(x[, "aminoAcid"] %in% sequence)), ]) |
|
28 |
+ recompute <- list() |
|
29 |
+ i <- 1 |
|
30 |
+ for (i in 1:length(cleansed)) { |
|
31 |
+ file <- cleansed[[i]] |
|
32 |
+ file$frequencyCount <- file$count/sum(file$count) * 100 |
|
33 |
+ recompute <- c(recompute, list(file)) |
|
34 |
+ } |
|
35 |
+ names(recompute) <- names(cleansed) |
|
36 |
+ return(recompute) |
|
37 |
+} |
|
0 | 38 |
\ No newline at end of file |
1 | 39 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,50 @@ |
1 |
+#' Search for T cell receptor beta CDR3 amino acid sequences with known antigen |
|
2 |
+#' specificity |
|
3 |
+#' |
|
4 |
+#' Search for published T cell receptor beta CDR3 amino acid sequences with |
|
5 |
+#' known antigen specificity in a list of data frames. |
|
6 |
+#' |
|
7 |
+#' @param list A list of data frames generated by the LymphoSeq functions |
|
8 |
+#' readImmunoSeq or productiveSeq. "aminoAcid", "frequencyCount", and "count" |
|
9 |
+#' are required columns. |
|
10 |
+#' @return Returns a data frame of each sample name and instance in the sample |
|
11 |
+#' that the published TCR sequence appeared along with additional |
|
12 |
+#' information including antigen specificity, epitope, HLA type, and PubMed ID |
|
13 |
+#' (PMID) for the reference where the sequence was characterized. The |
|
14 |
+#' publishedTRB database is located in a separate package called LymphoSeqDB |
|
15 |
+#' that should be loaded automatically. |
|
16 |
+#' @examples |
|
17 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
18 |
+#' |
|
19 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
20 |
+#' |
|
21 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
22 |
+#' |
|
23 |
+#' searchPublished(list = productive.aa) |
|
24 |
+#' @seealso Refer to the LymphoSeqDB package for details regarding the |
|
25 |
+#' publishedTRB database. |
|
26 |
+#' @export |
|
27 |
+#' @importFrom plyr llply |
|
28 |
+#' @import LymphoSeqDB |
|
29 |
+searchPublished <- function(list) { |
|
30 |
+ search <- plyr::llply(list, function(x) |
|
31 |
+ x[which(x[, "aminoAcid"] %in% LymphoSeqDB::publishedTRB$aminoAcid), ]) |
|
32 |
+ found <- NULL |
|
33 |
+ l <- 1 |
|
34 |
+ for (l in 1:length(search)) { |
|
35 |
+ if (nrow(search[[l]]) != 0) { |
|
36 |
+ search[[l]] <- cbind(rep(names(search)[l], |
|
37 |
+ nrow(search[[l]])), search[[l]]) |
|
38 |
+ colnames(search[[l]])[1] <- "sample" |
|
39 |
+ found <- rbind(found, search[[l]]) |
|
40 |
+ } |
|
41 |
+ } |
|
42 |
+ if (is.null(found)) { |
|
43 |
+ message("No sequences found.") |
|
44 |
+ } else { |
|
45 |
+ found$prevalence = NULL |
|
46 |
+ found <- merge(found, LymphoSeqDB::publishedTRB, by = "aminoAcid") |
|
47 |
+ found <- found[c("sample", setdiff(names(found), "sample"))] |
|
48 |
+ return(found) |
|
49 |
+ } |
|
50 |
+} |
|
0 | 51 |
\ No newline at end of file |
1 | 52 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,125 @@ |
1 |
+#' Search for a sequence |
|
2 |
+#' |
|
3 |
+#' Search for one or more amino acid or nucleotide CDR3 sequences in a list of |
|
4 |
+#' data frames. |
|
5 |
+#' |
|
6 |
+#' @param list A list of data frames generated by the LymphoSeq functions readImmunoSeq |
|
7 |
+#' or productiveSeq. "aminoAcid" or "nucleotide", "frequencyCount", and |
|
8 |
+#' "count" are required columns. |
|
9 |
+#' @param sequence A character vector of one ore more amino acid or nucleotide |
|
10 |
+#' CDR3 sequences to search. |
|
11 |
+#' @param type A character vector specifying the type of sequence(s) to be |
|
12 |
+#' searched. Available options are "aminoAcid" or "nucleotide". |
|
13 |
+#' @param match A character vector specifying whether an exact partial or exact global |
|
14 |
+#' match of the searched sequence(s) is desired. Available options are |
|
15 |
+#' "partial" and "global". |
|
16 |
+#' @param editDistance An integer giving the minimum edit distance that the |
|
17 |
+#' sequence must be less than or equal to. See details below. |
|
18 |
+#' @details An exact partial match means the searched sequence is contained within |
|
19 |
+#' target sequence. An exact global match means the searched sequence is identical to |
|
20 |
+#' the target sequence. |
|
21 |
+#' |
|
22 |
+#' Edit distance is a way of quantifying how dissimilar two sequences |
|
23 |
+#' are to one another by counting the minimum number of operations required to |
|
24 |
+#' transform one sequence into the other. For example, an edit distance of 0 |
|
25 |
+#' means the sequences are identical and an edit distance of 1 indicates that |
|
26 |
+#' the sequences different by a single amino acid or nucleotide. |
|
27 |
+#' @return Returns the rows for every instance in the list of data frames where |
|
28 |
+#' the searched sequence(s) appeared. |
|
29 |
+#' @examples |
|
30 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
31 |
+#' |
|
32 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
33 |
+#' |
|
34 |
+#' aa1 <- "CASSPVSNEQFF" |
|
35 |
+#' |
|
36 |
+#' aa2 <- "CASSQEVPPYQAFF" |
|
37 |
+#' |
|
38 |
+#' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", |
|
39 |
+#' match = "global", editDistance = 0) |
|
40 |
+#' |
|
41 |
+#' searchSeq(list = file.list, sequence = c(aa1, aa2), |
|
42 |
+#' type = "aminoAcid", match = "global", editDistance = 0) |
|
43 |
+#' |
|
44 |
+#' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", editDistance = 1) |
|
45 |
+#' |
|
46 |
+#' nt <- "CTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTCCGGTAAGCAATGAGCAGTTCTTCGGGCCA" |
|
47 |
+#' |
|
48 |
+#' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 3) |
|
49 |
+#' |
|
50 |
+#' searchSeq(list = file.list, sequence = "CASSPVS", type = "aminoAcid", |
|
51 |
+#' match = "partial", editDistance = 0) |
|
52 |
+#' |
|
53 |
+#' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 0) |
|
54 |
+#' @export |
|
55 |
+#' @importFrom plyr llply |
|
56 |
+#' @importFrom utils adist |
|
57 |
+searchSeq <- function(list, sequence, type = "aminoAcid", match = "global", editDistance = 0) { |
|
58 |
+ if (editDistance >= 1) { |
|
59 |
+ merged.results <- data.frame() |
|
60 |
+ i <- 1 |
|
61 |
+ for (i in 1:length(list)) { |
|
62 |
+ file <- list[[i]] |
|
63 |
+ ed <- adist(sequence, file[, type]) |
|
64 |
+ rownames(ed) <- sequence |
|
65 |
+ colnames(ed) <- file[, type] |
|
66 |
+ ed.index <- which(ed <= editDistance, arr.ind = TRUE) |
|
67 |
+ if (nrow(ed.index) >= 1) { |
|
68 |
+ ed.subset <- as.integer() |
|
69 |
+ j <- 1 |
|
70 |
+ for (j in 1:nrow(ed.index)) { |
|
71 |
+ ed.j <- ed[ed.index[j, "row"], ed.index[j, "col"]] |
|
72 |
+ ed.subset <- c(ed.subset, ed.j) |
|
73 |
+ } |
|
74 |
+ results <- data.frame(searchSequnece = rownames(ed)[ed.index[, 1]], |
|