... | ... |
@@ -10,7 +10,8 @@ Description: This R package analyzes high-throughput sequencing of T |
10 | 10 |
exported from the ImmunoSEQ analyzer. |
11 | 11 |
Depends: R (>= 3.2), LymphoSeqDB |
12 | 12 |
Imports: data.table, plyr, dplyr, reshape, VennDiagram, ggplot2, ineq, |
13 |
- RColorBrewer, circlize, grid, utils, stats |
|
13 |
+ RColorBrewer, circlize, grid, utils, stats, ggtree, msa, Biostrings, |
|
14 |
+ phangorn, doMC |
|
14 | 15 |
License: Artistic-2.0 |
15 | 16 |
LazyData: true |
16 | 17 |
RoxygenNote: 5.0.1 |
17 | 18 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,17 @@ |
1 |
+Version: 1.0 |
|
2 |
+ |
|
3 |
+RestoreWorkspace: Default |
|
4 |
+SaveWorkspace: Default |
|
5 |
+AlwaysSaveHistory: Default |
|
6 |
+ |
|
7 |
+EnableCodeIndexing: Yes |
|
8 |
+UseSpacesForTab: Yes |
|
9 |
+NumSpacesForTab: 4 |
|
10 |
+Encoding: UTF-8 |
|
11 |
+ |
|
12 |
+RnwWeave: knitr |
|
13 |
+LaTeX: pdfLaTeX |
|
14 |
+ |
|
15 |
+BuildType: Package |
|
16 |
+PackageUseDevtools: Yes |
|
17 |
+PackageInstallArgs: --no-multiarch --with-keep.source |
... | ... |
@@ -1,5 +1,6 @@ |
1 | 1 |
# Generated by roxygen2: do not edit by hand |
2 | 2 |
|
3 |
+export(alignSeq) |
|
3 | 4 |
export(bhattacharyyaCoefficient) |
4 | 5 |
export(bhattacharyyaMatrix) |
5 | 6 |
export(chordDiagramVDJ) |
... | ... |
@@ -8,10 +9,13 @@ export(cloneTrack) |
8 | 9 |
export(commonSeqs) |
9 | 10 |
export(commonSeqsPlot) |
10 | 11 |
export(commonSeqsVenn) |
12 |
+export(differentialAbundance) |
|
13 |
+export(exportFasta) |
|
11 | 14 |
export(geneFreq) |
12 | 15 |
export(lorenzCurve) |
13 | 16 |
export(mergeFiles) |
14 | 17 |
export(pairwisePlot) |
18 |
+export(phyloTree) |
|
15 | 19 |
export(productive) |
16 | 20 |
export(productiveSeq) |
17 | 21 |
export(readImmunoSeq) |
... | ... |
@@ -27,21 +31,32 @@ export(topSeqsPlot) |
27 | 31 |
export(uniqueSeqs) |
28 | 32 |
import(LymphoSeqDB) |
29 | 33 |
import(ggplot2) |
34 |
+import(ggtree) |
|
35 |
+import(msa) |
|
36 |
+importFrom(Biostrings,AAStringSet) |
|
37 |
+importFrom(Biostrings,DNAStringSet) |
|
38 |
+importFrom(Biostrings,writeXStringSet) |
|
30 | 39 |
importFrom(RColorBrewer,brewer.pal) |
31 | 40 |
importFrom(VennDiagram,draw.pairwise.venn) |
32 | 41 |
importFrom(VennDiagram,draw.triple.venn) |
33 | 42 |
importFrom(circlize,chordDiagram) |
34 | 43 |
importFrom(circlize,colorRamp2) |
35 | 44 |
importFrom(data.table,fread) |
45 |
+importFrom(doMC,registerDoMC) |
|
36 | 46 |
importFrom(dplyr,group_by) |
37 | 47 |
importFrom(dplyr,summarise) |
38 | 48 |
importFrom(grid,grid.draw) |
39 | 49 |
importFrom(grid,grid.newpage) |
40 | 50 |
importFrom(ineq,Gini) |
41 | 51 |
importFrom(ineq,Lc) |
52 |
+importFrom(phangorn,NJ) |
|
53 |
+importFrom(phangorn,as.phyDat) |
|
54 |
+importFrom(phangorn,dist.ml) |
|
42 | 55 |
importFrom(plyr,ldply) |
43 | 56 |
importFrom(plyr,llply) |
44 | 57 |
importFrom(reshape,melt.data.frame) |
45 | 58 |
importFrom(stats,aggregate) |
59 |
+importFrom(stats,fisher.test) |
|
46 | 60 |
importFrom(stats,na.omit) |
61 |
+importFrom(stats,p.adjust) |
|
47 | 62 |
importFrom(utils,adist) |
48 | 63 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,55 @@ |
1 |
+#' Align mutliple sequences |
|
2 |
+#' |
|
3 |
+#' Perform multiple sequence alignment using one of three methods and output results as a pdf file. |
|
4 |
+#' |
|
5 |
+#' @param list A list of data frames consisting of antigen receptor sequences |
|
6 |
+#' imported by the LymphoSeq function readImmunoSeq. |
|
7 |
+#' @param sample A character vector indicating the name of the sample in the productive |
|
8 |
+#' sequence list. |
|
9 |
+#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
10 |
+#' should be aligned. If "aminoAcid" is specified, then run productiveSeqs first. |
|
11 |
+#' @param method A character vector indicating the multiple sequence alignment method to |
|
12 |
+#' be used. Refer to the Bioconductor msa package for more details. Options incude |
|
13 |
+#' "ClustalW", "ClustalOmega", and "Muscle". |
|
14 |
+#' @return Saves a pdf file of the multiple sequence alignemnt to the working directory. |
|
15 |
+#' @examples |
|
16 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
17 |
+#' |
|
18 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
19 |
+#' |
|
20 |
+#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide") |
|
21 |
+#' |
|
22 |
+#' alignSeq(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "nucleotide", |
|
23 |
+#' method = "ClustalW") |
|
24 |
+#' @export |
|
25 |
+#' @importFrom Biostrings DNAStringSet |
|
26 |
+#' @importFrom Biostrings AAStringSet |
|
27 |
+#' @import msa |
|
28 |
+alignSeq = function(list, sample, type = "nucleotide", method = "ClustalOmega") { |
|
29 |
+ file = list[[sample]] |
|
30 |
+ if(nrow(file) > 150){ |
|
31 |
+ file = file[1:150,] |
|
32 |
+ message("Only the first 150 sequences will be aligned") |
|
33 |
+ } |
|
34 |
+ file[file == ""] <- "unresolved" |
|
35 |
+ if(type == "nucleotide"){ |
|
36 |
+ file = file[nchar(file$nucleotide) > 9, ] |
|
37 |
+ string = Biostrings::DNAStringSet(file$nucleotide) |
|
38 |
+ } |
|
39 |
+ if(type == "aminoAcid"){ |
|
40 |
+ file = file[nchar(file$aminoAcid) > 3, ] |
|
41 |
+ string = Biostrings::AAStringSet(file$aminoAcid) |
|
42 |
+ } |
|
43 |
+ names(string) = paste(file$vFamilyName, file$dFamilyName, file$jFamilyName, file$count) |
|
44 |
+ names(string) = gsub(names(string), pattern = "IGH|IGL|IGK|TCRB|TCRA", replacement = "") |
|
45 |
+ names(string) = gsub(names(string), pattern = "unresolved", replacement = "UNR") |
|
46 |
+ alignment = msa::msa(string, method = method) |
|
47 |
+ msa::msaPrettyPrint(alignment, output = "pdf", file = paste(names(list[sample]), ".pdf", sep = ""), |
|
48 |
+ showNumbering = "right", showNames = "left", showConsensus = "top", |
|
49 |
+ shadingMode = "similar", shadingColors = "blues", |
|
50 |
+ paperWidth = ncol(alignment)*0.1 + 5, paperHeight = nrow(alignment)*0.1 + 5, |
|
51 |
+ showLogo = "bottom", showLogoScale = "right", logoColors = "rasmol", |
|
52 |
+ psFonts = TRUE, showLegend = TRUE, askForOverwrite = FALSE, |
|
53 |
+ furtherCode=c("\\defconsensus{.}{lower}{upper}","\\showruler{1}{top}")) |
|
54 |
+ message(paste("Aligment files saved to", getwd())) |
|
55 |
+} |
|
0 | 56 |
\ No newline at end of file |
1 | 57 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,93 @@ |
1 |
+#' Differential abundance analysis |
|
2 |
+#' |
|
3 |
+#' Use a Fisher exact test to calculate differential abdunance of each sequence in |
|
4 |
+#' two samples and report the log2 transformed fold change, P value and adjusted P value. |
|
5 |
+#' |
|
6 |
+#' @param list A list of data frames consisting of antigen receptor sequences |
|
7 |
+#' imported by the LymphoSeq function readImmunoSeq. |
|
8 |
+#' @param sample1 A character vector indicating the name of the first sample in the list |
|
9 |
+#' to be compared. |
|
10 |
+#' @param sample2 A character vector indicating the name of the second sample in the list |
|
11 |
+#' to be compared. |
|
12 |
+#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
13 |
+#' should be used. If "aminoAcid" is specified, then run productiveSeqs first. |
|
14 |
+#' @param q A numeric value between 0.0 and 1.0 indicating the threshold Holms adjusted |
|
15 |
+#' P value (also knowns as the false discovery rate or q value) to subset the results with. |
|
16 |
+#' Any sequences with a q value greater than this value will not be shown. |
|
17 |
+#' @param zero A numeric value to set all zero values to when calculating the log2 |
|
18 |
+#' transformed fold change between samples 1 and 2. This does not apply to the |
|
19 |
+#' p and q value calculations. |
|
20 |
+#' @param abundance The input value for the Fisher exact test. "estimatedNumberGenomes" |
|
21 |
+#' is recommend but "count" may also be used. |
|
22 |
+#' @param parallel A boolean indicating wheter parallel processing should be used or not. |
|
23 |
+#' @param cores An integer specifying the number of processor cores if parallel processing |
|
24 |
+#' is used. |
|
25 |
+#' @return Returns a data frame with columns corresponding to the frequency of the abudance |
|
26 |
+#' measure in samples 1 and 2, the P value, Q value (Holms adjusted P value, also knowns as |
|
27 |
+#' the false discovery rate), and log2 transformed fold change. |
|
28 |
+#' @examples |
|
29 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
30 |
+#' |
|
31 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
32 |
+#' |
|
33 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
34 |
+#' |
|
35 |
+#' differentialAbundance(list = productive.aa, sample1 = "TCRB_Day949_Unsorted", sample2 = "TCRB_Day1320_Unsorted", |
|
36 |
+#' type = "aminoAcid", q = 0.01, zero = 0.001, parallel = FALSE) |
|
37 |
+#' @export |
|
38 |
+#' @importFrom stats p.adjust |
|
39 |
+#' @importFrom stats fisher.test |
|
40 |
+#' @importFrom doMC registerDoMC |
|
41 |
+differentialAbundance <- function(sample1, sample2, list, abundance = "estimatedNumberGenomes", |
|
42 |
+ type = "aminoAcid", q = 1, zero = 0.001, parallel = TRUE, cores = 4) { |
|
43 |
+ x <- list[[sample1]] |
|
44 |
+ y <- list[[sample2]] |
|
45 |
+ sequences <- union(x[, type], y[, type]) |
|
46 |
+ fisherFunction <- function(sequences) { |
|
47 |
+ p.value <- as.numeric() |
|
48 |
+ sample1.freq <- as.numeric() |
|
49 |
+ sample2.freq <- as.numeric() |
|
50 |
+ if (any(x[, type] == sequences)) { |
|
51 |
+ in.x <- x[x[, type] == sequences, abundance] |
|
52 |
+ x.freq <- x[x[, type] == sequences, abundance]/sum(x[, abundance]) * |
|
53 |
+ 100 |
|
54 |
+ } else { |
|
55 |
+ in.x <- 0 |
|
56 |
+ x.freq <- 0 |
|
57 |
+ } |
|
58 |
+ if (any(y[, type] == sequences)) { |
|
59 |
+ in.y <- y[y[, type] == sequences, abundance] |
|
60 |
+ y.freq <- y[y[, type] == sequences, abundance]/sum(y[, abundance]) * |
|
61 |
+ 100 |
|
62 |
+ } else { |
|
63 |
+ in.y <- 0 |
|
64 |
+ y.freq <- 0 |
|
65 |
+ } |
|
66 |
+ not.x <- sum(x[, abundance]) - in.x |
|
67 |
+ not.y <- sum(y[, abundance]) - in.y |
|
68 |
+ matrix <- matrix(c(in.x, in.y, not.x, not.y), nrow = 2) |
|
69 |
+ fisher <- stats::fisher.test(matrix) |
|
70 |
+ p.value <- c(p.value, fisher$p.value) |
|
71 |
+ sample1.freq <- c(sample1.freq, x.freq) |
|
72 |
+ sample2.freq <- c(sample2.freq, y.freq) |
|
73 |
+ results <- data.frame(aminoAcid = sequences, |
|
74 |
+ x = sample1.freq, |
|
75 |
+ y = sample2.freq, |
|
76 |
+ p = p.value) |
|
77 |
+ return(results) |
|
78 |
+ } |
|
79 |
+ doMC::registerDoMC(cores = cores) |
|
80 |
+ results <- plyr::ldply(sequences, fisherFunction, .parallel = parallel) |
|
81 |
+ results$q <- stats::p.adjust(results$p, method = "holm") |
|
82 |
+ x.not.zero <- results[, "x"] |
|
83 |
+ x.not.zero[x.not.zero == 0] <- zero |
|
84 |
+ y.not.zero <- results[, "y"] |
|
85 |
+ y.not.zero[y.not.zero == 0] <- zero |
|
86 |
+ results$l2fc <- log2(x.not.zero/y.not.zero) |
|
87 |
+ names(results)[2] <- sample1 |
|
88 |
+ names(results)[3] <- sample2 |
|
89 |
+ results <- results[order(results$q, decreasing = FALSE), ] |
|
90 |
+ results <- results[results$q <= q, ] |
|
91 |
+ rownames(results) <- NULL |
|
92 |
+ return(results) |
|
93 |
+} |
|
0 | 94 |
\ No newline at end of file |
1 | 95 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,57 @@ |
1 |
+#' Export sequences in fasta format |
|
2 |
+#' |
|
3 |
+#' Export nucleotide or amino acid sequences in fasta format. |
|
4 |
+#' |
|
5 |
+#' @param list A list of data frames consisting of antigen receptor sequences |
|
6 |
+#' imported by the LymphoSeq function readImmunoSeq. |
|
7 |
+#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
8 |
+#' should be exported. If "aminoAcid" is specified, then run productiveSeqs first. |
|
9 |
+#' @param names A character vector of one or more column names to name the sequences. |
|
10 |
+#' If "rank" is specified, then the rank order of the sequences by frequency is used. |
|
11 |
+#' @return Exports fasta files to the working directory. |
|
12 |
+#' @examples |
|
13 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
14 |
+#' |
|
15 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
16 |
+#' |
|
17 |
+#' exportFasta(list = file.list, type = "nucleotide", names = c("rank", "aminoAcid", "count")) |
|
18 |
+#' |
|
19 |
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
20 |
+#' |
|
21 |
+#' exportFasta(list = productive.aa, type = "aminoAcid", names = "frequencyCount") |
|
22 |
+#' @export |
|
23 |
+#' @importFrom Biostrings DNAStringSet |
|
24 |
+#' @importFrom Biostrings AAStringSet |
|
25 |
+#' @importFrom Biostrings writeXStringSet |
|
26 |
+exportFasta <- function(list, type = "nucleotide", names = c("rank", "aminoAcid", "count")) { |
|
27 |
+ if (type == "nucleotide") { |
|
28 |
+ i <- 1 |
|
29 |
+ for (i in 1:length(list)) { |
|
30 |
+ file <- list[[i]] |
|
31 |
+ sequences <- file$nucleotide |
|
32 |
+ fasta <- Biostrings::DNAStringSet(sequences) |
|
33 |
+ if (any(names %in% "rank")) { |
|
34 |
+ file$rank <- rownames(file) |
|
35 |
+ } |
|
36 |
+ names(fasta) <- do.call(paste, c(file[names], sep = "_")) |
|
37 |
+ Biostrings::writeXStringSet(fasta, paste(names(list[i]), ".fasta", sep = "")) |
|
38 |
+ } |
|
39 |
+ } |
|
40 |
+ if (type == "aminoAcid") { |
|
41 |
+ i <- 1 |
|
42 |
+ for (i in 1:length(list)) { |
|
43 |
+ file <- list[[i]] |
|
44 |
+ sequences <- file$aminoAcid |
|
45 |
+ if (any(sequences == "") | any(grepl("\\*", sequences))) { |
|
46 |
+ stop("Your list contains unproductive sequences. Remove unproductive sequences first using the function productiveSeq.", call. = FALSE) |
|
47 |
+ } |
|
48 |
+ fasta <- Biostrings::AAStringSet(sequences) |
|
49 |
+ if (any(names %in% "rank")) { |
|
50 |
+ file$rank <- rownames(file) |
|
51 |
+ } |
|
52 |
+ names(fasta) <- do.call(paste, c(file[names], sep = "_")) |
|
53 |
+ Biostrings::writeXStringSet(fasta, paste(names(list[i]), ".fasta", sep = "")) |
|
54 |
+ } |
|
55 |
+ } |
|
56 |
+ message(paste("Fasta files saved to", getwd())) |
|
57 |
+} |
0 | 58 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,97 @@ |
1 |
+#' Create phylogenetic tree |
|
2 |
+#' |
|
3 |
+#' Create a phylogenetic tree from amino acid or nucleotide CDR3 sequences in a list of |
|
4 |
+#' data frames. |
|
5 |
+#' |
|
6 |
+#' @param list A list data frames of unproductive nucleotide sequences or productive |
|
7 |
+#' nucleotide or amino acid sequenes generated by the LymphoSeq function productiveSeq |
|
8 |
+#' where the parameter aggregate is set to "nucleotide". vFamilyName, dFamilyName, and |
|
9 |
+#' jFamilyName are required columns. |
|
10 |
+#' @param sample A character vector indicating the name of the sample in the productive |
|
11 |
+#' sequence list. |
|
12 |
+#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
13 |
+#' should be aligned. Available options are "nucleotide" and "aminoAcid". |
|
14 |
+#' @param method A character vector indicating the multiple sequence alignment method to |
|
15 |
+#' be used. Refer to the Bioconductor msa package for more details. Options incude |
|
16 |
+#' "ClustalW", "ClustalOmega", and "Muscle". |
|
17 |
+#' @param aa.model A character vector indicating the pairwise distance model to be used |
|
18 |
+#' to compare amino acid sequences. Refer to the phangorn package for more details. |
|
19 |
+#' Available options incude "WAG", "JTT", "LG", "Dayhoff", "cpREV", "mtmam", "mtArt", "MtZoa", |
|
20 |
+#' "mtREV24", "VT","RtREV", "HIVw", "HIVb", "FLU", "Blossum62", "Dayhoff_DCMut" and "JTT_DCMut". |
|
21 |
+#' The default options is "JTT". |
|
22 |
+#' @param nt.model A character vector indicating the pairwise distance model to be used |
|
23 |
+#' to compare nucleotide sequences. Refer to the phangorn package for more details. |
|
24 |
+#' Available options incude "JC69" or "F81". The default options is "F81". |
|
25 |
+#' @param layout A character vector indicating the tree layout. Options include |
|
26 |
+#' "rectangular", "slanted", "fan", "circular", "radial" and "unrooted". |
|
27 |
+#' @return Returns a phylogenetic tree where each leaf represents a sequence color coded by the |
|
28 |
+#' V, D, and J gene usage. The number next to each leaf refers to the sequence count. A triangle |
|
29 |
+#' shaped leaf indicates the dominant sequence. Refer to the ggtree Bioconductor package |
|
30 |
+#' documentation for details on how to manipulate the tree. |
|
31 |
+#' @examples |
|
32 |
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
33 |
+#' |
|
34 |
+#' file.list <- readImmunoSeq(path = file.path) |
|
35 |
+#' |
|
36 |
+#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide") |
|
37 |
+#' |
|
38 |
+#' phyloTree(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "nucleotide", |
|
39 |
+#' layout = "rectangular", method = "ClustalW", nt.model = "F81") |
|
40 |
+#' |
|
41 |
+#' phyloTree(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "aminoAcid", |
|
42 |
+#' layout = "circular", method = "ClustalOmega", nt.model = "JTT") |
|
43 |
+#' @export |
|
44 |
+#' @importFrom Biostrings DNAStringSet |
|
45 |
+#' @importFrom Biostrings AAStringSet |
|
46 |
+#' @importFrom phangorn as.phyDat |
|
47 |
+#' @importFrom phangorn dist.ml |
|
48 |
+#' @importFrom phangorn NJ |
|
49 |
+#' @import msa |
|
50 |
+#' @import ggtree |
|
51 |
+phyloTree <- function(list, sample, type = "nucleotide", method = "ClustalOmega", |
|
52 |
+ layout = "rectangular", aa.model = "JTT", nt.model = "F81") { |
|
53 |
+ file <- list[[sample]] |
|
54 |
+ if(length(grep(pattern = "vFamilyName|dFamilyName|jFamilyName", names(file))) != 3){ |
|
55 |
+ stop("vFamilyName, dFamilyName, and jFamilyName are required columns. Try aggregating the sequences by 'nucleotide' usng the function productiveSeq.", call. = FALSE) |
|
56 |
+ } |
|
57 |
+ if(nrow(file) < 3){ |
|
58 |
+ stop("Cannot draw phlogenetic tree with less than 3 sequences.", call. = FALSE) |
|
59 |
+ } |
|
60 |
+ file[file == ""] <- "unresolved" |
|
61 |
+ if (type == "nucleotide") { |
|
62 |
+ file <- file[nchar(file$nucleotide) > 3, ] |
|
63 |
+ string <- Biostrings::DNAStringSet(file$nucleotide) |
|
64 |
+ } |
|
65 |
+ if (type == "aminoAcid") { |
|
66 |
+ file <- file[nchar(file$aminoAcid) > 3, ] |
|
67 |
+ string <- Biostrings::AAStringSet(file$aminoAcid) |
|
68 |
+ } |
|
69 |
+ names(string) <- file$count |
|
70 |
+ alignment <- msa::msa(string, method = method) |
|
71 |
+ if (type == "nucleotide") { |
|
72 |
+ alignment.phyDat <- phangorn::as.phyDat(x = alignment, type = "DNA", format = clustal) |
|
73 |
+ alignment.dist <- phangorn::dist.ml(alignment.phyDat, model = nt.model) |
|
74 |
+ } |
|
75 |
+ if (type == "aminoAcid") { |
|
76 |
+ alignment.phyDat <- phangorn::as.phyDat(x = alignment, type = "AA", format = clustal) |
|
77 |
+ alignment.dist <- phangorn::dist.ml(alignment.phyDat, model = aa.model) |
|
78 |
+ } |
|
79 |
+ tree <- phangorn::NJ(alignment.dist) |
|
80 |
+ geneFamilies <- paste(file$vFamilyName, file$dFamilyName, file$jFamilyName) |
|
81 |
+ geneFamilies <- gsub(geneFamilies, pattern = "IGH|IGL|IGK|TCRB|TCRA", replacement = "") |
|
82 |
+ geneFamilies <- gsub(geneFamilies, pattern = "unresolved", replacement = "UNR") |
|
83 |
+ tree.annotation <- data.frame(Names = file$count, Genomes = file$estimatedNumberGenomes, |
|
84 |
+ Count = file$count, Genes = geneFamilies, |
|
85 |
+ Frequency = round(file$frequencyCount, digits = 2), |
|
86 |
+ Dominant = c("Yes", rep("No", length(string) - 1))) |
|
87 |
+ getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1")) |
|
88 |
+ plot <- ggtree::ggtree(tree, layout = layout) %<+% tree.annotation + |
|
89 |
+ ggtree::geom_tippoint(aes(color = Genes, shape = Dominant), size = 3) + |
|
90 |
+ ggplot2::scale_color_manual(values = getPalette(length(unique(geneFamilies)))) + |
|
91 |
+ ggplot2::guides(shape = FALSE) + |
|
92 |
+ ggtree::geom_tiplab(size = 2, hjust = - 0.3) + |
|
93 |
+ ggplot2::theme(legend.position = "bottom", legend.key = element_rect(colour = "white")) + |
|
94 |
+ ggplot2::labs(color = "") |
|
95 |
+ return(plot) |
|
96 |
+} |
|
97 |
+ |
0 | 98 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,50 @@ |
1 |
+# LymphoSeq |
|
2 |
+This R package analyzes high-throughput sequencing of T and B cell receptor |
|
3 |
+complementarity determining region 3 (CDR3) sequences generated by [Adaptive |
|
4 |
+Biotechnologies' ImmunoSEQ assay](http://www.adaptivebiotech.com/immunoseq). |
|
5 |
+ |
|
6 |
+### Installation instructions |
|
7 |
+ |
|
8 |
+#### Install release version 1.0.0 |
|
9 |
+###### Install from [Bioconductor](https://www.bioconductor.org/packages/LymphoSeq) |
|
10 |
+``` |
|
11 |
+source("https://bioconductor.org/biocLite.R") |
|
12 |
+biocLite("LymphoSeq") |
|
13 |
+``` |
|
14 |
+ |
|
15 |
+#### Install developer version 1.1.0 |
|
16 |
+###### Option 1: Install from [Bioconductor developer branch](https://www.bioconductor.org/developers/how-to/useDevel/) |
|
17 |
+``` |
|
18 |
+# Switch to Bioconductor developer branch (requires latest version of R) |
|
19 |
+library(BiocInstaller) |
|
20 |
+useDevel() |
|
21 |
+ |
|
22 |
+# Download developer release |
|
23 |
+source("https://bioconductor.org/biocLite.R") |
|
24 |
+biocLite("LymphoSeq") |
|
25 |
+ |
|
26 |
+# Switch back to Bioconductor release branch |
|
27 |
+library(BiocInstaller) |
|
28 |
+useDevel() |
|
29 |
+``` |
|
30 |
+###### Option 2: Install from GitHub |
|
31 |
+``` |
|
32 |
+# Install the latest version of Bioconductor |
|
33 |
+source("https://bioconductor.org/biocLite.R") |
|
34 |
+biocLite() |
|
35 |
+ |
|
36 |
+# Download developer tools |
|
37 |
+install.packages("devtools") |
|
38 |
+ |
|
39 |
+# Download package from GitHub |
|
40 |
+install_github("davidcoffey/LymphoSeqDB") |
|
41 |
+install_github("davidcoffey/LymphoSeq") |
|
42 |
+``` |
|
43 |
+ |
|
44 |
+### Documentation |
|
45 |
+* [LymphoSeq vignette](https://bioconductor.org/packages/release/bioc/vignettes/LymphoSeq/inst/doc/LymphoSeq.pdf) |
|
46 |
+* [LymphoSeq manual](https://bioconductor.org/packages/release/bioc/manuals/LymphoSeq/man/LymphoSeq.pdf) |
|
47 |
+* [LymphoSeq news](https://bioconductor.org/packages/release/bioc/news/LymphoSeq/NEWS) |
|
48 |
+ |
|
49 |
+### Citation |
|
50 |
+Coffey D (2016). LymphoSeq: Analyze high-throughput sequencing of T and B cell receptors. R package version 1.0.0. |
2 | 8 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,39 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/alignSeq.R |
|
3 |
+\name{alignSeq} |
|
4 |
+\alias{alignSeq} |
|
5 |
+\title{Align mutliple sequences} |
|
6 |
+\usage{ |
|
7 |
+alignSeq(list, sample, type = "nucleotide", method = "ClustalOmega") |
|
8 |
+} |
|
9 |
+\arguments{ |
|
10 |
+\item{list}{A list of data frames consisting of antigen receptor sequences |
|
11 |
+imported by the LymphoSeq function readImmunoSeq.} |
|
12 |
+ |
|
13 |
+\item{sample}{A character vector indicating the name of the sample in the productive |
|
14 |
+sequence list.} |
|
15 |
+ |
|
16 |
+\item{type}{A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
17 |
+should be aligned. If "aminoAcid" is specified, then run productiveSeqs first.} |
|
18 |
+ |
|
19 |
+\item{method}{A character vector indicating the multiple sequence alignment method to |
|
20 |
+be used. Refer to the Bioconductor msa package for more details. Options incude |
|
21 |
+"ClustalW", "ClustalOmega", and "Muscle".} |
|
22 |
+} |
|
23 |
+\value{ |
|
24 |
+Saves a pdf file of the multiple sequence alignemnt to the working directory. |
|
25 |
+} |
|
26 |
+\description{ |
|
27 |
+Perform multiple sequence alignment using one of three methods and output results as a pdf file. |
|
28 |
+} |
|
29 |
+\examples{ |
|
30 |
+file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
31 |
+ |
|
32 |
+file.list <- readImmunoSeq(path = file.path) |
|
33 |
+ |
|
34 |
+productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide") |
|
35 |
+ |
|
36 |
+alignSeq(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "nucleotide", |
|
37 |
+ method = "ClustalW") |
|
38 |
+} |
|
39 |
+ |
0 | 40 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,59 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/differentialAbundance.R |
|
3 |
+\name{differentialAbundance} |
|
4 |
+\alias{differentialAbundance} |
|
5 |
+\title{Differential abundance analysis} |
|
6 |
+\usage{ |
|
7 |
+differentialAbundance(sample1, sample2, list, |
|
8 |
+ abundance = "estimatedNumberGenomes", type = "aminoAcid", q = 1, |
|
9 |
+ zero = 0.001, parallel = TRUE, cores = 4) |
|
10 |
+} |
|
11 |
+\arguments{ |
|
12 |
+\item{sample1}{A character vector indicating the name of the first sample in the list |
|
13 |
+to be compared.} |
|
14 |
+ |
|
15 |
+\item{sample2}{A character vector indicating the name of the second sample in the list |
|
16 |
+to be compared.} |
|
17 |
+ |
|
18 |
+\item{list}{A list of data frames consisting of antigen receptor sequences |
|
19 |
+imported by the LymphoSeq function readImmunoSeq.} |
|
20 |
+ |
|
21 |
+\item{abundance}{The input value for the Fisher exact test. "estimatedNumberGenomes" |
|
22 |
+is recommend but "count" may also be used.} |
|
23 |
+ |
|
24 |
+\item{type}{A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
25 |
+should be used. If "aminoAcid" is specified, then run productiveSeqs first.} |
|
26 |
+ |
|
27 |
+\item{q}{A numeric value between 0.0 and 1.0 indicating the threshold Holms adjusted |
|
28 |
+P value (also knowns as the false discovery rate or q value) to subset the results with. |
|
29 |
+Any sequences with a q value greater than this value will not be shown.} |
|
30 |
+ |
|
31 |
+\item{zero}{A numeric value to set all zero values to when calculating the log2 |
|
32 |
+transformed fold change between samples 1 and 2. This does not apply to the |
|
33 |
+p and q value calculations.} |
|
34 |
+ |
|
35 |
+\item{parallel}{A boolean indicating wheter parallel processing should be used or not.} |
|
36 |
+ |
|
37 |
+\item{cores}{An integer specifying the number of processor cores if parallel processing |
|
38 |
+is used.} |
|
39 |
+} |
|
40 |
+\value{ |
|
41 |
+Returns a data frame with columns corresponding to the frequency of the abudance |
|
42 |
+measure in samples 1 and 2, the P value, Q value (Holms adjusted P value, also knowns as |
|
43 |
+the false discovery rate), and log2 transformed fold change. |
|
44 |
+} |
|
45 |
+\description{ |
|
46 |
+Use a Fisher exact test to calculate differential abdunance of each sequence in |
|
47 |
+two samples and report the log2 transformed fold change, P value and adjusted P value. |
|
48 |
+} |
|
49 |
+\examples{ |
|
50 |
+file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
51 |
+ |
|
52 |
+file.list <- readImmunoSeq(path = file.path) |
|
53 |
+ |
|
54 |
+productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
55 |
+ |
|
56 |
+differentialAbundance(list = productive.aa, sample1 = "TCRB_Day949_Unsorted", sample2 = "TCRB_Day1320_Unsorted", |
|
57 |
+ type = "aminoAcid", q = 0.01, zero = 0.001, parallel = FALSE) |
|
58 |
+} |
|
59 |
+ |
0 | 60 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,37 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/exportFasta.R |
|
3 |
+\name{exportFasta} |
|
4 |
+\alias{exportFasta} |
|
5 |
+\title{Export sequences in fasta format} |
|
6 |
+\usage{ |
|
7 |
+exportFasta(list, type = "nucleotide", names = c("rank", "aminoAcid", |
|
8 |
+ "count")) |
|
9 |
+} |
|
10 |
+\arguments{ |
|
11 |
+\item{list}{A list of data frames consisting of antigen receptor sequences |
|
12 |
+imported by the LymphoSeq function readImmunoSeq.} |
|
13 |
+ |
|
14 |
+\item{type}{A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
15 |
+should be exported. If "aminoAcid" is specified, then run productiveSeqs first.} |
|
16 |
+ |
|
17 |
+\item{names}{A character vector of one or more column names to name the sequences. |
|
18 |
+If "rank" is specified, then the rank order of the sequences by frequency is used.} |
|
19 |
+} |
|
20 |
+\value{ |
|
21 |
+Exports fasta files to the working directory. |
|
22 |
+} |
|
23 |
+\description{ |
|
24 |
+Export nucleotide or amino acid sequences in fasta format. |
|
25 |
+} |
|
26 |
+\examples{ |
|
27 |
+file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
28 |
+ |
|
29 |
+file.list <- readImmunoSeq(path = file.path) |
|
30 |
+ |
|
31 |
+exportFasta(list = file.list, type = "nucleotide", names = c("rank", "aminoAcid", "count")) |
|
32 |
+ |
|
33 |
+productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid") |
|
34 |
+ |
|
35 |
+exportFasta(list = productive.aa, type = "aminoAcid", names = "frequencyCount") |
|
36 |
+} |
|
37 |
+ |
0 | 38 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,62 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/phyloTree.R |
|
3 |
+\name{phyloTree} |
|
4 |
+\alias{phyloTree} |
|
5 |
+\title{Create phylogenetic tree} |
|
6 |
+\usage{ |
|
7 |
+phyloTree(list, sample, type = "nucleotide", method = "ClustalOmega", |
|
8 |
+ layout = "rectangular", aa.model = "JTT", nt.model = "F81") |
|
9 |
+} |
|
10 |
+\arguments{ |
|
11 |
+\item{list}{A list data frames of unproductive nucleotide sequences or productive |
|
12 |
+nucleotide or amino acid sequenes generated by the LymphoSeq function productiveSeq |
|
13 |
+where the parameter aggregate is set to "nucleotide". vFamilyName, dFamilyName, and |
|
14 |
+jFamilyName are required columns.} |
|
15 |
+ |
|
16 |
+\item{sample}{A character vector indicating the name of the sample in the productive |
|
17 |
+sequence list.} |
|
18 |
+ |
|
19 |
+\item{type}{A character vector indicating whether "aminoAcid" or "nucleotide" sequences |
|
20 |
+should be aligned. Available options are "nucleotide" and "aminoAcid".} |
|
21 |
+ |
|
22 |
+\item{method}{A character vector indicating the multiple sequence alignment method to |
|
23 |
+be used. Refer to the Bioconductor msa package for more details. Options incude |
|
24 |
+"ClustalW", "ClustalOmega", and "Muscle".} |
|
25 |
+ |
|
26 |
+\item{layout}{A character vector indicating the tree layout. Options include |
|
27 |
+"rectangular", "slanted", "fan", "circular", "radial" and "unrooted".} |
|
28 |
+ |
|
29 |
+\item{aa.model}{A character vector indicating the pairwise distance model to be used |
|
30 |
+to compare amino acid sequences. Refer to the phangorn package for more details. |
|
31 |
+Available options incude "WAG", "JTT", "LG", "Dayhoff", "cpREV", "mtmam", "mtArt", "MtZoa", |
|
32 |
+"mtREV24", "VT","RtREV", "HIVw", "HIVb", "FLU", "Blossum62", "Dayhoff_DCMut" and "JTT_DCMut". |
|
33 |
+The default options is "JTT".} |
|
34 |
+ |
|
35 |
+\item{nt.model}{A character vector indicating the pairwise distance model to be used |
|
36 |
+to compare nucleotide sequences. Refer to the phangorn package for more details. |
|
37 |
+Available options incude "JC69" or "F81". The default options is "F81".} |
|
38 |
+} |
|
39 |
+\value{ |
|
40 |
+Returns a phylogenetic tree where each leaf represents a sequence color coded by the |
|
41 |
+V, D, and J gene usage. The number next to each leaf refers to the sequence count. A triangle |
|
42 |
+shaped leaf indicates the dominant sequence. Refer to the ggtree Bioconductor package |
|
43 |
+documentation for details on how to manipulate the tree. |
|
44 |
+} |
|
45 |
+\description{ |
|
46 |
+Create a phylogenetic tree from amino acid or nucleotide CDR3 sequences in a list of |
|
47 |
+data frames. |
|
48 |
+} |
|
49 |
+\examples{ |
|
50 |
+file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq") |
|
51 |
+ |
|
52 |
+file.list <- readImmunoSeq(path = file.path) |
|
53 |
+ |
|
54 |
+productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide") |
|
55 |
+ |
|
56 |
+phyloTree(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "nucleotide", |
|
57 |
+ layout = "rectangular", method = "ClustalW", nt.model = "F81") |
|
58 |
+ |
|
59 |
+phyloTree(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "aminoAcid", |
|
60 |
+ layout = "circular", method = "ClustalOmega", nt.model = "JTT") |
|
61 |
+} |
|
62 |
+ |
... | ... |
@@ -1,14 +1,13 @@ |
1 | 1 |
--- |
2 | 2 |
title: "Analysis of high-throughput sequencing of T and B cell receptors with LymphoSeq" |
3 | 3 |
author: "David G. Coffey, MD" |
4 |
-date: "`r Sys.Date()`" |
|
4 |
+date: '`r Sys.Date()`' |
|
5 | 5 |
output: pdf_document |
6 | 6 |
geometry: margin=0.5in |
7 | 7 |
fontsize: 10pt |
8 |
-vignette: > |
|
9 |
- %\VignetteIndexEntry{Analyze high throughput sequencing of T and B cell receptors with LymphoSeq} |
|
10 |
- %\VignetteEncoding{UTF-8} |
|
11 |
- %\VignetteEngine{knitr::rmarkdown} |
|
8 |
+toc: yes |
|
9 |
+vignette: | |
|
10 |
+ %\VignetteIndexEntry{Analyze high throughput sequencing of T and B cell receptors with LymphoSeq} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} |
|
12 | 11 |
--- |
13 | 12 |
|
14 | 13 |
Application of high-throughput sequencing of T and B lymphocyte antigen receptors has great potential for improving the monitoring of lymphoid malignancies, assessing immune reconstitution after hematopoietic stem cell transplantation, and characterizing the composition of lymphocyte repertoires.^[Warren, E. H. *et al.* *Blood* 2013;122:19–22.] LymhoSeq is an R package designed to import, analyze, and visualize antigen receptor sequencing from Adaptive Biotechnologies' ImmunoSEQ assay.^[http://www.adaptivebiotech.com/immunoseq] The package is also adaptable to the analysis of T and B cell receptor sequencing processed using other platforms such as MiXCR^[http://mixcr.milaboratory.com] or IMGT/HighV-QUEST.^[http://www.imgt.org/HighV-QUEST] This vignette has been written to highlight some of the features of LymphoSeq and guide the user through a typical workflow. |