Browse code

Added functions alignSeq, phyloSeq, exportFasta, differentialAbundance

davidcoffey authored on 10/05/2016 23:14:17
Showing 16 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,2 @@
1
+^.*\.Rproj$
2
+^\.Rproj\.user$
0 3
new file mode 100644
... ...
@@ -0,0 +1,4 @@
1
+.Rproj.user
2
+.Rhistory
3
+.RData
4
+.Ruserdata
... ...
@@ -10,7 +10,8 @@ Description: This R package analyzes high-throughput sequencing of T
10 10
         exported from the ImmunoSEQ analyzer.
11 11
 Depends: R (>= 3.2), LymphoSeqDB
12 12
 Imports: data.table, plyr, dplyr, reshape, VennDiagram, ggplot2, ineq,
13
-        RColorBrewer, circlize, grid, utils, stats
13
+        RColorBrewer, circlize, grid, utils, stats, ggtree, msa, Biostrings,
14
+        phangorn, doMC
14 15
 License: Artistic-2.0
15 16
 LazyData: true
16 17
 RoxygenNote: 5.0.1
17 18
new file mode 100644
... ...
@@ -0,0 +1,17 @@
1
+Version: 1.0
2
+
3
+RestoreWorkspace: Default
4
+SaveWorkspace: Default
5
+AlwaysSaveHistory: Default
6
+
7
+EnableCodeIndexing: Yes
8
+UseSpacesForTab: Yes
9
+NumSpacesForTab: 4
10
+Encoding: UTF-8
11
+
12
+RnwWeave: knitr
13
+LaTeX: pdfLaTeX
14
+
15
+BuildType: Package
16
+PackageUseDevtools: Yes
17
+PackageInstallArgs: --no-multiarch --with-keep.source
... ...
@@ -1,5 +1,6 @@
1 1
 # Generated by roxygen2: do not edit by hand
2 2
 
3
+export(alignSeq)
3 4
 export(bhattacharyyaCoefficient)
4 5
 export(bhattacharyyaMatrix)
5 6
 export(chordDiagramVDJ)
... ...
@@ -8,10 +9,13 @@ export(cloneTrack)
8 9
 export(commonSeqs)
9 10
 export(commonSeqsPlot)
10 11
 export(commonSeqsVenn)
12
+export(differentialAbundance)
13
+export(exportFasta)
11 14
 export(geneFreq)
12 15
 export(lorenzCurve)
13 16
 export(mergeFiles)
14 17
 export(pairwisePlot)
18
+export(phyloTree)
15 19
 export(productive)
16 20
 export(productiveSeq)
17 21
 export(readImmunoSeq)
... ...
@@ -27,21 +31,32 @@ export(topSeqsPlot)
27 31
 export(uniqueSeqs)
28 32
 import(LymphoSeqDB)
29 33
 import(ggplot2)
34
+import(ggtree)
35
+import(msa)
36
+importFrom(Biostrings,AAStringSet)
37
+importFrom(Biostrings,DNAStringSet)
38
+importFrom(Biostrings,writeXStringSet)
30 39
 importFrom(RColorBrewer,brewer.pal)
31 40
 importFrom(VennDiagram,draw.pairwise.venn)
32 41
 importFrom(VennDiagram,draw.triple.venn)
33 42
 importFrom(circlize,chordDiagram)
34 43
 importFrom(circlize,colorRamp2)
35 44
 importFrom(data.table,fread)
45
+importFrom(doMC,registerDoMC)
36 46
 importFrom(dplyr,group_by)
37 47
 importFrom(dplyr,summarise)
38 48
 importFrom(grid,grid.draw)
39 49
 importFrom(grid,grid.newpage)
40 50
 importFrom(ineq,Gini)
41 51
 importFrom(ineq,Lc)
52
+importFrom(phangorn,NJ)
53
+importFrom(phangorn,as.phyDat)
54
+importFrom(phangorn,dist.ml)
42 55
 importFrom(plyr,ldply)
43 56
 importFrom(plyr,llply)
44 57
 importFrom(reshape,melt.data.frame)
45 58
 importFrom(stats,aggregate)
59
+importFrom(stats,fisher.test)
46 60
 importFrom(stats,na.omit)
61
+importFrom(stats,p.adjust)
47 62
 importFrom(utils,adist)
48 63
new file mode 100644
... ...
@@ -0,0 +1,55 @@
1
+#' Align mutliple sequences
2
+#' 
3
+#' Perform multiple sequence alignment using one of three methods and output results as a pdf file.
4
+#' 
5
+#' @param list A list of data frames consisting of antigen receptor sequences 
6
+#' imported by the LymphoSeq function readImmunoSeq.
7
+#' @param sample A character vector indicating the name of the sample in the productive
8
+#' sequence list. 
9
+#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences
10
+#' should be aligned.  If "aminoAcid" is specified, then run productiveSeqs first.
11
+#' @param method A character vector indicating the multiple sequence alignment method to 
12
+#' be used.  Refer to the Bioconductor msa package for more details.  Options incude 
13
+#' "ClustalW", "ClustalOmega", and "Muscle".
14
+#' @return Saves a pdf file of the multiple sequence alignemnt to the working directory.
15
+#' @examples
16
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
17
+#' 
18
+#' file.list <- readImmunoSeq(path = file.path)
19
+#' 
20
+#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
21
+#' 
22
+#' alignSeq(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "nucleotide", 
23
+#'          method = "ClustalW")
24
+#' @export
25
+#' @importFrom Biostrings DNAStringSet
26
+#' @importFrom Biostrings AAStringSet
27
+#' @import msa
28
+alignSeq = function(list, sample, type = "nucleotide", method = "ClustalOmega") {
29
+    file = list[[sample]]
30
+    if(nrow(file) > 150){
31
+        file = file[1:150,]
32
+        message("Only the first 150 sequences will be aligned")
33
+    }
34
+    file[file == ""] <- "unresolved"
35
+    if(type == "nucleotide"){
36
+        file = file[nchar(file$nucleotide) > 9, ]
37
+        string = Biostrings::DNAStringSet(file$nucleotide)
38
+    }
39
+    if(type == "aminoAcid"){
40
+        file = file[nchar(file$aminoAcid) > 3, ]
41
+        string = Biostrings::AAStringSet(file$aminoAcid)
42
+    }
43
+        names(string) = paste(file$vFamilyName, file$dFamilyName, file$jFamilyName, file$count)
44
+        names(string) = gsub(names(string), pattern = "IGH|IGL|IGK|TCRB|TCRA", replacement = "")
45
+        names(string) = gsub(names(string), pattern = "unresolved", replacement = "UNR")
46
+        alignment = msa::msa(string, method = method)
47
+        msa::msaPrettyPrint(alignment, output = "pdf", file = paste(names(list[sample]), ".pdf", sep = ""), 
48
+                       showNumbering = "right", showNames = "left", showConsensus = "top",  
49
+                       shadingMode = "similar", shadingColors = "blues",
50
+                       paperWidth = ncol(alignment)*0.1 + 5, paperHeight = nrow(alignment)*0.1 + 5,
51
+                       showLogo = "bottom", showLogoScale = "right", logoColors = "rasmol",
52
+                       psFonts = TRUE, showLegend = TRUE, askForOverwrite = FALSE,
53
+                       furtherCode=c("\\defconsensus{.}{lower}{upper}","\\showruler{1}{top}"))
54
+        message(paste("Aligment files saved to", getwd()))
55
+}
0 56
\ No newline at end of file
1 57
new file mode 100644
... ...
@@ -0,0 +1,93 @@
1
+#' Differential abundance analysis
2
+#' 
3
+#' Use a Fisher exact test to calculate differential abdunance of each sequence in 
4
+#' two samples and report the log2 transformed fold change, P value and adjusted P value.
5
+#' 
6
+#' @param list A list of data frames consisting of antigen receptor sequences 
7
+#' imported by the LymphoSeq function readImmunoSeq.
8
+#' @param sample1 A character vector indicating the name of the first sample in the list
9
+#' to be compared.
10
+#' @param sample2 A character vector indicating the name of the second sample in the list
11
+#' to be compared.
12
+#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences
13
+#' should be used.  If "aminoAcid" is specified, then run productiveSeqs first.
14
+#' @param q A numeric value between 0.0 and 1.0 indicating the threshold Holms adjusted 
15
+#' P value (also knowns as the false discovery rate or q value) to subset the results with.  
16
+#' Any sequences with a q value greater than this value will not be shown.
17
+#' @param zero A numeric value to set all zero values to when calculating the log2
18
+#' transformed fold change between samples 1 and 2.  This does not apply to the 
19
+#' p and q value calculations.
20
+#' @param abundance The input value for the Fisher exact test.  "estimatedNumberGenomes"
21
+#' is recommend but "count" may also be used.
22
+#' @param parallel A boolean indicating wheter parallel processing should be used or not.
23
+#' @param cores An integer specifying the number of processor cores if parallel processing
24
+#' is used.
25
+#' @return Returns a data frame with columns corresponding to the frequency of the abudance
26
+#' measure in samples 1 and 2, the P value, Q value (Holms adjusted P value, also knowns as 
27
+#' the false discovery rate), and log2 transformed fold change.
28
+#' @examples
29
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
30
+#' 
31
+#' file.list <- readImmunoSeq(path = file.path)
32
+#' 
33
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
34
+#' 
35
+#' differentialAbundance(list = productive.aa, sample1 = "TCRB_Day949_Unsorted", sample2 = "TCRB_Day1320_Unsorted",
36
+#'                      type = "aminoAcid", q = 0.01, zero = 0.001, parallel = FALSE)
37
+#' @export
38
+#' @importFrom stats p.adjust
39
+#' @importFrom stats fisher.test
40
+#' @importFrom doMC registerDoMC
41
+differentialAbundance <- function(sample1, sample2, list, abundance = "estimatedNumberGenomes", 
42
+                                  type = "aminoAcid", q = 1, zero = 0.001, parallel = TRUE, cores = 4) {
43
+    x <- list[[sample1]]
44
+    y <- list[[sample2]]
45
+    sequences <- union(x[, type], y[, type])
46
+    fisherFunction <- function(sequences) {
47
+        p.value <- as.numeric()
48
+        sample1.freq <- as.numeric()
49
+        sample2.freq <- as.numeric()
50
+        if (any(x[, type] == sequences)) {
51
+            in.x <- x[x[, type] == sequences, abundance]
52
+            x.freq <- x[x[, type] == sequences, abundance]/sum(x[, abundance]) * 
53
+                100
54
+        } else {
55
+            in.x <- 0
56
+            x.freq <- 0
57
+        }
58
+        if (any(y[, type] == sequences)) {
59
+            in.y <- y[y[, type] == sequences, abundance]
60
+            y.freq <- y[y[, type] == sequences, abundance]/sum(y[, abundance]) * 
61
+                100
62
+        } else {
63
+            in.y <- 0
64
+            y.freq <- 0
65
+        }
66
+        not.x <- sum(x[, abundance]) - in.x
67
+        not.y <- sum(y[, abundance]) - in.y
68
+        matrix <- matrix(c(in.x, in.y, not.x, not.y), nrow = 2)
69
+        fisher <- stats::fisher.test(matrix)
70
+        p.value <- c(p.value, fisher$p.value)
71
+        sample1.freq <- c(sample1.freq, x.freq)
72
+        sample2.freq <- c(sample2.freq, y.freq)
73
+        results <- data.frame(aminoAcid = sequences, 
74
+                              x = sample1.freq, 
75
+                              y = sample2.freq, 
76
+                              p = p.value)
77
+        return(results)
78
+    }
79
+    doMC::registerDoMC(cores = cores)
80
+    results <- plyr::ldply(sequences, fisherFunction, .parallel = parallel)
81
+    results$q <- stats::p.adjust(results$p, method = "holm")
82
+    x.not.zero <- results[, "x"]
83
+    x.not.zero[x.not.zero == 0] <- zero
84
+    y.not.zero <- results[, "y"]
85
+    y.not.zero[y.not.zero == 0] <- zero
86
+    results$l2fc <- log2(x.not.zero/y.not.zero)
87
+    names(results)[2] <- sample1
88
+    names(results)[3] <- sample2
89
+    results <- results[order(results$q, decreasing = FALSE), ]
90
+    results <- results[results$q <= q, ]
91
+    rownames(results) <- NULL
92
+    return(results)
93
+}
0 94
\ No newline at end of file
1 95
new file mode 100644
... ...
@@ -0,0 +1,57 @@
1
+#' Export sequences in fasta format
2
+#' 
3
+#' Export nucleotide or amino acid sequences in fasta format.
4
+#' 
5
+#' @param list A list of data frames consisting of antigen receptor sequences 
6
+#' imported by the LymphoSeq function readImmunoSeq.
7
+#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences
8
+#' should be exported.  If "aminoAcid" is specified, then run productiveSeqs first.
9
+#' @param names A character vector of one or more column names to name the sequences.
10
+#' If "rank" is specified, then the rank order of the sequences by frequency is used.
11
+#' @return Exports fasta files to the working directory.
12
+#' @examples
13
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
14
+#' 
15
+#' file.list <- readImmunoSeq(path = file.path)
16
+#' 
17
+#' exportFasta(list = file.list, type = "nucleotide", names = c("rank", "aminoAcid", "count"))
18
+#' 
19
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
20
+#' 
21
+#' exportFasta(list = productive.aa, type = "aminoAcid", names = "frequencyCount")
22
+#' @export
23
+#' @importFrom Biostrings DNAStringSet
24
+#' @importFrom Biostrings AAStringSet
25
+#' @importFrom Biostrings writeXStringSet
26
+exportFasta <- function(list, type = "nucleotide", names = c("rank", "aminoAcid", "count")) {
27
+    if (type == "nucleotide") {
28
+        i <- 1
29
+        for (i in 1:length(list)) {
30
+            file <- list[[i]]
31
+            sequences <- file$nucleotide
32
+            fasta <- Biostrings::DNAStringSet(sequences)
33
+            if (any(names %in% "rank")) {
34
+                file$rank <- rownames(file)
35
+            }
36
+            names(fasta) <- do.call(paste, c(file[names], sep = "_"))
37
+            Biostrings::writeXStringSet(fasta, paste(names(list[i]), ".fasta", sep = ""))
38
+        }
39
+    }
40
+    if (type == "aminoAcid") {
41
+        i <- 1
42
+        for (i in 1:length(list)) {
43
+            file <- list[[i]]
44
+            sequences <- file$aminoAcid
45
+            if (any(sequences == "") | any(grepl("\\*", sequences))) {
46
+                stop("Your list contains unproductive sequences.  Remove unproductive sequences first using the function productiveSeq.", call. = FALSE)
47
+            }
48
+            fasta <- Biostrings::AAStringSet(sequences)
49
+            if (any(names %in% "rank")) {
50
+                file$rank <- rownames(file)
51
+            }
52
+            names(fasta) <- do.call(paste, c(file[names], sep = "_"))
53
+            Biostrings::writeXStringSet(fasta, paste(names(list[i]), ".fasta", sep = ""))
54
+        }
55
+    }
56
+    message(paste("Fasta files saved to", getwd()))
57
+}
0 58
new file mode 100644
... ...
@@ -0,0 +1,97 @@
1
+#' Create phylogenetic tree
2
+#' 
3
+#' Create a phylogenetic tree from amino acid or nucleotide CDR3 sequences in a list of 
4
+#' data frames.
5
+#' 
6
+#' @param list A list data frames of unproductive nucleotide sequences or productive 
7
+#' nucleotide or amino acid sequenes generated by the LymphoSeq function productiveSeq 
8
+#' where the parameter aggregate is set to "nucleotide".  vFamilyName, dFamilyName, and 
9
+#' jFamilyName are required columns.
10
+#' @param sample A character vector indicating the name of the sample in the productive
11
+#' sequence list. 
12
+#' @param type A character vector indicating whether "aminoAcid" or "nucleotide" sequences
13
+#' should be aligned.  Available options are "nucleotide" and "aminoAcid".
14
+#' @param method A character vector indicating the multiple sequence alignment method to 
15
+#' be used.  Refer to the Bioconductor msa package for more details.  Options incude 
16
+#' "ClustalW", "ClustalOmega", and "Muscle".
17
+#' @param aa.model A character vector indicating the pairwise distance model to be used
18
+#' to compare amino acid sequences.  Refer to the phangorn package for more details.
19
+#' Available options incude "WAG", "JTT", "LG", "Dayhoff", "cpREV", "mtmam", "mtArt", "MtZoa",
20
+#' "mtREV24", "VT","RtREV", "HIVw", "HIVb", "FLU", "Blossum62", "Dayhoff_DCMut" and "JTT_DCMut".
21
+#' The default options is "JTT".
22
+#' @param nt.model A character vector indicating the pairwise distance model to be used
23
+#' to compare nucleotide sequences.  Refer to the phangorn package for more details.
24
+#' Available options incude "JC69" or "F81".  The default options is "F81".
25
+#' @param layout A character vector indicating the tree layout.  Options include 
26
+#' "rectangular", "slanted", "fan", "circular", "radial" and "unrooted".
27
+#' @return Returns a phylogenetic tree where each leaf represents a sequence color coded by the
28
+#' V, D, and J gene usage.  The number next to each leaf refers to the sequence count.  A triangle 
29
+#' shaped leaf indicates the dominant sequence.  Refer to the ggtree Bioconductor package 
30
+#' documentation for details on how to manipulate the tree.
31
+#' @examples
32
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
33
+#' 
34
+#' file.list <- readImmunoSeq(path = file.path)
35
+#' 
36
+#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
37
+#' 
38
+#' phyloTree(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "nucleotide", 
39
+#'          layout = "rectangular", method = "ClustalW", nt.model = "F81")
40
+#' 
41
+#' phyloTree(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "aminoAcid", 
42
+#'          layout = "circular", method = "ClustalOmega", nt.model = "JTT")
43
+#' @export
44
+#' @importFrom Biostrings DNAStringSet
45
+#' @importFrom Biostrings AAStringSet
46
+#' @importFrom phangorn as.phyDat
47
+#' @importFrom phangorn dist.ml
48
+#' @importFrom phangorn NJ
49
+#' @import msa
50
+#' @import ggtree
51
+phyloTree <- function(list, sample, type = "nucleotide", method = "ClustalOmega", 
52
+                      layout = "rectangular", aa.model = "JTT", nt.model = "F81") {
53
+    file <- list[[sample]]
54
+    if(length(grep(pattern = "vFamilyName|dFamilyName|jFamilyName", names(file))) != 3){
55
+        stop("vFamilyName, dFamilyName, and jFamilyName are required columns.  Try aggregating the sequences by 'nucleotide' usng the function productiveSeq.", call. = FALSE)
56
+    }
57
+    if(nrow(file) < 3){
58
+        stop("Cannot draw phlogenetic tree with less than 3 sequences.", call. = FALSE)
59
+    }
60
+    file[file == ""] <- "unresolved"
61
+    if (type == "nucleotide") {
62
+        file <- file[nchar(file$nucleotide) > 3, ]
63
+        string <- Biostrings::DNAStringSet(file$nucleotide)
64
+    }
65
+    if (type == "aminoAcid") {
66
+        file <- file[nchar(file$aminoAcid) > 3, ]
67
+        string <- Biostrings::AAStringSet(file$aminoAcid)
68
+    }
69
+    names(string) <- file$count
70
+    alignment <- msa::msa(string, method = method)
71
+    if (type == "nucleotide") {
72
+        alignment.phyDat <- phangorn::as.phyDat(x = alignment, type = "DNA", format = clustal)
73
+        alignment.dist <- phangorn::dist.ml(alignment.phyDat, model = nt.model)
74
+    }
75
+    if (type == "aminoAcid") {
76
+        alignment.phyDat <- phangorn::as.phyDat(x = alignment, type = "AA", format = clustal)
77
+        alignment.dist <- phangorn::dist.ml(alignment.phyDat, model = aa.model)
78
+    }
79
+    tree <- phangorn::NJ(alignment.dist)
80
+    geneFamilies <- paste(file$vFamilyName, file$dFamilyName, file$jFamilyName)
81
+    geneFamilies <- gsub(geneFamilies, pattern = "IGH|IGL|IGK|TCRB|TCRA", replacement = "")
82
+    geneFamilies <- gsub(geneFamilies, pattern = "unresolved", replacement = "UNR")
83
+    tree.annotation <- data.frame(Names = file$count, Genomes = file$estimatedNumberGenomes, 
84
+                                  Count = file$count, Genes = geneFamilies, 
85
+                                  Frequency = round(file$frequencyCount, digits = 2), 
86
+                                  Dominant = c("Yes", rep("No", length(string) - 1)))
87
+    getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))
88
+    plot <- ggtree::ggtree(tree, layout = layout) %<+% tree.annotation + 
89
+        ggtree::geom_tippoint(aes(color = Genes, shape = Dominant), size = 3) + 
90
+        ggplot2::scale_color_manual(values = getPalette(length(unique(geneFamilies)))) + 
91
+        ggplot2::guides(shape = FALSE) + 
92
+        ggtree::geom_tiplab(size = 2, hjust = - 0.3) + 
93
+        ggplot2::theme(legend.position = "bottom", legend.key = element_rect(colour = "white")) + 
94
+        ggplot2::labs(color = "")
95
+    return(plot)
96
+}
97
+
0 98
new file mode 100644
... ...
@@ -0,0 +1,50 @@
1
+# LymphoSeq
2
+This R package analyzes high-throughput sequencing of T and B cell receptor
3
+complementarity determining region 3 (CDR3) sequences generated by [Adaptive
4
+Biotechnologies' ImmunoSEQ assay](http://www.adaptivebiotech.com/immunoseq).
5
+
6
+### Installation instructions
7
+
8
+#### Install release version 1.0.0
9
+###### Install from [Bioconductor](https://www.bioconductor.org/packages/LymphoSeq)
10
+```
11
+source("https://bioconductor.org/biocLite.R")
12
+biocLite("LymphoSeq")
13
+```
14
+
15
+#### Install developer version 1.1.0
16
+###### Option 1:  Install from [Bioconductor developer branch](https://www.bioconductor.org/developers/how-to/useDevel/)
17
+```
18
+# Switch to Bioconductor developer branch (requires latest version of R)
19
+library(BiocInstaller)
20
+useDevel()
21
+
22
+# Download developer release
23
+source("https://bioconductor.org/biocLite.R")
24
+biocLite("LymphoSeq")
25
+
26
+# Switch back to Bioconductor release branch
27
+library(BiocInstaller)
28
+useDevel()
29
+```
30
+###### Option 2:  Install from GitHub
31
+```
32
+# Install the latest version of Bioconductor
33
+source("https://bioconductor.org/biocLite.R")
34
+biocLite()
35
+
36
+# Download developer tools
37
+install.packages("devtools")
38
+
39
+# Download package from GitHub
40
+install_github("davidcoffey/LymphoSeqDB")
41
+install_github("davidcoffey/LymphoSeq")
42
+```
43
+
44
+### Documentation
45
+* [LymphoSeq vignette](https://bioconductor.org/packages/release/bioc/vignettes/LymphoSeq/inst/doc/LymphoSeq.pdf)
46
+* [LymphoSeq manual](https://bioconductor.org/packages/release/bioc/manuals/LymphoSeq/man/LymphoSeq.pdf)
47
+* [LymphoSeq news](https://bioconductor.org/packages/release/bioc/news/LymphoSeq/NEWS)
48
+
49
+### Citation
50
+Coffey D (2016). LymphoSeq: Analyze high-throughput sequencing of T and B cell receptors. R package version 1.0.0.
... ...
@@ -1 +1,7 @@
1 1
 LymphoSeq News
2
+
3
+Version 1.1.0 [May 10, 2016]
4
+ * Added function alignSeq
5
+ * Added funciton phyloSeq
6
+ * Added function exportFasta
7
+ * Added function differentialAbundance
2 8
new file mode 100644
... ...
@@ -0,0 +1,39 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/alignSeq.R
3
+\name{alignSeq}
4
+\alias{alignSeq}
5
+\title{Align mutliple sequences}
6
+\usage{
7
+alignSeq(list, sample, type = "nucleotide", method = "ClustalOmega")
8
+}
9
+\arguments{
10
+\item{list}{A list of data frames consisting of antigen receptor sequences 
11
+imported by the LymphoSeq function readImmunoSeq.}
12
+
13
+\item{sample}{A character vector indicating the name of the sample in the productive
14
+sequence list.}
15
+
16
+\item{type}{A character vector indicating whether "aminoAcid" or "nucleotide" sequences
17
+should be aligned.  If "aminoAcid" is specified, then run productiveSeqs first.}
18
+
19
+\item{method}{A character vector indicating the multiple sequence alignment method to 
20
+be used.  Refer to the Bioconductor msa package for more details.  Options incude 
21
+"ClustalW", "ClustalOmega", and "Muscle".}
22
+}
23
+\value{
24
+Saves a pdf file of the multiple sequence alignemnt to the working directory.
25
+}
26
+\description{
27
+Perform multiple sequence alignment using one of three methods and output results as a pdf file.
28
+}
29
+\examples{
30
+file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
31
+
32
+file.list <- readImmunoSeq(path = file.path)
33
+
34
+productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
35
+
36
+alignSeq(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "nucleotide", 
37
+         method = "ClustalW")
38
+}
39
+
0 40
new file mode 100644
... ...
@@ -0,0 +1,59 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/differentialAbundance.R
3
+\name{differentialAbundance}
4
+\alias{differentialAbundance}
5
+\title{Differential abundance analysis}
6
+\usage{
7
+differentialAbundance(sample1, sample2, list,
8
+  abundance = "estimatedNumberGenomes", type = "aminoAcid", q = 1,
9
+  zero = 0.001, parallel = TRUE, cores = 4)
10
+}
11
+\arguments{
12
+\item{sample1}{A character vector indicating the name of the first sample in the list
13
+to be compared.}
14
+
15
+\item{sample2}{A character vector indicating the name of the second sample in the list
16
+to be compared.}
17
+
18
+\item{list}{A list of data frames consisting of antigen receptor sequences 
19
+imported by the LymphoSeq function readImmunoSeq.}
20
+
21
+\item{abundance}{The input value for the Fisher exact test.  "estimatedNumberGenomes"
22
+is recommend but "count" may also be used.}
23
+
24
+\item{type}{A character vector indicating whether "aminoAcid" or "nucleotide" sequences
25
+should be used.  If "aminoAcid" is specified, then run productiveSeqs first.}
26
+
27
+\item{q}{A numeric value between 0.0 and 1.0 indicating the threshold Holms adjusted 
28
+P value (also knowns as the false discovery rate or q value) to subset the results with.  
29
+Any sequences with a q value greater than this value will not be shown.}
30
+
31
+\item{zero}{A numeric value to set all zero values to when calculating the log2
32
+transformed fold change between samples 1 and 2.  This does not apply to the 
33
+p and q value calculations.}
34
+
35
+\item{parallel}{A boolean indicating wheter parallel processing should be used or not.}
36
+
37
+\item{cores}{An integer specifying the number of processor cores if parallel processing
38
+is used.}
39
+}
40
+\value{
41
+Returns a data frame with columns corresponding to the frequency of the abudance
42
+measure in samples 1 and 2, the P value, Q value (Holms adjusted P value, also knowns as 
43
+the false discovery rate), and log2 transformed fold change.
44
+}
45
+\description{
46
+Use a Fisher exact test to calculate differential abdunance of each sequence in 
47
+two samples and report the log2 transformed fold change, P value and adjusted P value.
48
+}
49
+\examples{
50
+file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
51
+
52
+file.list <- readImmunoSeq(path = file.path)
53
+
54
+productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
55
+
56
+differentialAbundance(list = productive.aa, sample1 = "TCRB_Day949_Unsorted", sample2 = "TCRB_Day1320_Unsorted",
57
+                     type = "aminoAcid", q = 0.01, zero = 0.001, parallel = FALSE)
58
+}
59
+
0 60
new file mode 100644
... ...
@@ -0,0 +1,37 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/exportFasta.R
3
+\name{exportFasta}
4
+\alias{exportFasta}
5
+\title{Export sequences in fasta format}
6
+\usage{
7
+exportFasta(list, type = "nucleotide", names = c("rank", "aminoAcid",
8
+  "count"))
9
+}
10
+\arguments{
11
+\item{list}{A list of data frames consisting of antigen receptor sequences 
12
+imported by the LymphoSeq function readImmunoSeq.}
13
+
14
+\item{type}{A character vector indicating whether "aminoAcid" or "nucleotide" sequences
15
+should be exported.  If "aminoAcid" is specified, then run productiveSeqs first.}
16
+
17
+\item{names}{A character vector of one or more column names to name the sequences.
18
+If "rank" is specified, then the rank order of the sequences by frequency is used.}
19
+}
20
+\value{
21
+Exports fasta files to the working directory.
22
+}
23
+\description{
24
+Export nucleotide or amino acid sequences in fasta format.
25
+}
26
+\examples{
27
+file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
28
+
29
+file.list <- readImmunoSeq(path = file.path)
30
+
31
+exportFasta(list = file.list, type = "nucleotide", names = c("rank", "aminoAcid", "count"))
32
+
33
+productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
34
+
35
+exportFasta(list = productive.aa, type = "aminoAcid", names = "frequencyCount")
36
+}
37
+
0 38
new file mode 100644
... ...
@@ -0,0 +1,62 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/phyloTree.R
3
+\name{phyloTree}
4
+\alias{phyloTree}
5
+\title{Create phylogenetic tree}
6
+\usage{
7
+phyloTree(list, sample, type = "nucleotide", method = "ClustalOmega",
8
+  layout = "rectangular", aa.model = "JTT", nt.model = "F81")
9
+}
10
+\arguments{
11
+\item{list}{A list data frames of unproductive nucleotide sequences or productive 
12
+nucleotide or amino acid sequenes generated by the LymphoSeq function productiveSeq 
13
+where the parameter aggregate is set to "nucleotide".  vFamilyName, dFamilyName, and 
14
+jFamilyName are required columns.}
15
+
16
+\item{sample}{A character vector indicating the name of the sample in the productive
17
+sequence list.}
18
+
19
+\item{type}{A character vector indicating whether "aminoAcid" or "nucleotide" sequences
20
+should be aligned.  Available options are "nucleotide" and "aminoAcid".}
21
+
22
+\item{method}{A character vector indicating the multiple sequence alignment method to 
23
+be used.  Refer to the Bioconductor msa package for more details.  Options incude 
24
+"ClustalW", "ClustalOmega", and "Muscle".}
25
+
26
+\item{layout}{A character vector indicating the tree layout.  Options include 
27
+"rectangular", "slanted", "fan", "circular", "radial" and "unrooted".}
28
+
29
+\item{aa.model}{A character vector indicating the pairwise distance model to be used
30
+to compare amino acid sequences.  Refer to the phangorn package for more details.
31
+Available options incude "WAG", "JTT", "LG", "Dayhoff", "cpREV", "mtmam", "mtArt", "MtZoa",
32
+"mtREV24", "VT","RtREV", "HIVw", "HIVb", "FLU", "Blossum62", "Dayhoff_DCMut" and "JTT_DCMut".
33
+The default options is "JTT".}
34
+
35
+\item{nt.model}{A character vector indicating the pairwise distance model to be used
36
+to compare nucleotide sequences.  Refer to the phangorn package for more details.
37
+Available options incude "JC69" or "F81".  The default options is "F81".}
38
+}
39
+\value{
40
+Returns a phylogenetic tree where each leaf represents a sequence color coded by the
41
+V, D, and J gene usage.  The number next to each leaf refers to the sequence count.  A triangle 
42
+shaped leaf indicates the dominant sequence.  Refer to the ggtree Bioconductor package 
43
+documentation for details on how to manipulate the tree.
44
+}
45
+\description{
46
+Create a phylogenetic tree from amino acid or nucleotide CDR3 sequences in a list of 
47
+data frames.
48
+}
49
+\examples{
50
+file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
51
+
52
+file.list <- readImmunoSeq(path = file.path)
53
+
54
+productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
55
+
56
+phyloTree(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "nucleotide", 
57
+         layout = "rectangular", method = "ClustalW", nt.model = "F81")
58
+
59
+phyloTree(list = productive.nt, sample = "TCRB_Day1320_CD8_CMV", type = "aminoAcid", 
60
+         layout = "circular", method = "ClustalOmega", nt.model = "JTT")
61
+}
62
+
... ...
@@ -1,14 +1,13 @@
1 1
 ---
2 2
 title: "Analysis of high-throughput sequencing of T and B cell receptors with LymphoSeq"
3 3
 author: "David G. Coffey, MD"
4
-date: "`r Sys.Date()`"
4
+date: '`r Sys.Date()`'
5 5
 output: pdf_document
6 6
 geometry: margin=0.5in
7 7
 fontsize: 10pt
8
-vignette: >
9
-  %\VignetteIndexEntry{Analyze high throughput sequencing of T and B cell receptors with LymphoSeq}
10
-  %\VignetteEncoding{UTF-8}
11
-  %\VignetteEngine{knitr::rmarkdown}
8
+toc: yes
9
+vignette: |
10
+  %\VignetteIndexEntry{Analyze high throughput sequencing of T and B cell receptors with LymphoSeq} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown}
12 11
 ---
13 12
 
14 13
 Application of high-throughput sequencing of T and B lymphocyte antigen receptors has great potential for improving the monitoring of lymphoid malignancies, assessing immune reconstitution after hematopoietic stem cell transplantation, and characterizing the composition of lymphocyte repertoires.^[Warren, E. H. *et al.* *Blood* 2013;122:19–22.]   LymhoSeq is an R package designed to import, analyze, and visualize antigen receptor sequencing from Adaptive Biotechnologies' ImmunoSEQ assay.^[http://www.adaptivebiotech.com/immunoseq]  The package is also adaptable to the analysis of T and B cell receptor sequencing processed using other platforms such as MiXCR^[http://mixcr.milaboratory.com] or IMGT/HighV-QUEST.^[http://www.imgt.org/HighV-QUEST]  This vignette has been written to highlight some of the features of LymphoSeq and guide the user through a typical workflow.