Browse code

Adding pcaExplorer, oppar, pqsfinder, BgeeDB, genbankr, LymphoSeq, pbcmc, InteractionSet, ClusterSignificance

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/LymphoSeq@115899 bc3139a8-67e5-0310-9ffc-ced21a209358

Martin Morgan authored on 06/04/2016 16:10:37
Showing 66 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,20 @@
1
+Package: LymphoSeq
2
+Title: Analyze high-throughput sequencing of T and B cell receptors
3
+Version: 0.99.3
4
+Author: David Coffey <dcoffey@fredhutch.org>
5
+Maintainer: David Coffey <dcoffey@fredhutch.org>
6
+Description: This R package analyzes high-throughput sequencing of T
7
+        and B cell receptor complementarity determining region 3 (CDR3)
8
+        sequences generated by Adaptive Biotechnologies' ImmunoSEQ
9
+        assay.  Its input comes from tab-separated value (.tsv) files
10
+        exported from the ImmunoSEQ analyzer.
11
+Depends: R (>= 3.2), LymphoSeqDB
12
+Imports: data.table, plyr, dplyr, reshape, VennDiagram, ggplot2, ineq,
13
+        RColorBrewer, circlize, grid, utils, stats
14
+License: Artistic-2.0
15
+LazyData: true
16
+RoxygenNote: 5.0.1
17
+Suggests: knitr, pheatmap, wordcloud
18
+VignetteBuilder: knitr
19
+biocViews: Software, Technology, Sequencing, TargetedResequencing
20
+NeedsCompilation: no
0 21
new file mode 100644
... ...
@@ -0,0 +1,47 @@
1
+# Generated by roxygen2: do not edit by hand
2
+
3
+export(bhattacharyyaCoefficient)
4
+export(bhattacharyyaMatrix)
5
+export(chordDiagramVDJ)
6
+export(clonality)
7
+export(cloneTrack)
8
+export(commonSeqs)
9
+export(commonSeqsPlot)
10
+export(commonSeqsVenn)
11
+export(geneFreq)
12
+export(lorenzCurve)
13
+export(mergeFiles)
14
+export(pairwisePlot)
15
+export(productive)
16
+export(productiveSeq)
17
+export(readImmunoSeq)
18
+export(removeSeq)
19
+export(searchPublished)
20
+export(searchSeq)
21
+export(seqMatrix)
22
+export(similarityMatrix)
23
+export(similarityScore)
24
+export(topFreq)
25
+export(topSeqs)
26
+export(topSeqsPlot)
27
+export(uniqueSeqs)
28
+import(LymphoSeqDB)
29
+import(ggplot2)
30
+importFrom(RColorBrewer,brewer.pal)
31
+importFrom(VennDiagram,draw.pairwise.venn)
32
+importFrom(VennDiagram,draw.triple.venn)
33
+importFrom(circlize,chordDiagram)
34
+importFrom(circlize,colorRamp2)
35
+importFrom(data.table,fread)
36
+importFrom(dplyr,group_by)
37
+importFrom(dplyr,summarise)
38
+importFrom(grid,grid.draw)
39
+importFrom(grid,grid.newpage)
40
+importFrom(ineq,Gini)
41
+importFrom(ineq,Lc)
42
+importFrom(plyr,ldply)
43
+importFrom(plyr,llply)
44
+importFrom(reshape,melt.data.frame)
45
+importFrom(stats,aggregate)
46
+importFrom(stats,na.omit)
47
+importFrom(utils,adist)
0 48
new file mode 100644
... ...
@@ -0,0 +1,32 @@
1
+#' Bhattacharyya coefficient
2
+#' 
3
+#' Calculates the Bhattacharyya coefficient of two samples.
4
+#' 
5
+#' @param sample1 A data frame consisting of frequencies of antigen receptor 
6
+#' sequences.  "frequencyCount" is a required column.
7
+#' @param sample2 A data frame consisting of frequencies of antigen receptor 
8
+#' sequences.  "frequencyCount" is a required column.
9
+#' @return Returns the Bhattacharyya coefficient, a measure of the amount of 
10
+#' overlap between two samples.  The value ranges from 0 to 1 where 1 indicates 
11
+#' the sequence frequencies are identical in the two samples and 0 indicates no 
12
+#' shared frequencies. 
13
+#' @examples
14
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
15
+#' 
16
+#' file.list <- readImmunoSeq(path = file.path)
17
+#' 
18
+#' productive.aa <- productiveSeq(file.list, aggregate = "aminoAcid")
19
+#' 
20
+#' bhattacharyyaCoefficient(productive.aa[["TCRB_Day32_Unsorted"]], 
21
+#'    productive.aa[["TCRB_Day83_Unsorted"]])
22
+#' @seealso \code{\link{bhattacharyyaMatrix}}
23
+#' @export
24
+bhattacharyyaCoefficient <- function(sample1, sample2) {
25
+    p <- sample1[which(sample1[, "aminoAcid"] %in% sample2[, "aminoAcid"]), ]
26
+    p$frequencyCount <- p$frequencyCount/100
27
+    q <- sample2[which(sample2[, "aminoAcid"] %in% sample1[, "aminoAcid"]), ]
28
+    q$frequencyCount <- q$frequencyCount/100
29
+    s <- p[, "frequencyCount"] * q[, "frequencyCount"]
30
+    bc <- sum(sqrt(s))
31
+    return(bc)
32
+}
0 33
\ No newline at end of file
1 34
new file mode 100644
... ...
@@ -0,0 +1,41 @@
1
+#' Bhattacharyya matrix
2
+#' 
3
+#' Calculates the Bhattacharyya coefficient of all pairwise comparison from a 
4
+#' list of data frames.
5
+#' 
6
+#' @param productive.seqs A list data frames of productive sequences generated 
7
+#' by the LymphoSeq function productiveSeq.  "frequencyCount" and "aminoAcid" 
8
+#' are a required columns.
9
+#' @return A data frame of Bhattacharyya coefficients calculated from all 
10
+#' pairwise comparisons from a list of sample data frames.  The Bhattacharyya 
11
+#' coefficient is a measure of the amount of overlap between two samples.  The 
12
+#' value ranges from 0 to 1 where 1 indicates the sequence frequencies are 
13
+#' identical in the two samples and 0 indicates no shared frequencies.
14
+#' @examples
15
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
16
+#' 
17
+#' file.list <- readImmunoSeq(path = file.path)
18
+#' 
19
+#' productive.aa <- productiveSeq(file.list, aggregate = "aminoAcid")
20
+#' 
21
+#' bhattacharyyaMatrix(productive.seqs = productive.aa)
22
+#' @seealso \code{\link{pairwisePlot}} for plotting results as a heat map.
23
+#' @export
24
+bhattacharyyaMatrix <- function(productive.seqs) {
25
+    l <- length(productive.seqs)
26
+    m <- matrix(nrow = l, ncol = l)
27
+    rownames(m) <- names(productive.seqs)
28
+    colnames(m) <- names(productive.seqs)
29
+    for (i in 1:l) {
30
+        for (j in 1:i) {
31
+            m[i, j] <- bhattacharyyaCoefficient(productive.seqs[[i]], productive.seqs[[j]])
32
+            m[j, i] <- m[i, j]
33
+        }
34
+    }
35
+    m <- m[, colnames(m)[order(nchar(colnames(m)), colnames(m), 
36
+                               decreasing = TRUE)]]
37
+    m <- m[colnames(m)[order(nchar(colnames(m)), colnames(m), 
38
+                             decreasing = TRUE)], ]
39
+    d <- as.data.frame(m)
40
+    return(d)
41
+}
0 42
\ No newline at end of file
1 43
new file mode 100644
... ...
@@ -0,0 +1,70 @@
1
+#' Chord diagram of VJ or DJ gene associations
2
+#' 
3
+#' Creates a chord diagram showing VJ or DJ gene associations from one or more 
4
+#' samples.
5
+#' 
6
+#' @param sample A data frame consisting of frequencies of antigen receptor 
7
+#' sequences.  "vFamilyName", "jFamilyName", and if applicable, "dFamilyName" 
8
+#' are a required columns.  Using output from the LymphoSeq function topSeqs is
9
+#' recommended.
10
+#' @param association A character vector of gene familes to associate.  Options 
11
+#' include "VJ" or "DJ".
12
+#' @param colors A character vector of 2 colors corresponding to the V/D and J 
13
+#' gene colors respectively.
14
+#' @details The size of the ribbons connecting VJ or DJ genes correspond to the 
15
+#' number of samples or number of sequences that make up that recombination 
16
+#' event.  The thicker the ribbon, the higher the frequency of the recombination.
17
+#' @return Returns a chord diagram showing VJ or DJ gene associations from one or 
18
+#' more samples.
19
+#' @seealso \code{\link{topSeqs}}
20
+#' @examples
21
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
22
+#' 
23
+#' file.list <- readImmunoSeq(path = file.path)
24
+#' 
25
+#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
26
+#' 
27
+#' top.seqs <- topSeqs(productive.seqs = productive.nt, top = 1)
28
+#' 
29
+#' chordDiagramVDJ(sample = top.seqs, association = "VJ", colors = c("red", "blue"))
30
+#' 
31
+#' # Remove "TCRB" from gene family name
32
+#' top.seqs <- as.data.frame(apply(top.seqs, 2, function(x) gsub("TCRB", "", x)))
33
+#' 
34
+#' chordDiagramVDJ(sample = top.seqs, association = "VJ", colors = c("red", "blue"))
35
+#' @export
36
+#' @importFrom circlize colorRamp2 chordDiagram
37
+chordDiagramVDJ <- function(sample, association = "VJ", colors = c("red", "blue")) {
38
+    if (association == "VJ") {
39
+        if (!all(c("vFamilyName", "jFamilyName") %in% colnames(sample))) {
40
+            stop("The source data frame does not contain the required columns 'vFamilyName' and 'jFamilyName'.")
41
+        }
42
+        vj <- sample[, c("vFamilyName", "jFamilyName")]
43
+        vj[is.na(vj) | vj == ""] <- "unresolved"
44
+        vj$vFamilyName <- as.character(vj$vFamilyName)
45
+        vj$jFamilyName <- as.character(vj$jFamilyName)
46
+        table <- table(vj$vFamilyName, vj$jFamilyName)
47
+        matrix <- as.matrix(as.data.frame.matrix(table))
48
+        ribbon.color <- circlize::colorRamp2(range(matrix), c("grey", "black"))
49
+        circlize::chordDiagram(matrix, 
50
+                               annotationTrack = c("grid", "name"), 
51
+                               grid.col = c(rep(colors[1], dim(matrix)[1]), rep(colors[2], dim(matrix)[2])), 
52
+                               col = ribbon.color)
53
+    }
54
+    if (association == "DJ") {
55
+        if (!all(c("dFamilyName", "jFamilyName") %in% colnames(sample))) {
56
+            stop("The source data frame does not contain the required columns 'dFamilyName' and 'jFamilyName'.")
57
+        }
58
+        dj <- sample[, c("vFamilyName", "jFamilyName")]
59
+        dj[is.na(dj) | dj == ""] <- "unresolved"
60
+        dj$dFamilyName <- as.character(dj$dFamilyName)
61
+        dj$jFamilyName <- as.character(dj$jFamilyName)
62
+        table <- table(dj$dFamilyName, dj$jFamilyName)
63
+        matrix <- as.matrix(as.data.frame.matrix(table))
64
+        ribbon.color <- circlize::colorRamp2(range(matrix), c("grey", "black"))
65
+        circlize::chordDiagram(matrix, 
66
+                               annotationTrack = c("grid", "name"), 
67
+                               grid.col = c(rep(colors[1], dim(matrix)[1]), rep(colors[2], dim(matrix)[2])), 
68
+                               col = ribbon.color)
69
+    }
70
+}
0 71
\ No newline at end of file
1 72
new file mode 100644
... ...
@@ -0,0 +1,67 @@
1
+#' Clonality
2
+#' 
3
+#' Creates a data frame giving the total number of sequences, number of unique 
4
+#' productive sequences, number of genomes, entropy, clonality, Gini 
5
+#' coefficient, and the frequency (\%) of the top productive sequences in a list 
6
+#' of sample data frames.
7
+#' 
8
+#' @param file.list A list of data frames consisting of antigen receptor 
9
+#' sequencing imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", 
10
+#' "frequencyCount", and "estimatedNumberGenomes" are required columns.  Note 
11
+#' that the function is not intended to be run using a productive sequence list 
12
+#' generated by the function productiveSeq.
13
+#' @return Returns a data frame giving the total number of sequences, number of 
14
+#' unique productive sequences, number of genomes, entropy, clonality, 
15
+#' Gini coefficient, and the frequency (\%) of the top productive sequence in each sample.
16
+#' @details Clonality is derived from the Shannon entropy, which is calculated 
17
+#' from the frequencies of all productive sequences divided by the logarithm of 
18
+#' the total number of unique productive sequences.  This normalized entropy 
19
+#' value is then inverted (1 - normalized entropy) to produce the clonality 
20
+#' metric.  
21
+#' 
22
+#' The Gini coefficient is an alternative metric used to calculate repertoire 
23
+#' diversity and is derived from the Lorenz curve.  The Lorenz curve is drawn 
24
+#' such that x-axis represents the cumulative percentage of unique sequences and 
25
+#' the y-axis represents the cumulative percentage of reads.  A line passing 
26
+#' through the origin with a slope of 1 reflects equal frequencies of all clones.  
27
+#' The Gini coefficient is the ratio of the area between the line of equality 
28
+#' and the observed Lorenz curve over the total area under the line of equality.  
29
+#' Both Gini coefficient and clonality are reported on a scale from 0 to 1 where 
30
+#' 0 indicates all sequences have the same frequency and 1 indicates the 
31
+#' repertoire is dominated by a single sequence.
32
+#' @examples
33
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
34
+#' 
35
+#' file.list <- readImmunoSeq(path = file.path)
36
+#' 
37
+#' clonality(file.list = file.list)
38
+#' @seealso \code{\link{lorenzCurve}}
39
+#' @export
40
+#' @importFrom ineq Gini
41
+clonality <- function(file.list) {
42
+    table <- data.frame(samples = names(file.list))
43
+    i <- 1
44
+    for (i in 1:length(file.list)) {
45
+        file <- file.list[[i]]
46
+        total.reads <- length(file$nucleotide)
47
+        total.genomes <- sum(file$estimatedNumberGenomes)
48
+        total.count <- sum(file$count)
49
+        productive <- file[!grepl("\\*", file$aminoAcid) & file$aminoAcid != "", ]
50
+        frequency <- productive$count/sum(productive$count)
51
+        entropy <- -sum(frequency * log2(frequency))
52
+        unique.productive <- length(productive$nucleotide)
53
+        clonality <- 1 - (entropy/log2(unique.productive))
54
+        table$totalSequences[i] <- total.reads
55
+        table$uniqueProductiveSequences[i] <- unique.productive
56
+        table$totalGenomes[i] <- total.genomes
57
+        table$totalCount[i] <- total.count
58
+        table$entropy[i] <- entropy
59
+        table$clonality[i] <- clonality
60
+        table$giniCoefficient[i] <- ineq::Gini(frequency)
61
+        table$topProductiveSequence[i] <- max(frequency) * 100
62
+    }
63
+    table$samples <- as.character(table$samples)
64
+    table <- table[order(nchar(table$samples), table$samples, decreasing = FALSE), ]
65
+    rownames(table) <- NULL
66
+    return(table)
67
+} 
0 68
\ No newline at end of file
1 69
new file mode 100644
... ...
@@ -0,0 +1,158 @@
1
+#' Clone tracking plot
2
+#' 
3
+#' Creates line plot tracking amino acid frequencies across multiple samples
4
+#' 
5
+#' @param sequence.matrix A sequence matrix generated from the LymphoSeq 
6
+#' function seqMatrix.
7
+#' @param map An optional character vector of one or more sample names contained 
8
+#' in the productive.aa list.  If the same sequence appears in multiple mapped 
9
+#' samples, then it will be assigned the label of the first listed sample only.
10
+#' @param productive.aa A list of data frames of productive amino acid sequences 
11
+#' containing the samples to be mapped.  This parameter is only required if 
12
+#' sequence mapping is performed.
13
+#' @param label An optional character vector of one or more labels used to 
14
+#' annotate the mapped sequences.  The order of the labels must match the order 
15
+#' of the samples listed in map.
16
+#' @param track An optional character vector of one or more amino acid sequences 
17
+#' to track.
18
+#' @param unassigned A Boolean value indicating whether or not to draw the lines 
19
+#' of sequences not being mapped or tracked.  If TRUE then the unassigned 
20
+#' sequences are drawn.  If FALSE, then the unassigned sequences are not drawn.
21
+#' @return Returns a line plot showing the amino acid frequencies across 
22
+#' multiple samples in the sequence matrix where each line represents one 
23
+#' unique sequence.
24
+#' @details The plot is made using the package ggplot2 and can be reformatted
25
+#' using ggplot2 functions.  See examples below.
26
+#' @seealso An excellent resource for examples on how to reformat a ggplot can 
27
+#' be found in the R Graphics Cookbook online (\url{http://www.cookbook-r.com/Graphs/}).
28
+#' @examples
29
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
30
+#' 
31
+#' file.list <- readImmunoSeq(path = file.path)
32
+#' 
33
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
34
+#' 
35
+#' top.freq <- topFreq(productive.aa = productive.aa, percent = 0.1)
36
+#' 
37
+#' sequence.matrix <- seqMatrix(productive.aa = productive.aa, sequences = top.freq$aminoAcid)
38
+#' 
39
+#' # Track clones without mapping or tracking specific sequences
40
+#' cloneTrack(sequence.matrix = sequence.matrix)
41
+#' 
42
+#' # Track top 20 clones mapping to the CD4 and CD8 samples
43
+#' cloneTrack(sequence.matrix = sequence.matrix, productive.aa = productive.aa, 
44
+#'    map = c("TCRB_Day949_CD4", "TCRB_Day949_CD8"), label = c("CD4", "CD8"), 
45
+#'    track = top.freq$aminoAcid[1:20], unassigned = TRUE) 
46
+#' 
47
+#' # Track the top 10 clones from top.freq
48
+#' cloneTrack(sequence.matrix = sequence.matrix, productive.aa = productive.aa, 
49
+#'    track = top.freq$aminoAcid[1:10], unassigned = FALSE) 
50
+#' 
51
+#' # Track clones mapping to the CD4 and CD8 samples while ignoring all others
52
+#' cloneTrack(sequence.matrix = sequence.matrix, productive.aa = productive.aa, 
53
+#'    map = c("TCRB_Day949_CD4", "TCRB_Day949_CD8"), label = c("CD4", "CD8"), 
54
+#'    unassigned = FALSE) 
55
+#' 
56
+#' # Track clones mapping to the CD4 and CD8 samples and track 2 specific sequences
57
+#' cloneTrack(sequence.matrix = sequence.matrix, productive.aa = productive.aa, 
58
+#'    map = c("TCRB_Day949_CD4", "TCRB_Day949_CD8"), label = c("CD4", "CD8"), 
59
+#'    track = c("CASSPPTGERDTQYF", "CASSQDRTGQYGYTF"), unassigned = FALSE)
60
+#' 
61
+#' # Reorder the x axis, change the axis labels, convert to log scale, and add title
62
+#' x.limits <- c("TCRB_Day0_Unsorted", "TCRB_Day32_Unsorted", 
63
+#'    "TCRB_Day83_Unsorted", "TCRB_Day949_Unsorted", "TCRB_Day1320_Unsorted")
64
+#' 
65
+#' sequence.matrix <- sequence.matrix[ ,c("aminoAcid", x.limits)]
66
+#'    
67
+#' clone.track <- cloneTrack(sequence.matrix = sequence.matrix, 
68
+#'    productive.aa = productive.aa, track = top.freq$aminoAcid[1:10], unassigned = FALSE) 
69
+#' 
70
+#' x.labels <- c("Day 0", "Day 32", "Day 83", "Day 949", "Day 1320")
71
+#' 
72
+#' clone.track + 
73
+#'    ggplot2::scale_x_discrete(expand = c(0,0), labels = x.labels) + 
74
+#'    ggplot2::scale_y_log10() + ggplot2::annotation_logticks(sides = "l") + 
75
+#'    ggplot2::ggtitle("Figure Title")
76
+#' @export
77
+#' @import ggplot2
78
+#' @importFrom RColorBrewer brewer.pal
79
+#' @importFrom reshape melt.data.frame
80
+cloneTrack <- function(sequence.matrix, map = "none", productive.aa, 
81
+                       label = "none", track = "none", unassigned = TRUE) {
82
+    if (any(map != "none")) {
83
+        sequence.matrix$numberSamples <- NULL
84
+        sequence.melt <- reshape::melt.data.frame(sequence.matrix, 
85
+                                                  id.vars = "aminoAcid")
86
+        sequence.melt$map <- rep("Unassigned", length(sequence.matrix$aminoAcid))
87
+        if (any(track != "none")) {
88
+            i <- 1
89
+            for (i in 1:length(track)) {
90
+                sequence.melt$map <- ifelse(sequence.melt$aminoAcid == track[i], 
91
+                                            track[i], sequence.melt$map)
92
+            }
93
+        }
94
+        j <- 1
95
+        for (j in 1:length(map)) {
96
+            file <- productive.aa[[map[j]]]
97
+            aminoAcid <- as.character(file$aminoAcid)
98
+            sequence.melt$map <- ifelse(sequence.melt$aminoAcid %in% aminoAcid & 
99
+                                            sequence.melt$map == "Unassigned", 
100
+                                        label[j], sequence.melt$map)
101
+        }
102
+        if (unassigned == FALSE) {
103
+            sequence.melt <- sequence.melt[sequence.melt$map != "Unassigned", ]
104
+        }
105
+        getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))
106
+        plot <- ggplot2::ggplot(sequence.melt, 
107
+                                aes_string(x = "variable", 
108
+                                           y = "value", 
109
+                                           group = "aminoAcid", 
110
+                                           color = "map")) + 
111
+            geom_line() + 
112
+            theme_minimal() + 
113
+            scale_x_discrete(expand = c(0, 0)) + 
114
+            scale_y_continuous(expand = c(0, 0)) + 
115
+            labs(x = "", y = "Frequency (%)", color = "") + 
116
+            scale_colour_manual(values = c(getPalette(length(label) + 
117
+                                                          length(track) + 1))) + 
118
+            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
119
+        return(plot)
120
+    } else {
121
+        sequence.matrix$numberSamples <- NULL
122
+        sequence.melt <- reshape::melt.data.frame(sequence.matrix, id.vars = "aminoAcid")
123
+        if (any(track != "none")) {
124
+            sequence.melt$map <- rep("Unassigned", length(sequence.matrix$aminoAcid))
125
+            k <- 1
126
+            for (k in 1:length(track)) {
127
+                sequence.melt$map <- ifelse(sequence.melt$aminoAcid == track[k], 
128
+                                            track[k], sequence.melt$map)
129
+            }
130
+            if (unassigned == FALSE) {
131
+                sequence.melt <- sequence.melt[sequence.melt$map != "Unassigned", 
132
+                                               ]
133
+            }
134
+            getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))
135
+            plot <- ggplot2::ggplot(sequence.melt, aes_string(x = "variable", 
136
+                                                              y = "value", 
137
+                                                              group = "aminoAcid", 
138
+                                                              color = "map")) + 
139
+                geom_line() + theme_minimal() + 
140
+                scale_x_discrete(expand = c(0,0)) + 
141
+                scale_y_continuous(expand = c(0, 0)) + 
142
+                labs(x = "", y = "Frequency (%)", color = "") + 
143
+                scale_colour_manual(values = getPalette(length(track) + 1)) + 
144
+                theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
145
+            return(plot)
146
+        }
147
+        plot <- ggplot2::ggplot(sequence.melt, aes_string(x = "variable", 
148
+                                                          y = "value", 
149
+                                                          group = "aminoAcid")) + 
150
+            geom_line() + 
151
+            theme_minimal() + 
152
+            scale_x_discrete(expand = c(0, 0)) + 
153
+            scale_y_continuous(expand = c(0, 0)) + 
154
+            labs(x = "", y = "Frequency (%)") + 
155
+            theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
156
+        return(plot)
157
+    }
158
+} 
0 159
\ No newline at end of file
1 160
new file mode 100644
... ...
@@ -0,0 +1,32 @@
1
+#' Common sequences in two or more samples
2
+#' 
3
+#' Creates a data frame of the common sequences in two or more samples, reporting 
4
+#' their frequencies in each.
5
+#' 
6
+#' @param samples A character vector of two or more sample names in 
7
+#' productive.aa.
8
+#' @param productive.aa A list of productive amino acid sequences generated
9
+#' by the LymphoSeq function productiveSeq where aggregate = "aminoAcid".
10
+#' @return Returns a data frame of the common sequences between two or more files 
11
+#' displaying their frequencies in each.
12
+#' @seealso \code{\link{commonSeqsVenn}}
13
+#' @examples
14
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
15
+#' 
16
+#' file.list <- readImmunoSeq(path = file.path)
17
+#' 
18
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
19
+#' 
20
+#' commonSeqs(samples = c("TCRB_Day0_Unsorted", "TCRB_Day32_Unsorted"), 
21
+#'    productive.aa = productive.aa)
22
+#' @export
23
+#' @importFrom plyr llply
24
+#' @import ggplot2
25
+commonSeqs <- function(samples, productive.aa) {
26
+    aminoAcid <- plyr::llply(productive.aa[samples], 
27
+                             function(x) x[, "aminoAcid"])
28
+    common <- Reduce(intersect, aminoAcid)
29
+    seq.matrix <- seqMatrix(productive.aa[samples], common)
30
+    seq.matrix$numberSamples <- NULL
31
+    return(seq.matrix)
32
+}
0 33
\ No newline at end of file
1 34
new file mode 100644
... ...
@@ -0,0 +1,59 @@
1
+#' Common sequences plot
2
+#' 
3
+#' Creates a scatter plot of just the sequences in common between two samples.
4
+#' 
5
+#' @param sample1 A name of a sample in a list of data frames generated by the 
6
+#' LymphoSeq function productiveSeq.
7
+#' @param sample2 A name of a sample in a list of data frames generated by the 
8
+#' LymphoSeq function productiveSeq.
9
+#' @param productive.aa A list of data frames of productive amino acid sequences 
10
+#' produced by the LymphoSeq function productiveSeq containing the 
11
+#' samples to be compared.
12
+#' @return Returns a frequency scatter plot of two samples showing only the 
13
+#' shared sequences.
14
+#' @details The plot is made using the package ggplot2 and can be reformatted
15
+#' using ggplot2 functions.  See examples below.
16
+#' @seealso An excellent resource for examples on how to reformat a ggplot can 
17
+#' be found in the R Graphics Cookbook online (\url{http://www.cookbook-r.com/Graphs/}).
18
+#' @examples
19
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
20
+#' 
21
+#' file.list <- readImmunoSeq(path = file.path)
22
+#' 
23
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
24
+#' 
25
+#' commonSeqsPlot("TCRB_Day32_Unsorted", "TCRB_Day83_Unsorted", 
26
+#'    productive.aa = productive.aa)
27
+#' 
28
+#' # Change the X and Y axises to log-10 scale
29
+#' commonSeqsPlot("TCRB_Day32_Unsorted", "TCRB_Day83_Unsorted", 
30
+#'    productive.aa = productive.aa) +
31
+#'    ggplot2::scale_x_log10() + 
32
+#'    ggplot2::scale_y_log10() + 
33
+#'    ggplot2::annotation_logticks(sides = "bl")
34
+#' @export
35
+#' @import ggplot2
36
+commonSeqsPlot <- function(sample1, sample2, productive.aa) {
37
+    if(any(unlist(lapply(productive.aa, function(x) 
38
+        x[, "aminoAcid"] == "" |
39
+        grepl("\\*", x[, "aminoAcid"]) | 
40
+        duplicated(x[, "aminoAcid"]))))){
41
+        stop("Your list contains unproductive sequences or has not been aggreated for productive amino acid sequences.  Remove unproductive sequences first using the function productiveSeq with the aggregate parameter set to 'aminoAcid'.", call. = FALSE)
42
+    }
43
+    a <- productive.aa[[sample1]]
44
+    b <- productive.aa[[sample2]]
45
+    common <- intersect(a$aminoAcid, b$aminoAcid)
46
+    common.seq <- data.frame(aminoAcid = common)
47
+    common.a <- a[a$aminoAcid %in% common, ]
48
+    common.b <- b[b$aminoAcid %in% common, ]
49
+    common.seq$freq.a <- common.a$freq
50
+    common.seq$freq.b <- common.b$freq
51
+    common.seq$difference <- common.a$freq - common.b$freq
52
+    common.seq$log.fold.change <- log(common.a$freq/common.b$freq)
53
+    common.seq <- common.seq[order(common.seq$log.fold.change, decreasing = TRUE), ]
54
+    plot <- ggplot2::ggplot(data = common.seq, aes_string("freq.a", "freq.b")) + 
55
+        geom_point(size = 3) +
56
+        theme_minimal() + 
57
+        labs(x = paste(sample1, "frequency (%)"), y = paste(sample2, "frequency (%)"))
58
+    return(plot)
59
+}
0 60
\ No newline at end of file
1 61
new file mode 100644
... ...
@@ -0,0 +1,84 @@
1
+#' Common sequences Venn diagram
2
+#' 
3
+#' Creates a Venn diagram comparing the number of common sequences in two or 
4
+#' three samples.
5
+#' 
6
+#' @param samples A character vector of two or three names of samples in 
7
+#' productive.seqs to compare.
8
+#' @param productive.seqs A list of productive amino acid sequences generated
9
+#' by the LymphoSeq function productiveSeq.
10
+#' @return Returns a a Venn diagram of the number of common sequences between
11
+#' two or three samples.
12
+#' @seealso \code{\link{commonSeqs}}
13
+#' @examples
14
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
15
+#' 
16
+#' file.list <- readImmunoSeq(path = file.path)
17
+#' 
18
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
19
+#' 
20
+#' # Plot a triple Venn diagram
21
+#' commonSeqsVenn(samples = c("TCRB_Day0_Unsorted", 
22
+#'    "TCRB_Day32_Unsorted", "TCRB_Day83_Unsorted"), 
23
+#'    productive.seqs = productive.aa)
24
+#' 
25
+#' # Plot a double Venn diagram
26
+#' commonSeqsVenn(samples = c("TCRB_Day0_Unsorted", 
27
+#'    "TCRB_Day32_Unsorted"), productive.seqs = productive.aa)
28
+#' 
29
+#' # Save Venn diagram as a .png file to working directory
30
+#' png(filename = "Venn diagram.png", res = 300, units = "in", height = 5, width = 5)
31
+#' 
32
+#' commonSeqsVenn(samples = c("TCRB_Day0_Unsorted", "TCRB_Day32_Unsorted"), 
33
+#'    productive.seqs = productive.aa)
34
+#' 
35
+#' dev.off()
36
+#' @export
37
+#' @importFrom VennDiagram draw.pairwise.venn draw.triple.venn
38
+#' @importFrom grid grid.newpage grid.draw
39
+commonSeqsVenn <- function(samples, productive.seqs) {
40
+    if (length(samples) > 3 | length(samples) < 2) {
41
+        stop("Please enter 2 or 3 samples.")
42
+    }
43
+    if (length(samples) == 2) {
44
+        a <- productive.seqs[[samples[1]]]
45
+        b <- productive.seqs[[samples[2]]]
46
+        grid::grid.newpage()
47
+        venn <- VennDiagram::draw.pairwise.venn(area1 = length(a$aminoAcid), 
48
+                                                area2 = length(b$aminoAcid), 
49
+                                                cross.area = length(intersect(a$aminoAcid, b$aminoAcid)), 
50
+                                                category = c(samples[1], samples[2]), 
51
+                                                cat.fontfamily = rep("sans", 2), 
52
+                                                fontfamily = rep("sans", 3), 
53
+                                                fill = c("#3288bd", "#d53e4f"), 
54
+                                                cat.pos = c(0, 0),
55
+                                                cat.dist = rep(0.025, 2),
56
+                                                cex = 1, 
57
+                                                cat.cex = 0.7,
58
+                                                lwd = rep(2, 2))
59
+        grid::grid.draw(venn)
60
+    }
61
+    if (length(samples) == 3) {
62
+        a <- productive.seqs[[samples[1]]]
63
+        b <- productive.seqs[[samples[2]]]
64
+        c <- productive.seqs[[samples[3]]]
65
+        grid::grid.newpage()
66
+        venn <- VennDiagram::draw.triple.venn(area1 = length(a$aminoAcid), 
67
+                                              area2 = length(b$aminoAcid), 
68
+                                              area3 = length(c$aminoAcid), 
69
+                                              n12 = length(intersect(a$aminoAcid, b$aminoAcid)), 
70
+                                              n23 = length(intersect(b$aminoAcid, c$aminoAcid)), 
71
+                                              n13 = length(intersect(a$aminoAcid, c$aminoAcid)), 
72
+                                              n123 = length(Reduce(intersect, list(a$aminoAcid, b$aminoAcid, c$aminoAcid))), 
73
+                                              category = c(samples[1], samples[2], samples[3]), 
74
+                                              cat.fontfamily = rep("sans", 3), 
75
+                                              fontfamily = rep("sans", 7), 
76
+                                              fill = c("#3288bd", "#abdda4", "#d53e4f"), 
77
+                                              cat.pos = c(0, 0, 180), 
78
+                                              cat.dist = rep(0.025, 3),
79
+                                              cex = 1, 
80
+                                              cat.cex = 0.7,
81
+                                              lwd = rep(2, 3))
82
+        grid::grid.draw(venn)
83
+    }
84
+} 
0 85
\ No newline at end of file
1 86
new file mode 100644
... ...
@@ -0,0 +1,127 @@
1
+#' Gene frequencies
2
+#' 
3
+#' Creates a data frame of VDJ gene counts and frequencies.
4
+#' 
5
+#' @param productive.nt A list of one or more data frames of productive sequences 
6
+#' generated by the LymphoSeq function productiveSeq where the parameter 
7
+#' aggregate is set to "nucleotide".
8
+#' @param locus A character vector indicating which VDJ genes to include in the 
9
+#' output.  Available options include "VDJ", "DJ", "VJ", "DJ", "V", "D", or "J".
10
+#' @param family A Boolean value indicating whether or not family names instead 
11
+#' of gene names are used.  If TRUE, then family names are used and if FALSE, 
12
+#' gene names are used.
13
+#' @return Returns a data frame with the sample names, VDJ gene name, count, and 
14
+#' \% frequency of the V, D, or J genes (each gene frequency should add to 
15
+#' 100\% for each sample).
16
+#' @examples
17
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
18
+#' 
19
+#' file.list <- readImmunoSeq(path = file.path)
20
+#' 
21
+#' productive.nt <- productiveSeq(file.list = file.list, aggregate = "nucleotide")
22
+#' 
23
+#' geneFreq(productive.nt, locus = "VDJ", family = FALSE)
24
+#' 
25
+#' # Create a heat map of V gene usage
26
+#' vfamilies <- geneFreq(productive.nt, locus = "V", family = TRUE)
27
+#' 
28
+#' require(reshape)
29
+#' 
30
+#' vfamilies <- reshape::cast(vfamilies, familyName ~ samples, value = "frequencyGene", sum)
31
+#' 
32
+#' rownames(vfamilies) <- as.character(vfamilies$familyName)
33
+#' 
34
+#' vfamilies$familyName <- NULL
35
+#' 
36
+#' RedBlue <- grDevices::colorRampPalette(rev(RColorBrewer::brewer.pal(11, "RdBu")))(256)
37
+#' 
38
+#' require(pheatmap)
39
+#' 
40
+#' pheatmap::pheatmap(vfamilies, color = RedBlue, scale = "row")
41
+#' 
42
+#' # Create a word cloud of V gene usage
43
+#' vgenes <- geneFreq(productive.nt, locus = "V", family = FALSE)
44
+#' 
45
+#' require(wordcloud)
46
+#' 
47
+#' wordcloud::wordcloud(words = vgenes[vgenes$samples == "TCRB_Day83_Unsorted", "geneName"], 
48
+#'    freq = vgenes[vgenes$samples == "TCRB_Day83_Unsorted", "frequencyGene"], 
49
+#' 	  colors = RedBlue)
50
+#' 
51
+#' # Create a cumulative frequency bar plot of V gene usage
52
+#' vgenes <- geneFreq(productive.nt, locus = "V", family = FALSE)
53
+#' 
54
+#' require(ggplot2)
55
+#' 
56
+#' ggplot2::ggplot(vgenes, aes(x = samples, y = frequencyGene, fill = geneName)) +
57
+#'   geom_bar(stat = "identity") +
58
+#'   theme_minimal() + 
59
+#'   scale_y_continuous(expand = c(0, 0)) + 
60
+#'   guides(fill = guide_legend(ncol = 3)) +
61
+#'   labs(y = "Frequency (%)", x = "", fill = "") +
62
+#'   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
63
+#' @export
64
+#' @importFrom stats aggregate
65
+geneFreq <- function(productive.nt, locus = "VDJ", family = FALSE) {
66
+    gene.names <- data.frame()
67
+    if (family == FALSE) {
68
+        i <- 1
69
+        for (i in 1:length(productive.nt)) {
70
+            file <- productive.nt[[i]]
71
+            file[file == ""] <- "unresolved"
72
+            name <- names(productive.nt)
73
+            v <- data.frame()
74
+            d <- data.frame()
75
+            j <- data.frame()
76
+            if (grepl("V|v", locus)) {
77
+                v <- aggregate(count ~ vGeneName, data = file, FUN = sum)
78
+                v <- data.frame(samples = name[i], geneName = v$vGeneName, 
79
+                                count = v$count, 
80
+                                frequencyGene = v$count/sum(v$count) * 100)
81
+            }
82
+            if (grepl("D|d", locus)) {
83
+                d <- aggregate(count ~ dGeneName, data = file, FUN = sum)
84
+                d <- data.frame(samples = name[i], geneName = d$dGeneName, 
85
+                                count = d$count, 
86
+                                frequencyGene = d$count/sum(d$count) * 100)
87
+            }
88
+            if (grepl("J|j", locus)) {
89
+                j <- aggregate(count ~ jGeneName, data = file, FUN = sum)
90
+                j <- data.frame(samples = name[i], geneName = j$jGeneName, 
91
+                                count = j$count, 
92
+                                frequencyGene = j$count/sum(j$count) * 100)
93
+            }
94
+            gene.names <- rbind(v, d, j, gene.names)
95
+        }
96
+    } else {
97
+        i <- 1
98
+        for (i in 1:length(productive.nt)) {
99
+            file <- productive.nt[[i]]
100
+            file[file == ""] <- "unresolved"
101
+            name <- names(productive.nt)
102
+            v <- data.frame()
103
+            d <- data.frame()
104
+            j <- data.frame()
105
+            if (grepl("V|v", locus)) {
106
+                v <- aggregate(count ~ vFamilyName, data = file, FUN = sum)
107
+                v <- data.frame(samples = name[i], familyName = v$vFamilyName, 
108
+                                count = v$count, 
109
+                                frequencyGene = v$count/sum(v$count) * 100)
110
+            }
111
+            if (grepl("D|d", locus)) {
112
+                d <- aggregate(count ~ dFamilyName, data = file, FUN = sum)
113
+                d <- data.frame(samples = name[i], familyName = d$dFamilyName, 
114
+                                count = d$count, 
115
+                                frequencyGene = d$count/sum(d$count) * 100)
116
+            }
117
+            if (grepl("J|j", locus)) {
118
+                j <- aggregate(count ~ jFamilyName, data = file, FUN = sum)
119
+                j <- data.frame(samples = name[i], familyName = j$jFamilyName, 
120
+                                count = j$count, 
121
+                                frequencyGene = j$count/sum(j$count) * 100)
122
+            }
123
+            gene.names <- rbind(v, d, j, gene.names)
124
+        }
125
+    }
126
+    return(gene.names)
127
+}
0 128
\ No newline at end of file
1 129
new file mode 100644
... ...
@@ -0,0 +1,72 @@
1
+#' Lorenz curve
2
+#' 
3
+#' Plots a Lorenz curve derived from the frequency of the amino acid sequences.
4
+#' 
5
+#' @param samples A character vector of sample names in list.
6
+#' @param list A list data frames generated using the LymphoSeq function readImmunoSeq 
7
+#' or productiveSeq.  "frequencyCount" is a required column.
8
+#' @return Returns a Lorenz curve.
9
+#' @details The Gini coefficient is an alternative metric used to calculate 
10
+#' repertoire diversity and is derived from the Lorenz curve.  The Lorenz curve 
11
+#' is drawn such that x-axis represents the cumulative percentage of unique 
12
+#' sequences and the y-axis represents the cumulative percentage of reads.  A 
13
+#' line passing through the origin with a slope of 1 reflects equal frequencies 
14
+#' of all sequences.  The Gini coefficient is the ratio of the area between the 
15
+#' line of equality and the observed Lorenz curve over the total area under the 
16
+#' line of equality.
17
+#' 
18
+#' The plot is made using the package ggplot2 and can be reformatted
19
+#' using ggplot2 functions.  See examples below.
20
+#' @seealso An excellent resource for examples on how to reformat a ggplot can 
21
+#' be found in the R Graphics Cookbook online (\url{http://www.cookbook-r.com/Graphs/}).
22
+#' @examples
23
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
24
+#' 
25
+#' file.list <- readImmunoSeq(path = file.path)
26
+#' 
27
+#' lorenzCurve(samples = names(file.list), list = file.list)
28
+#' 
29
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
30
+#' 
31
+#' lorenzCurve(samples = names(productive.aa), list = productive.aa)
32
+#' 
33
+#' # Change the legend labels, line colors, and add a title
34
+#' samples <- c("TCRB_Day0_Unsorted", "TCRB_Day32_Unsorted", 
35
+#'    "TCRB_Day83_Unsorted", "TCRB_Day949_Unsorted", "TCRB_Day1320_Unsorted")
36
+#' 
37
+#' lorenz.curve <- lorenzCurve(samples = samples, list = productive.aa)
38
+#' 
39
+#' labels <- c("Day 0", "Day 32", "Day 83", "Day 949", "Day 1320")
40
+#' 
41
+#' colors <- c("navyblue", "red", "darkgreen", "orange", "purple")
42
+#' 
43
+#' lorenz.curve + ggplot2::scale_color_manual(name = "Samples", breaks = samples, 
44
+#'    labels = labels, values = colors) + ggplot2::ggtitle("Figure Title")
45
+#' @export
46
+#' @import ggplot2
47
+#' @importFrom RColorBrewer brewer.pal
48
+#' @importFrom ineq Lc
49
+lorenzCurve <- function(samples, list) {
50
+    lorenz <- data.frame()
51
+    i <- 1
52
+    for (i in 1:length(samples)) {
53
+        sample <- samples[i]
54
+        file <- list[[sample]]
55
+        lc <- ineq::Lc(file$frequencyCount)
56
+        lcdf <- data.frame(L = lc$L, p = lc$p)
57
+        lcdf$sample <- rep(sample, nrow(lcdf))
58
+        lorenz <- rbind(lcdf, lorenz)
59
+    }
60
+    getPalette <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))
61
+    plot <- ggplot2::ggplot(lorenz, aes_string(x = "p", y = "L", color = "sample")) + 
62
+        geom_line(size = 1) + 
63
+        theme_minimal() + 
64
+        scale_color_manual(values = getPalette(length(samples) + 1)) + 
65
+        scale_y_continuous(expand = c(0, 0)) + 
66
+        scale_x_continuous(expand = c(0,0)) + 
67
+        geom_abline(intercept = 0, slope = 1, color = "grey", linetype = 2) + 
68
+        coord_fixed() + 
69
+        labs(x = "Cumulative percentage of unique sequences", 
70
+             y = "Cumulative percentage of reads", color = "")
71
+    return(plot)
72
+}
0 73
new file mode 100644
... ...
@@ -0,0 +1,49 @@
1
+#' Merge files
2
+#' 
3
+#' Merges two or more sample data frames into a single data frame and aggregates 
4
+#' count, frequencyCount, and estimatedNumberGenomes.
5
+#' 
6
+#' @param samples A character vector of the names of the samples that are to be 
7
+#' merged in file.list.
8
+#' @param file.list A list of data frames imported using the LymphoSeq function
9
+#' readImmunoSeq that contain the sample names that are to be merged.  "aminoAcid", 
10
+#' "nucleotide", "count" and "frequencyCount" are required columns.
11
+#' @return Returns a data frame of the merged samples.  The values of count, 
12
+#' frequencyCount, and estimatedNumberGenomes are aggregated.  That is, the sum 
13
+#' of count and estimatedNumberGenomes columns of the merged data frame should 
14
+#' equal the sum of the columns from the unmerged samples.  Likewise, the 
15
+#' frequencyCount of the merged data frame should add up to 100\%.  All other 
16
+#' columns in the unmerged data frames are included in the merge data frame.
17
+#' @examples
18
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
19
+#' 
20
+#' file.list <- readImmunoSeq(path = file.path)
21
+#' 
22
+#' TCRB_Day949_Merged <- mergeFiles(samples = c("TCRB_Day949_CD4", 
23
+#'    "TCRB_Day949_CD8"), file.list)
24
+#' 
25
+#' # To combine the merged data frames with file.list
26
+#' file.list <- c(list("TCRB_Day949_Merged" = TCRB_Day949_Merged), file.list)
27
+#' @export
28
+#' @importFrom stats aggregate
29
+mergeFiles <- function(samples, file.list) {
30
+    merged <- data.frame()
31
+    i <- 1
32
+    for (i in 1:length(samples)) {
33
+        file <- file.list[[as.character(samples[i])]]
34
+        merged <- rbind(merged, file)
35
+    }
36
+    count <- aggregate(count ~ nucleotide, data = merged, FUN = sum)
37
+    if (any(grepl("estimatedNumberGenomes", colnames(merged)))) {
38
+        merged$estimatedNumberGenomes[is.na(merged$estimatedNumberGenomes)] <- 0
39
+        estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ nucleotide, 
40
+                                            data = merged, FUN = sum)
41
+    }
42
+    merged$count <- NULL
43
+    merged$estimatedNumberGenomes <- NULL
44
+    merged <- merged[!duplicated(merged$nucleotide), ]
45
+    merged <- Reduce(function(x, y) 
46
+        merge(x, y, by = "nucleotide"), list(count, estimatedNumberGenomes, merged))
47
+    merged$frequencyCount <- (merged$count/sum(merged$count)) * 100
48
+    return(merged)
49
+} 
0 50
\ No newline at end of file
1 51
new file mode 100644
... ...
@@ -0,0 +1,57 @@
1
+#' Pairwise comparison plot
2
+#' 
3
+#' Creates a heat map from a similarity or Bhattacharyya matrix.
4
+#' 
5
+#' @param matrix A similarity or Bhattacharyya matrix produced by the LymphoSeq 
6
+#' functions similarityMatrix or bhattacharyyaMatrix.
7
+#' @return A pairwise comparison heat map.
8
+#' @details The plot is made using the package ggplot2 and can be reformatted
9
+#' using ggplot2 functions.  See examples below.
10
+#' @seealso An excellent resource for examples on how to reformat a ggplot can 
11
+#' be found in the R Graphics Cookbook online (\url{http://www.cookbook-r.com/Graphs/}).
12
+#' The functions to create the similarity or Bhattacharyya matrix can be found 
13
+#' here: \code{\link{similarityMatrix}} and \code{\link{bhattacharyyaMatrix}}
14
+#' @examples
15
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
16
+#' 
17
+#' file.list <- readImmunoSeq(path = file.path)
18
+#' 
19
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
20
+#' 
21
+#' similarity.matrix <- similarityMatrix(productive.seqs = productive.aa)
22
+#' 
23
+#' pairwisePlot(matrix = similarity.matrix)
24
+#' 
25
+#' bhattacharyya.matrix <- bhattacharyyaMatrix(productive.seqs = productive.aa)
26
+#' 
27
+#' pairwisePlot(matrix = bhattacharyya.matrix)
28
+#' 
29
+#' # Change plot color, title legend, and add title
30
+#' pairwisePlot(matrix = similarity.matrix) + 
31
+#'    ggplot2::scale_fill_gradient(low = "#deebf7", high = "#3182bd") + 
32
+#'    ggplot2::labs(fill = "Similarity score") + ggplot2::ggtitle("Figure Title")
33
+#' @export
34
+#' @import ggplot2
35
+#' @importFrom reshape melt.data.frame
36
+#' @importFrom stats na.omit
37
+pairwisePlot <- function(matrix) {
38
+    i <- 1
39
+    l <- length(matrix)
40
+    p <- l - 1
41
+    for (i in 1:p) {
42
+        j <- i + 1
43
+        matrix[i, j:l] <- NA
44
+    }
45
+    matrix$Samples <- rownames(matrix)
46
+    melt <- reshape::melt.data.frame(matrix, id.vars = "Samples")
47
+    melt <- na.omit(melt)
48
+    melt$variable <- factor(melt$variable, levels = rownames(matrix))
49
+    melt$Samples <- factor(melt$Samples, levels = rev(rownames(matrix)))
50
+    plot <- ggplot(data = melt, aes_string("Samples", "variable", fill = "value")) + 
51
+        geom_tile() + 
52
+        scale_fill_gradient(low = "#fee8c8", high = "#e34a33") + 
53
+        theme_classic() + 
54
+        labs(x = "", y = "", fill = "") + 
55
+        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
56
+    return(plot)
57
+}
0 58
\ No newline at end of file
1 59
new file mode 100644
... ...
@@ -0,0 +1,68 @@
1
+#' Productive sequences
2
+#' 
3
+#' Remove unproductive CDR3 sequences from a single data frame.
4
+#' 
5
+#' @param sample A data frame consisting of antigen receptor sequencing data.  
6
+#' "aminoAcid", "count", and "frequencyCount" are required columns.
7
+#' @param aggregate Indicates whether the values of "count", "frequencyCount", 
8
+#' and "esimatedNumberGenomes" should be aggregated by amino acid or nucleotide 
9
+#' sequence.  Acceptable values are "aminoAcid" or "nucleotide".  If "aminoAcid" 
10
+#' is selected, then the resulting data frame will have columns corresponding to 
11
+#' "aminoAcid", "count", "frequnecyCount", and "estimatedNumberGenomes" (if this 
12
+#' column is available).  If "nucleotide" is selected then all columns in the 
13
+#' original data frame will be present in the outputted data frame.  The difference in 
14
+#' output is due to the fact that the same amino acid CDR3 sequence may be 
15
+#' encoded by multiple unique nucleotide sequences with differing V, D, and J 
16
+#' genes. 
17
+#' @return Returns a data frame of productive amino acid sequences with recomputed 
18
+#' values for "count", "frequencyCount", and "esimatedNumberGenomes".  A 
19
+#' productive sequences is defined as a sequence that is in frame and does not 
20
+#' have an early stop codon.
21
+#' @examples
22
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
23
+#' 
24
+#' file.list <- readImmunoSeq(path = file.path)
25
+#' 
26
+#' productive <- productive(sample = file.list[["TCRB_Day32_Unsorted"]], aggregate = "aminoAcid")
27
+#' @seealso \code{\link{productiveSeq}}
28
+#' @export
29
+#' @importFrom stats aggregate
30
+productive <- function(sample, aggregate = "aminoAcid") {
31
+    productive.seqs <- sample[which(sample[, "aminoAcid"] != "" &
32
+                                        !grepl("\\*", sample[, "aminoAcid"])), ]
33
+    if (any(grepl("estimatedNumberGenomes", colnames(productive.seqs)))) {
34
+        if (aggregate == "aminoAcid") {
35
+            count <- aggregate(count ~ aminoAcid, data = productive.seqs, FUN = sum)
36
+            productive.seqs$estimatedNumberGenomes[is.na(productive.seqs$estimatedNumberGenomes)] <- 0
37
+            estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ aminoAcid, 
38
+                                                data = productive.seqs, FUN = sum)
39
+            merged <- merge(count, estimatedNumberGenomes, by = "aminoAcid")
40
+            merged$frequencyCount <- merged$count/sum(count$count) * 100
41
+        }
42
+        if (aggregate == "nucleotide") {
43
+            count <- aggregate(count ~ nucleotide, data = productive.seqs, FUN = sum)
44
+            productive.seqs$estimatedNumberGenomes[is.na(productive.seqs$estimatedNumberGenomes)] <- 0
45
+            estimatedNumberGenomes <- aggregate(estimatedNumberGenomes ~ nucleotide, 
46
+                                                data = productive.seqs, FUN = sum)
47
+            productive.seqs$count <- NULL
48
+            productive.seqs$estimatedNumberGenomes <- NULL
49
+            merged <- Reduce(function(x, y) merge(x, y, by = "nucleotide"), 
50
+                             list(productive.seqs, count, estimatedNumberGenomes))
51
+            merged$frequencyCount <- merged$count/sum(count$count) * 100
52
+        }
53
+    } else {
54
+        if (aggregate == "aminoAcid") {
55
+            merged <- aggregate(count ~ aminoAcid, data = productive.seqs, FUN = sum)
56
+            merged$frequencyCount <- merged$count/sum(merged$count) * 100
57
+        }
58
+        if (aggregate == "nucleotide") {
59
+            count <- aggregate(count ~ nucleotide, data = productive.seqs, FUN = sum)
60
+            productive.seqs$count <- NULL
61
+            merged <- merge(productive.seqs, count, by = "nucleotide")
62
+            merged$frequencyCount <- merged$count/sum(merged$count) * 100
63
+        }
64
+    }
65
+    merged <- merged[order(merged$frequencyCount, decreasing = TRUE), intersect(names(sample), names(merged))]
66
+    rownames(merged) <- NULL
67
+    return(merged)
68
+} 
0 69
\ No newline at end of file
1 70
new file mode 100644
... ...
@@ -0,0 +1,65 @@
1
+#' Productive sequences
2
+#' 
3
+#' Remove unproductive CDR3 sequences from a list of data frames.
4
+#' 
5
+#' @param file.list A list of data frames consisting antigen receptor sequencing 
6
+#' data imported by the LymphoSeq function readImmunoSeq. "aminoAcid", "count", and 
7
+#' "frequencyCount" are required columns.
8
+#' @param aggregate Indicates whether the values of "count", "frequencyCount", 
9
+#' and "esimatedNumberGenomes" should be aggregated by amino acid or nucleotide 
10
+#' sequence.  Acceptable values are "aminoAcid" or "nucleotide".  If "aminoAcid" 
11
+#' is selected, then resulting data frame will have columns corresponding to 
12
+#' aminoAcid, count, frequencyCount, and estimatedNumberGenomes (if this 
13
+#' column is available).  If "nucleotide" is selected then all columns in the 
14
+#' original list will be present in the outputted list.  The difference in 
15
+#' output is due to the fact that the same amino acid CDR3 sequence may be 
16
+#' encoded by multiple unique nucleotide sequences with differing V, D, and J 
17
+#' genes.
18
+#' @param prevalence A Boolean value indicating if a new column should be added 
19
+#' to each of the data frames giving the prevalence of each CDR3 amino acid 
20
+#' sequence in 55 healthy donor peripheral blood samples.  TRUE means the column 
21
+#' is added and FALSE means it is not.  Values range from 0 to 100\% where 
22
+#' 100\% means the sequence appeared in the blood of all 55 individuals.  The 
23
+#' prevalenceTRB database is located in a separate package called LymphoSeqDB 
24
+#' that should be loaded automatically.
25
+#' @return Returns a list of data frames of productive amino acid sequences with
26
+#' recomputed values for "count", "frequencyCount", and 
27
+#' "esimatedNumberGenomes".  A productive sequences is defined as a sequences 
28
+#' that is in frame and does not have an early stop codon.
29
+#' @examples
30
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
31
+#' 
32
+#' file.list <- readImmunoSeq(path = file.path)
33
+#' 
34
+#' productive.nt <- productiveSeq(file.list = file.list, 
35
+#'    aggregate = "nucleotide", prevalence = FALSE)
36
+#' 
37
+#' productive.aa <- productiveSeq(file.list = file.list, 
38
+#'   aggregate = "aminoAcid", prevalence = TRUE)
39
+#' @seealso Refer to the LymphoSeqDB package for details regarding the 
40
+#' prevalenceTRB database.
41
+#' @export
42
+#' @importFrom plyr llply
43
+#' @import LymphoSeqDB
44
+productiveSeq <- function(file.list, aggregate = "aminoAcid", prevalence = FALSE) {
45
+    productive.list <- plyr::llply(file.list, function(x) 
46
+        productive(x, aggregate), .progress = "text")
47
+    if (prevalence == TRUE) {
48
+        if (aggregate != "aminoAcid") {
49
+            stop("In order to add prevalence to your list of data frames, aggregate must be equal 'aminoAcid'.", call. = FALSE)
50
+        }
51
+        cat("Adding prevalence column...")
52
+        i <- 1
53
+        productive.list.prevalence <- list()
54
+        for(i in 1:length(productive.list)) {
55
+            file <- productive.list[[i]]
56
+            file$prevalence <- LymphoSeqDB::prevalenceTRB[match(file$aminoAcid, LymphoSeqDB::prevalenceTRB$aminoAcid), "prevalence"]
57
+            file$prevalence[is.na(file$prevalence)] <- 0
58
+            productive.list.prevalence <- c(productive.list.prevalence, list(file))
59
+        }
60
+        names(productive.list.prevalence) <- names(productive.list)
61
+        return(productive.list.prevalence)
62
+    } else {
63
+        return(productive.list)
64
+    }
65
+} 
0 66
\ No newline at end of file
1 67
new file mode 100644
... ...
@@ -0,0 +1,89 @@
1
+#' Read ImmunoSeq files
2
+#' 
3
+#' Imports tab-separated value (.tsv) files exported by the Adaptive 
4
+#' Biotechnologies ImmunoSEQ analyzer and stores them as a list of data frames.
5
+#' 
6
+#' @param path Path to the directory containing tab-delimited files.  Only 
7
+#' files with the extension .tsv are imported.  The names of the data frames are 
8
+#' the same as names of the files.
9
+#' @param columns Column names from the tab-delimited files that you desire to 
10
+#' import, all others will be disregarded.  May use "all" to import all columns.  
11
+#' A warning may be called if columns contain no data or must be coereced to a 
12
+#' different class.  Usually this warning can be ignored.
13
+#' @param recursive A Boolean value indicating whether tab-delimited files in 
14
+#' subdirectories should be imported.  If TRUE, then all files in the parent as 
15
+#' well as the subdirectory are imported.  If FALSE, then only files in the 
16
+#' parent directory are imported.
17
+#' @details May import tab-delimited files containing antigen receptor 
18
+#' sequencing from other sources (e.g. miTCR or miXCR) as long as the column 
19
+#' names are the same as used by ImmunoSEQ files.  Available column headings in 
20
+#' ImmunoSEQ files are:  
21
+#' "nucleotide", "aminoAcid", "count", "count (templates)", "count (reads)", 
22
+#' "frequencyCount", "frequencyCount (\%)", "cdr3Length", "vMaxResolved", 
23
+#' "vFamilyName", "vGeneName", "vGeneAllele", "vFamilyTies", "vGeneNameTies", 
24
+#' "vGeneAlleleTies", "dMaxResolved", "dFamilyName", "dGeneName", "dGeneAllele",
25
+#' "dFamilyTies", "dGeneNameTies", "dGeneAlleleTies", "jMaxResolved", 
26
+#' "jFamilyName", "jGeneName", "jGeneAllele", "jFamilyTies", "jGeneNameTies", 
27
+#' "jGeneAlleleTies", "vDeletion", "d5Deletion", "d3Deletion", "jDeletion", 
28
+#' "n2Insertion", "n1Insertion", "vIndex", "n2Index", "dIndex", "n1Index", 
29
+#' "jIndex", "estimatedNumberGenomes", "sequenceStatus", "cloneResolved", 
30
+#' "vOrphon", "dOrphon", "jOrphon", "vFunction", "dFunction", "jFunction", 
31
+#' "fractionNucleated".  
32
+#'  
33
+#' IMPORTANT: be aware that Adaptive has changed the 
34
+#' column names of their files over time and if the headings of your files are 
35
+#' inconsistent, then specify column = "all" or include all variations of the 
36
+#' headings you want to important.  For example, column = c("count", 
37
+#' "count (templates)", "count (reads)").  Also be aware that the "count" 
38
+#' column previously reported the number of sequencing reads in earlier 
39
+#' versions of ImmunoSEQ but now is equivalent to the 
40
+#' "estimatedNumberGenomes" column.
41
+#' @return Returns a list of data frames.  The names of each data frame are
42
+#' assigned according to the original ImmunoSEQ file names.
43
+#' @examples
44
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
45
+#' 
46
+#' file.list <- readImmunoSeq(path = file.path, 
47
+#'                            columns = c("aminoAcid", "nucleotide", "count", 
48
+#'                                      "count (templates)", "count (reads)", 
49
+#'                                      "frequencyCount", "frequencyCount (%)", 
50
+#'                                      "estimatedNumberGenomes"), 
51
+#'                            recursive = FALSE)
52
+#' @export
53
+#' @importFrom data.table fread
54
+#' @importFrom plyr llply
55
+readImmunoSeq <- function(path, columns = c("aminoAcid", "nucleotide", "count", 
56
+                                            "count (templates)", "count (reads)", 
57
+                                            "frequencyCount", "frequencyCount (%)", 
58
+                                            "estimatedNumberGenomes", "vFamilyName", 
59
+                                            "dFamilyName", "jFamilyName","vGeneName", 
60
+                                            "dGeneName", "jGeneName"), 
61
+                          recursive = FALSE) {
62
+    file.paths <- list.files(path, full.names = TRUE, all.files = FALSE, 
63
+                             recursive = recursive, pattern = ".tsv", 
64
+                             include.dirs = FALSE)
65
+    if (any(columns == "all")) {
66
+        file.list <- plyr::llply(file.paths, data.table::fread, 
67
+                                 data.table = FALSE, .progress = "text")
68
+    } else {
69
+        file.list <- plyr::llply(file.paths, data.table::fread, select = columns, 
70
+                                 data.table = FALSE, .progress = "text")
71
+        if(length(unique(plyr::llply(file.list, ncol))) > 1){
72
+            warning("One or more of the files you are trying to import do not contain all the columns you specified.", call. = FALSE)
73
+        }
74
+    }
75
+    file.names <- gsub(".tsv", "", basename(file.paths))
76
+    names(file.list) <- file.names
77
+    i <- 1
78
+    for (i in 1:length(file.list)) {
79
+        colnames(file.list[[i]]) <- ifelse(grepl("frequencyCount*", 
80
+                                                 colnames(file.list[[i]])), 
81
+                                           "frequencyCount", 
82
+                                           colnames(file.list[[i]]))
83
+        colnames(file.list[[i]]) <- ifelse(grepl("count*", 
84
+                                                 colnames(file.list[[i]])), 
85
+                                           "count", 
86
+                                           colnames(file.list[[i]]))
87
+    }
88
+    return(file.list)
89
+} 
0 90
\ No newline at end of file
1 91
new file mode 100644
... ...
@@ -0,0 +1,37 @@
1
+#' Remove sequence
2
+#' 
3
+#' Removes an amino acid sequence and associated data from all instances within 
4
+#' a list of data frames and then recomputes the frequencyCount.
5
+#' 
6
+#' @param file.list A list of data frames imported using the LymphoSeq function 
7
+#' readImmunoSeq.  "aminoAcid", "count", and "frequencyCount" are required columns.
8
+#' @param sequence A character vector of one or more amino acid sequences to 
9
+#' remove from the list of data frames.
10
+#' @return Returns a list of data frames like the one imported except all rows 
11
+#' with the specified amino acid sequence are removed.  The frequencyCount is 
12
+#' recalculated.
13
+#' @examples
14
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
15
+#' 
16
+#' file.list <- readImmunoSeq(path = file.path)
17
+#' 
18
+#' searchSeq(list = file.list, sequence = "CASSDLIGNGKLFF")
19
+#' 
20
+#' cleansed <- removeSeq(file.list = file.list, sequence = "CASSDLIGNGKLFF")
21
+#' 
22
+#' searchSeq(list = cleansed, sequence = "CASSDLIGNGKLFF")
23
+#' @export
24
+#' @importFrom plyr llply
25
+removeSeq <- function(file.list, sequence) {
26
+    cleansed <- plyr::llply(file.list, function(x) 
27
+        x[which(!(x[, "aminoAcid"] %in% sequence)), ])
28
+    recompute <- list()
29
+    i <- 1
30
+    for (i in 1:length(cleansed)) {
31
+        file <- cleansed[[i]]
32
+        file$frequencyCount <- file$count/sum(file$count) * 100
33
+        recompute <- c(recompute, list(file))
34
+    }
35
+    names(recompute) <- names(cleansed)
36
+    return(recompute)
37
+}
0 38
\ No newline at end of file
1 39
new file mode 100644
... ...
@@ -0,0 +1,50 @@
1
+#' Search for T cell receptor beta CDR3 amino acid sequences with known antigen 
2
+#' specificity
3
+#' 
4
+#' Search for published T cell receptor beta CDR3 amino acid sequences with 
5
+#' known antigen specificity in a list of data frames.
6
+#' 
7
+#' @param list A list of data frames generated by the LymphoSeq functions 
8
+#' readImmunoSeq or productiveSeq.  "aminoAcid", "frequencyCount", and "count" 
9
+#' are required columns.
10
+#' @return Returns a data frame of each sample name and instance in the sample 
11
+#' that the published TCR sequence appeared along with additional 
12
+#' information including antigen specificity, epitope, HLA type, and PubMed ID 
13
+#' (PMID) for the reference where the sequence was characterized.  The 
14
+#' publishedTRB database is located in a separate package called LymphoSeqDB 
15
+#' that should be loaded automatically.
16
+#' @examples
17
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
18
+#' 
19
+#' file.list <- readImmunoSeq(path = file.path)
20
+#' 
21
+#' productive.aa <- productiveSeq(file.list = file.list, aggregate = "aminoAcid")
22
+#' 
23
+#' searchPublished(list = productive.aa)
24
+#' @seealso Refer to the LymphoSeqDB package for details regarding the 
25
+#' publishedTRB database.
26
+#' @export
27
+#' @importFrom plyr llply
28
+#' @import LymphoSeqDB
29
+searchPublished <- function(list) {
30
+    search <- plyr::llply(list, function(x) 
31
+        x[which(x[, "aminoAcid"] %in% LymphoSeqDB::publishedTRB$aminoAcid), ])
32
+    found <- NULL
33
+    l <- 1
34
+    for (l in 1:length(search)) {
35
+        if (nrow(search[[l]]) != 0) {
36
+            search[[l]] <- cbind(rep(names(search)[l], 
37
+                                     nrow(search[[l]])), search[[l]])
38
+            colnames(search[[l]])[1] <- "sample"
39
+            found <- rbind(found, search[[l]])
40
+        }
41
+    }
42
+    if (is.null(found)) {
43
+        message("No sequences found.")
44
+    } else {
45
+        found$prevalence = NULL
46
+        found <- merge(found, LymphoSeqDB::publishedTRB, by = "aminoAcid")
47
+        found <- found[c("sample", setdiff(names(found), "sample"))]
48
+        return(found)
49
+    }
50
+}
0 51
\ No newline at end of file
1 52
new file mode 100644
... ...
@@ -0,0 +1,125 @@
1
+#' Search for a sequence
2
+#' 
3
+#' Search for one or more amino acid or nucleotide CDR3 sequences in a list of 
4
+#' data frames.
5
+#' 
6
+#' @param list A list of data frames generated by the LymphoSeq functions readImmunoSeq 
7
+#' or productiveSeq.  "aminoAcid" or "nucleotide", "frequencyCount", and 
8
+#' "count" are required columns.
9
+#' @param sequence A character vector of one ore more amino acid or nucleotide 
10
+#' CDR3 sequences to search.
11
+#' @param type A character vector specifying the type of sequence(s) to be 
12
+#' searched.  Available options are "aminoAcid" or "nucleotide". 
13
+#' @param match A character vector specifying whether an exact partial or exact global
14
+#'  match of the searched sequence(s) is desired.  Available options are 
15
+#' "partial" and "global".
16
+#' @param editDistance An integer giving the minimum edit distance that the 
17
+#' sequence must be less than or equal to.  See details below.
18
+#' @details An exact partial match means the searched sequence is contained within 
19
+#' target sequence.  An exact global match means the searched sequence is identical to 
20
+#' the target sequence.
21
+#' 
22
+#' Edit distance is a way of quantifying how dissimilar two sequences 
23
+#' are to one another by counting the minimum number of operations required to 
24
+#' transform one sequence into the other.  For example, an edit distance of 0 
25
+#' means the sequences are identical and an edit distance of 1 indicates that 
26
+#' the sequences different by a single amino acid or nucleotide.
27
+#' @return Returns the rows for every instance in the list of data frames where 
28
+#' the searched sequence(s) appeared.
29
+#' @examples
30
+#' file.path <- system.file("extdata", "TCRB_sequencing", package = "LymphoSeq")
31
+#' 
32
+#' file.list <- readImmunoSeq(path = file.path)
33
+#' 
34
+#' aa1 <- "CASSPVSNEQFF"
35
+#' 
36
+#' aa2 <- "CASSQEVPPYQAFF"
37
+#' 
38
+#' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", 
39
+#'    match = "global", editDistance = 0)
40
+#' 
41
+#' searchSeq(list = file.list, sequence = c(aa1, aa2), 
42
+#'    type = "aminoAcid", match = "global", editDistance = 0)
43
+#' 
44
+#' searchSeq(list = file.list, sequence = aa1, type = "aminoAcid", editDistance = 1)
45
+#' 
46
+#' nt <- "CTGATTCTGGAGTCCGCCAGCACCAACCAGACATCTATGTACCTCTGTGCCAGCAGTCCGGTAAGCAATGAGCAGTTCTTCGGGCCA"
47
+#' 
48
+#' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 3)
49
+#' 
50
+#' searchSeq(list = file.list, sequence = "CASSPVS", type = "aminoAcid", 
51
+#'    match = "partial", editDistance = 0)
52
+#' 
53
+#' searchSeq(list = file.list, sequence = nt, type = "nucleotide", editDistance = 0)
54
+#' @export
55
+#' @importFrom plyr llply
56
+#' @importFrom utils adist
57
+searchSeq <- function(list, sequence, type = "aminoAcid", match = "global", editDistance = 0) {
58
+    if (editDistance >= 1) {
59
+        merged.results <- data.frame()
60
+        i <- 1
61
+        for (i in 1:length(list)) {
62
+            file <- list[[i]]
63
+            ed <- adist(sequence, file[, type])
64
+            rownames(ed) <- sequence
65
+            colnames(ed) <- file[, type]
66
+            ed.index <- which(ed <= editDistance, arr.ind = TRUE)
67
+            if (nrow(ed.index) >= 1) {
68
+                ed.subset <- as.integer()
69
+                j <- 1
70
+                for (j in 1:nrow(ed.index)) {
71
+                    ed.j <- ed[ed.index[j, "row"], ed.index[j, "col"]]
72
+                    ed.subset <- c(ed.subset, ed.j)
73
+                }
74
+                results <- data.frame(searchSequnece = rownames(ed)[ed.index[, 1]],