Browse code

Doc/data/NAMESPACE updates

Andrew McDavid authored on 11/06/2019 18:49:27
Showing 18 changed files

... ...
@@ -3,3 +3,5 @@
3 3
 extdata/.*\.json$
4 4
 ^data-raw$
5 5
 extdata/refdata-cellranger-vdj-GRCh38-alts-ensembl-2.0.0/
6
+^doc$
7
+^Meta$
... ...
@@ -6,3 +6,5 @@
6 6
 data-raw/*.csv
7 7
 data-raw/*.json
8 8
 inst/doc
9
+doc
10
+Meta
... ...
@@ -1,7 +1,7 @@
1 1
 Package: CellaRepertorium
2 2
 Type: Package
3 3
 Title: Methods for clustering and analyzing high-throughput single cell immune cell repertoires (RepSeq)
4
-Version: 0.2.3
4
+Version: 0.2.4
5 5
 Author: Andrew McDavid
6 6
 Maintainer: Andrew McDavid <Andrew_McDavid@urmc.rochester.edu>
7 7
 Description: Methods to cluster and analyze high-throughput single cell immune cell repertoires,
... ...
@@ -23,7 +23,8 @@ Imports:
23 23
    rlang,
24 24
    purrr,
25 25
    Matrix,
26
-   S4Vectors
26
+   S4Vectors,
27
+   tidyr
27 28
 Suggests: 
28 29
     testthat,
29 30
     readr,
... ...
@@ -31,7 +32,7 @@ Suggests:
31 32
     rmarkdown,
32 33
     ggplot2,
33 34
     BiocStyle,
34
-    tidyr
35
+    ggdendro
35 36
 RoxygenNote: 6.1.1
36 37
 LinkingTo: Rcpp
37 38
 NeedsCompilation: yes
... ...
@@ -8,24 +8,34 @@ export(cdhit)
8 8
 export(cdhit_ccdb)
9 9
 export(cluster_permute_test)
10 10
 export(entropy)
11
+export(enumerate_pairing)
11 12
 export(fancy_name_contigs)
12
-export(fine_cluster)
13
+export(fine_cluster_seqs)
14
+export(fine_clustering)
13 15
 export(get_canonical_representative)
16
+export(ig_chain_recode)
14 17
 export(modal_category)
15 18
 export(np)
16 19
 export(pairing_tables)
20
+export(tcr_chain_recode)
21
+exportMethods("cluster_tbls<-")
22
+exportMethods(cluster_tbls)
17 23
 import(Biostrings)
18 24
 importFrom(S4Vectors,List)
19 25
 importFrom(S4Vectors,SimpleList)
20 26
 importFrom(dplyr,"%>%")
21 27
 importFrom(dplyr,anti_join)
22 28
 importFrom(dplyr,bind_rows)
29
+importFrom(dplyr,case_when)
23 30
 importFrom(dplyr,group_by)
24 31
 importFrom(dplyr,left_join)
25 32
 importFrom(dplyr,mutate)
26 33
 importFrom(dplyr,summarize)
27 34
 importFrom(dplyr,ungroup)
35
+importFrom(methods,"slot<-")
28 36
 importFrom(methods,as)
37
+importFrom(methods,new)
38
+importFrom(methods,slot)
29 39
 importFrom(rlang,":=")
30 40
 importFrom(rlang,sym)
31 41
 importFrom(rlang,syms)
... ...
@@ -40,5 +50,6 @@ importFrom(stringr,str_replace_all)
40 50
 importFrom(tibble,as_data_frame)
41 51
 importFrom(tibble,as_tibble)
42 52
 importFrom(tibble,data_frame)
53
+importFrom(tibble,tibble)
43 54
 importFrom(utils,data)
44 55
 useDynLib(CellaRepertorium)
... ...
@@ -1,7 +1,12 @@
1
+setGeneric('cluster_tbls', function(x, ...) standardGeneric('cluster_tbls'))
2
+setGeneric('cluster_tbls<-', function(x, ..., value) standardGeneric('cluster_tbls<-'),  signature=c("x", "value"))
3
+
4
+
1 5
 get_cluster_tbls = function(x, index){
2 6
     if(missing(index)) x@cluster_tbls else x@cluster_tbls[[index]]
3 7
 }
4
-setGeneric('cluster_tbls', function(x, ...) standardGeneric('cluster_tbls'))
8
+
9
+#' @export
5 10
 setMethod('cluster_tbls', signature = c(x = 'ContigCellDB'), get_cluster_tbls)
6 11
 
7 12
 set_cluster_tbls = function(x, index, value){
... ...
@@ -9,6 +14,6 @@ set_cluster_tbls = function(x, index, value){
9 14
     x
10 15
 }
11 16
 
12
-setGeneric('cluster_tbls<-', function(x, ..., value) standardGeneric('cluster_tbls<-'),  signature=c("x", "value"))
13 17
 
18
+#' @export
14 19
 setReplaceMethod('cluster_tbls', signature = c(x = 'ContigCellDB'), set_cluster_tbls)
... ...
@@ -46,26 +46,24 @@ cluster_germline = function(ccdb, segment_keys = c('v_gene', 'j_gene', 'chain'),
46 46
 
47 47
 #' Perform additional clustering of sequences within groups
48 48
 #'
49
-#' @param clustering `Clustering`
50
-#' @param sequence_key 
49
+#' @param clustering `Clustering` object
50
+#' @param sequence_key `character` naming column in `contig_tbl` with sequence
51 51
 #' @param type 'AA' or 'DNA'
52
-#' @param max_affinity 
53
-#' @param keep_clustering_details 
54
-#' @param ... passed to `clustering``
52
+#' @param max_affinity `numeric` naming the maximal affinity for the sparse affinity matrix that is constructed.  Not currently used.
53
+#' @param keep_clustering_details `logical` -- should output of `fine_cluster_seqs` be kept as a list column
54
+#' @param ... passed to `fine_cluster_seqs`
55 55
 #'
56
-#' @return
56
+#' @return `Clustering` object with updated `contig_tbl` and `cluster_tbl`
57 57
 #' @export
58
-#'
59
-#' @examples
60 58
 fine_clustering = function(clustering, sequence_key = 'seq', type = clustering$type, max_affinity = NULL, keep_clustering_details = FALSE, ...){
61 59
     cctb = clustering$contig_tbl
62 60
     message('Calculating intradistances on ', nrow(clustering$cluster_tbl), ' clusters.')
63
-    # run `fine_cluster` within each cluster_pk
64
-    cluster_tbl = cctb %>% group_by(!!!syms(clustering$cluster_pk)) %>% summarize(fc = list(fine_cluster(!!sym(sequence_key), type = type, ...)), n_cluster = n())
61
+    # run `fine_cluster_seqs` within each cluster_pk
62
+    cluster_tbl = cctb %>% group_by(!!!syms(clustering$cluster_pk)) %>% summarize(fc = list(fine_cluster_seqs(!!sym(sequence_key), type = type, ...)), n_cluster = n())
65 63
     message('Summarizing')
66
-    contig_by_cluster = cctb[union(clustering$contig_pk, clustering$cluster_pk)] %>% nest(!!!syms(clustering$contig_pk)) %>% 
64
+    contig_by_cluster = cctb[union(clustering$contig_pk, clustering$cluster_pk)] %>% nest(!!!syms(clustering$contig_pk)) %>%
67 65
         right_join(cluster_tbl %>% select(!!!syms(clustering$cluster_pk)), by=clustering$cluster_pk) # need to make sure these are in the same order!
68
-    
66
+
69 67
     if(is.null(max_affinity)){
70 68
         max_max = max(purrr::map_dbl(cluster_tbl$fc, 'max_dist'))
71 69
     } else {
... ...
@@ -100,15 +98,15 @@ fine_clustering = function(clustering, sequence_key = 'seq', type = clustering$t
100 98
 #' @param cluster_method character passed to `hclust`
101 99
 #'
102 100
 #' @seealso hclust, stringDist
103
-#' @return dendrogram of class `hclust`
101
+#' @return `list` containing
104 102
 #' @export
105 103
 #' @import Biostrings
106 104
 #' @examples
107 105
 #' fasta_path = system.file('extdata', 'demo.fasta', package='CellaRepertorium')
108 106
 #' aaseq = Biostrings::readAAStringSet(fasta_path)[1:100]
109
-#' cls = fine_cluster(aaseq)
107
+#' cls = fine_cluster_seqs(aaseq)
110 108
 #' plot(cls$cluster)
111
-fine_cluster = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'levenshtein', substitution_matrix = 'BLOSUM100', cluster_fun = 'hclust', cluster_method = 'complete'){
109
+fine_cluster_seqs = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'levenshtein', substitution_matrix = 'BLOSUM100', cluster_fun = 'hclust', cluster_method = 'complete'){
112 110
     if(length(seqs) > 4000 & !big_memory_brute) stop("Won't try to cluster ", length(seqs), " sequences unless `big_memory_brute` = TRUE.  (Make sure you actually have lots of memory!)")
113 111
     type = match.arg(type, choices = c('AA', 'DNA'))
114 112
     cluster_fun = match.arg(cluster_fun, c('hclust', 'none'))
... ...
@@ -1,9 +1,17 @@
1 1
 #' Filtered and annotated contigs of TCR from mice
2 2
 #'
3 3
 #' The details of how these are generated are shown in the vignette mouse_tcell_qc
4
-#' and are serialied to serve as an examples for other vignettes and documentation.
4
+#' and are serialized to serve as an examples for other vignettes and documentation.
5 5
 #' @format A data frame of 3399 contigs and 22 fields,
6 6
 #'  all except 4 are originally defined in https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/output/annotation#contig
7 7
 #'  The following fields were defined ex post facto. anno_file: Path to original csv file, pop: Mouse strain. sample: An artificial "replicate" from the original data defined by subsampling with replacement, celltype:The putative cell type of the contig.
8 8
 #'
9 9
 "contigs_qc"
10
+
11
+#' A preconstructed `ContigClusterDB` from the `contigs_qc` data
12
+#'
13
+#' Used in various examples.
14
+#' @format
15
+#' \code{ccdb_ex = ContigCellDB_10XVDJ(contigs_qc, contig_pk = c('pop',   'sample', 'barcode', 'contig_id'), cell_pk = c('pop',   'sample', 'barcode'))}
16
+#'
17
+"ccdb_ex"
... ...
@@ -207,7 +207,10 @@ plot_pairing = function(pairing_list, color_labels_by){
207 207
 
208 208
 }
209 209
 
210
-
210
+#' @export
211
+#' @describeIn enumerate_pairing Recode a table with IG chains
212
+#' @importFrom dplyr case_when
213
+#' @importFrom tibble tibble
211 214
 ig_chain_recode = function(tbl){
212 215
     pairing = case_when(tbl$IGH>0 & (tbl$IGK>0 | tbl$IGL>0) ~ 'paired',
213 216
                         tbl$IGH>0 ~ 'heavy',
... ...
@@ -220,13 +223,16 @@ ig_chain_recode = function(tbl){
220 223
     dplyr::bind_cols(tbl, tibble(pairing, canonical))
221 224
 }
222 225
 
226
+#' @export
227
+#' @describeIn enumerate_pairing Recode a table with TCR chains
228
+#' @param tbl output from enumerate_pairing containing TRA/TRB or IGH/IHK/IHL columns
223 229
 tcr_chain_recode = function(tbl){
224 230
     pairing = case_when(tbl$TRA>0 & tbl$TRB>0 ~ 'paired',
225 231
                         tbl$TRB>0 ~ 'beta',
226 232
                         tbl$TRA>0 ~ 'alpha')
227 233
     canonical = case_when(tbl$TRB==2 ~ 'double-beta',
228 234
                           tbl$TRA==2  ~ 'double-alpha',
229
-                          (tbl$TRB + tbl$TRA) > 1 ~ 'other',
235
+                          (tbl$TRB + tbl$TRA) > 2 ~ 'other',
230 236
                           TRUE ~ 'classical')
231 237
     dplyr::bind_cols(tbl, tibble(pairing, canonical))
232 238
 }
... ...
@@ -238,7 +244,7 @@ tcr_chain_recode = function(tbl){
238 244
 #' Return a tibble, keyed by cells that includes the counts of the chains, the `raw_chain_type` and any additional output from running `chain_recode_fun`.
239 245
 #' @param ccdb `ContigCellDB`
240 246
 #' @param chain_key `character` naming the field in the `contig_tbl` identifying chain
241
-#' @param chain_recode_fun a function that operates on the output of this function that further reduces the chain combinations to some other summary.  Set to 'guess' to apply functions that may work for 10X data or `NULL` to skip.  See `CellaRepertorium:::tcr_chain_recode` for an example.
247
+#' @param chain_recode_fun a function that operates on the output of this function that further reduces the chain combinations to some other summary.  Set to 'guess' to apply functions that may work for 10X data or `NULL` to skip.  See `CellaRepertorium::tcr_chain_recode` for an example.
242 248
 #'
243 249
 #' @return a `tibble` keyed by cells.
244 250
 #' @export
... ...
@@ -261,7 +267,7 @@ enumerate_pairing = function(ccdb, chain_key = 'chain', chain_recode_fun = NULL)
261 267
     if(!is.function(chain_recode_fun)) stop("`chain_recode_fun` must be a function, NULL, or 'guess'")
262 268
 
263 269
     chain_keys = union(chain_key, ccdb$cell_pk)
264
-    chain_count = ccdb$contig_tbl %>% group_by(!!!syms(chain_keys)) %>% summarize(n_chains = n()) %>% spread(chain_key, 'n_chains', fill = 0)
270
+    chain_count = ccdb$contig_tbl %>% group_by(!!!syms(chain_keys)) %>% summarize(n_chains = n()) %>% tidyr::spread(chain_key, 'n_chains', fill = 0)
265 271
     chain_type = ccdb$contig_tbl %>% group_by(!!!syms(ccdb$cell_pk)) %>% summarize(raw_chain_type = paste(sort(!!sym(chain_key)), collapse = '_'))
266 272
     chain_summary = left_join(chain_type, chain_count, by = ccdb$cell_pk) %>% ungroup()
267 273
     chain_recode_fun(chain_summary)
... ...
@@ -69,9 +69,9 @@ jsn_ss %>% rowwise() %>% do({
69 69
     tibble()
70 70
 })
71 71
 
72
-knitr::purl('vignettes/mouse_tcell_qc.Rmd', output = 'data-raw/mouse_tcells.R')
73
-source('data-raw/mouse_tcell_qc.R')
72
+knitr::purl('vignettes/mouse_tcell_qc.Rmd', output = 'data-raw/mouse_tcells_qc.R')
73
+source('data-raw/mouse_tcells_qc.R')
74 74
 ccdb_ex = ContigCellDB_10XVDJ(contigs_qc, contig_pk = c('pop',   'sample', 'barcode', 'contig_id'), cell_pk = c('pop',   'sample', 'barcode'))
75
-use_data(contigs_qc)
76
-use_data(ccdb_ex)
75
+use_data(contigs_qc, overwrite = TRUE)
76
+use_data(ccdb_ex, overwrite = TRUE)
77 77
 unlink('data-raw/mouse_tcell_qc.R')
78 78
new file mode 100644
79 79
Binary files /dev/null and b/data/ccdb_ex.rda differ
80 80
Binary files a/data/contigs_qc.rda and b/data/contigs_qc.rda differ
81 81
new file mode 100644
... ...
@@ -0,0 +1,14 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/data.R
3
+\docType{data}
4
+\name{ccdb_ex}
5
+\alias{ccdb_ex}
6
+\title{A preconstructed `ContigClusterDB` from the `contigs_qc` data}
7
+\format{\code{ccdb_ex = ContigCellDB_10XVDJ(contigs_qc, contig_pk = c('pop',   'sample', 'barcode', 'contig_id'), cell_pk = c('pop',   'sample', 'barcode'))}}
8
+\usage{
9
+ccdb_ex
10
+}
11
+\description{
12
+Used in various examples.
13
+}
14
+\keyword{datasets}
... ...
@@ -9,7 +9,7 @@ cdhit(seqs, identity = NULL, kmerSize = NULL, min_length = 6,
9 9
   s = 1, name = "CD-Hit", only_index = FALSE,
10 10
   showProgress = interactive(), ...)
11 11
 
12
-cdhit_ccdb(object, sequence_col, type = c("DNA", "AA"),
12
+cdhit_ccdb(object, sequence_key, type = c("DNA", "AA"),
13 13
   cluster_tbl_name = length(cluster_tbls(object)) + 1, ...)
14 14
 }
15 15
 \arguments{
... ...
@@ -32,7 +32,9 @@ You may need to lower it below 5 for AAseq with identity less than .7.}
32 32
 
33 33
 \item{...}{other arguments that can be passed to cdhit, see https://github.com/weizhongli/cdhit/wiki/3.-User's-Guide#CDHIT for details.  These will override any default values.}
34 34
 
35
-\item{sequence_col}{`character` naming the column in the `contig_tbl` containing the sequence to be clustered}
35
+\item{object}{An object of class `ClusterContigDB`}
36
+
37
+\item{sequence_key}{`character` naming the column in the `contig_tbl` containing the sequence to be clustered}
36 38
 
37 39
 \item{type}{one of 'DNA' or 'AA'}
38 40
 
... ...
@@ -12,6 +12,6 @@ contigs_qc
12 12
 }
13 13
 \description{
14 14
 The details of how these are generated are shown in the vignette mouse_tcell_qc
15
-and are serialied to serve as an examples for other vignettes and documentation.
15
+and are serialized to serve as an examples for other vignettes and documentation.
16 16
 }
17 17
 \keyword{datasets}
... ...
@@ -1,5 +1,5 @@
1 1
 % Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/fine-cluster.R
2
+% Please edit documentation in R/clustering-methods.R
3 3
 \name{entropy}
4 4
 \alias{entropy}
5 5
 \alias{np}
6 6
new file mode 100644
... ...
@@ -0,0 +1,43 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/pairing-methods.R
3
+\name{ig_chain_recode}
4
+\alias{ig_chain_recode}
5
+\alias{tcr_chain_recode}
6
+\alias{enumerate_pairing}
7
+\title{Categorize the pairing present in a cell}
8
+\usage{
9
+ig_chain_recode(tbl)
10
+
11
+tcr_chain_recode(tbl)
12
+
13
+enumerate_pairing(ccdb, chain_key = "chain", chain_recode_fun = NULL)
14
+}
15
+\arguments{
16
+\item{tbl}{output from enumerate_pairing containing TRA/TRB or IGH/IHK/IHL columns}
17
+
18
+\item{ccdb}{`ContigCellDB`}
19
+
20
+\item{chain_key}{`character` naming the field in the `contig_tbl` identifying chain}
21
+
22
+\item{chain_recode_fun}{a function that operates on the output of this function that further reduces the chain combinations to some other summary.  Set to 'guess' to apply functions that may work for 10X data or `NULL` to skip.  See `CellaRepertorium::tcr_chain_recode` for an example.}
23
+}
24
+\value{
25
+a `tibble` keyed by cells.
26
+}
27
+\description{
28
+For each cell (defined by `ccdb$cell_pk`) count the number of each level of `chain_key` occurs, and cross tabulate.
29
+Also for each cell, paste together all values `chain_key`.
30
+Return a tibble, keyed by cells that includes the counts of the chains, the `raw_chain_type` and any additional output from running `chain_recode_fun`.
31
+}
32
+\section{Functions}{
33
+\itemize{
34
+\item \code{ig_chain_recode}: Recode a table with IG chains
35
+
36
+\item \code{tcr_chain_recode}: Recode a table with TCR chains
37
+}}
38
+
39
+\examples{
40
+data(ccdb_ex)
41
+enumerate_pairing(ccdb_ex)
42
+enumerate_pairing(ccdb_ex, chain_recode_fun = 'guess')
43
+}
0 44
similarity index 86%
1 45
rename from man/fine_cluster.Rd
2 46
rename to man/fine_cluster_seqs.Rd
... ...
@@ -1,10 +1,10 @@
1 1
 % Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/fine-cluster.R
3
-\name{fine_cluster}
4
-\alias{fine_cluster}
2
+% Please edit documentation in R/clustering-methods.R
3
+\name{fine_cluster_seqs}
4
+\alias{fine_cluster_seqs}
5 5
 \title{Calculate distances and perform hierarchical clustering on a set of sequences}
6 6
 \usage{
7
-fine_cluster(seqs, type = "AA", big_memory_brute = FALSE,
7
+fine_cluster_seqs(seqs, type = "AA", big_memory_brute = FALSE,
8 8
   method = "levenshtein", substitution_matrix = "BLOSUM100",
9 9
   cluster_fun = "hclust", cluster_method = "complete")
10 10
 }
... ...
@@ -24,7 +24,7 @@ fine_cluster(seqs, type = "AA", big_memory_brute = FALSE,
24 24
 \item{cluster_method}{character passed to `hclust`}
25 25
 }
26 26
 \value{
27
-dendrogram of class `hclust`
27
+`list` containing
28 28
 }
29 29
 \description{
30 30
 The distances between AA sequences is defined to be 1-score/max(score) times the median length of the input sequences.
... ...
@@ -33,7 +33,7 @@ The distances between nucleotide sequences is defined to be edit_distance/max(ed
33 33
 \examples{
34 34
 fasta_path = system.file('extdata', 'demo.fasta', package='CellaRepertorium')
35 35
 aaseq = Biostrings::readAAStringSet(fasta_path)[1:100]
36
-cls = fine_cluster(aaseq)
36
+cls = fine_cluster_seqs(aaseq)
37 37
 plot(cls$cluster)
38 38
 }
39 39
 \seealso{
40 40
new file mode 100644
... ...
@@ -0,0 +1,29 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/clustering-methods.R
3
+\name{fine_clustering}
4
+\alias{fine_clustering}
5
+\title{Perform additional clustering of sequences within groups}
6
+\usage{
7
+fine_clustering(clustering, sequence_key = "seq",
8
+  type = clustering$type, max_affinity = NULL,
9
+  keep_clustering_details = FALSE, ...)
10
+}
11
+\arguments{
12
+\item{clustering}{`Clustering` object}
13
+
14
+\item{sequence_key}{`character` naming column in `contig_tbl` with sequence}
15
+
16
+\item{type}{'AA' or 'DNA'}
17
+
18
+\item{max_affinity}{`numeric` naming the maximal affinity for the sparse affinity matrix that is constructed.  Not currently used.}
19
+
20
+\item{keep_clustering_details}{`logical` -- should output of `fine_cluster_seqs` be kept as a list column}
21
+
22
+\item{...}{passed to `fine_cluster_seqs`}
23
+}
24
+\value{
25
+`Clustering` object with updated `contig_tbl` and `cluster_tbl`
26
+}
27
+\description{
28
+Perform additional clustering of sequences within groups
29
+}