... | ... |
@@ -83,15 +83,16 @@ cdhit = function(seqs, identity = NULL, kmerSize = NULL, min_length = 6, s = 1, |
83 | 83 |
options = c(uopts, options) |
84 | 84 |
options = options[!duplicated(names(options))] |
85 | 85 |
options = lapply(options, as.character) |
86 |
- out = switch( |
|
87 |
- class(seqs), |
|
88 |
- AAStringSet = cdhitC(options, name, showProgress) + 1, |
|
89 |
- DNAStringSet = cdhitestC(options, name, showProgress) + 1, |
|
90 |
- stop('seqs must be either AAStringSet or DNAStringSet') |
|
91 |
- ) |
|
92 |
- if(only_index) return(out) |
|
93 |
- tibble::tibble(query_name = names(seqs), seq = as.character(seqs), cluster_idx = out) %>% |
|
94 |
- dplyr::group_by(cluster_idx) %>% dplyr::mutate(n_cluster = dplyr::n()) |
|
86 |
+ if(type == 'cdhitC'){ |
|
87 |
+ seq_cluster_index = cdhitC(options, name, showProgress) + 1 |
|
88 |
+ } else{ |
|
89 |
+ seq_cluster_index = cdhitestC(options, name, showProgress) + 1 |
|
90 |
+ } |
|
91 |
+ if(only_index) return(seq_cluster_index) |
|
92 |
+ tibble::tibble(query_name = names(seqs), seq = as.character(seqs), |
|
93 |
+ cluster_idx = seq_cluster_index) %>% |
|
94 |
+ dplyr::group_by(cluster_idx) %>% |
|
95 |
+ dplyr::mutate(n_cluster = dplyr::n()) %>% ungroup() |
|
95 | 96 |
} |
96 | 97 |
|
97 | 98 |
|
... | ... |
@@ -100,7 +101,7 @@ cdhit = function(seqs, identity = NULL, kmerSize = NULL, min_length = 6, s = 1, |
100 | 101 |
##' @param ccdb An object of class [ContigCellDB()] |
101 | 102 |
##' @param sequence_key `character` naming the column in the `contig_tbl` containing the sequence to be clustered |
102 | 103 |
##' @param type one of 'DNA' or 'AA' |
103 |
-##' @param cluster_name `character` specifying key, and name for the clustering. |
|
104 |
+##' @param cluster_pk `character` specifying key, and name for the clustering. |
|
104 | 105 |
##' @return [ContigCellDB()] |
105 | 106 |
##' @inheritDotParams cdhit -seqs -only_index |
106 | 107 |
##' @export |
... | ... |
@@ -113,7 +114,7 @@ cdhit = function(seqs, identity = NULL, kmerSize = NULL, min_length = 6, s = 1, |
113 | 114 |
##' res$contig_tbl |
114 | 115 |
##' res$cluster_pk |
115 | 116 |
cdhit_ccdb = function(ccdb, sequence_key, type = c('DNA', 'AA'), |
116 |
- cluster_name = 'cluster_idx', ...){ |
|
117 |
+ cluster_pk = 'cluster_idx', ...){ |
|
117 | 118 |
seqs = ccdb$contig_tbl[[sequence_key]] |
118 | 119 |
if(length(seqs) < 1) stop("No sequences were provided") |
119 | 120 |
type = match.arg(type, c('DNA', 'AA')) |
... | ... |
@@ -123,7 +124,8 @@ cdhit_ccdb = function(ccdb, sequence_key, type = c('DNA', 'AA'), |
123 | 124 |
seqset = AAStringSet(seqs) |
124 | 125 |
} |
125 | 126 |
cluster_idx = cdhit(seqset, ..., only_index = TRUE) |
126 |
- contig_tbl = dplyr::mutate(ccdb$contig_tbl, !!sym(cluster_name) := cluster_idx) |
|
127 |
- cluster_tbl = as_tibble(unique(contig_tbl[cluster_name])) |
|
128 |
- replace_cluster_tbl(ccdb, cluster_tbl, contig_tbl, cluster_pk = cluster_name) |
|
127 |
+ contig_tbl = ccdb$contig_tbl |
|
128 |
+ contig_tbl[[cluster_pk]] = cluster_idx |
|
129 |
+ cluster_tbl = as_tibble(unique(contig_tbl[cluster_pk])) |
|
130 |
+ replace_cluster_tbl(ccdb, cluster_tbl, contig_tbl, cluster_pk = cluster_pk) |
|
129 | 131 |
} |
... | ... |
@@ -4,7 +4,7 @@ globalVariables('ngrp') |
4 | 4 |
#' |
5 | 5 |
#' @param ccdb [ContigCellDB()] |
6 | 6 |
#' @param segment_keys fields in `contig_tbl` that identify a cluster |
7 |
-#' @param cluster_name name of cluster to be added to `cluster_tbl` |
|
7 |
+#' @param cluster_pk name of cluster to be added to `cluster_tbl` |
|
8 | 8 |
#' |
9 | 9 |
#' @return [ContigCellDB()] |
10 | 10 |
#' @export |
... | ... |
@@ -13,13 +13,13 @@ globalVariables('ngrp') |
13 | 13 |
#' data(ccdb_ex) |
14 | 14 |
#' ccdb_ex = cluster_germline(ccdb_ex) |
15 | 15 |
#' ccdb_ex$cluster_tbl |
16 |
-cluster_germline = function(ccdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_name = 'cluster_idx'){ |
|
16 |
+cluster_germline = function(ccdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'cluster_idx'){ |
|
17 | 17 |
contig_tbl = ccdb$contig_tbl |
18 | 18 |
seg_types = contig_tbl %>% group_by(!!!syms(segment_keys)) %>% summarize() %>% ungroup() |
19 |
- seg_types[[cluster_name]] = seq_len(nrow(seg_types)) |
|
19 |
+ seg_types[[cluster_pk]] = seq_len(nrow(seg_types)) |
|
20 | 20 |
cl_con_tbl = left_join_warn(seg_types, contig_tbl, by = segment_keys) |
21 |
- cluster_tbl = as_tibble(unique(cl_con_tbl[union(cluster_name, segment_keys)])) |
|
22 |
- replace_cluster_tbl(ccdb, cluster_tbl, cl_con_tbl, cluster_pk = cluster_name) |
|
21 |
+ cluster_tbl = as_tibble(unique(cl_con_tbl[union(cluster_pk, segment_keys)])) |
|
22 |
+ replace_cluster_tbl(ccdb, cluster_tbl, cl_con_tbl, cluster_pk = cluster_pk) |
|
23 | 23 |
} |
24 | 24 |
|
25 | 25 |
globalVariables(c('fc', 'd(medoid)', 'is_medoid', 'n_cluster')) |
... | ... |
@@ -8,7 +8,7 @@ cdhit_ccdb( |
8 | 8 |
ccdb, |
9 | 9 |
sequence_key, |
10 | 10 |
type = c("DNA", "AA"), |
11 |
- cluster_name = "cluster_idx", |
|
11 |
+ cluster_pk = "cluster_idx", |
|
12 | 12 |
... |
13 | 13 |
) |
14 | 14 |
} |
... | ... |
@@ -19,7 +19,7 @@ cdhit_ccdb( |
19 | 19 |
|
20 | 20 |
\item{type}{one of 'DNA' or 'AA'} |
21 | 21 |
|
22 |
-\item{cluster_name}{\code{character} specifying key, and name for the clustering.} |
|
22 |
+\item{cluster_pk}{\code{character} specifying key, and name for the clustering.} |
|
23 | 23 |
|
24 | 24 |
\item{...}{ |
25 | 25 |
Arguments passed on to \code{\link[=cdhit]{cdhit}} |
... | ... |
@@ -7,7 +7,7 @@ |
7 | 7 |
cluster_germline( |
8 | 8 |
ccdb, |
9 | 9 |
segment_keys = c("v_gene", "j_gene", "chain"), |
10 |
- cluster_name = "cluster_idx" |
|
10 |
+ cluster_pk = "cluster_idx" |
|
11 | 11 |
) |
12 | 12 |
} |
13 | 13 |
\arguments{ |
... | ... |
@@ -15,7 +15,7 @@ cluster_germline( |
15 | 15 |
|
16 | 16 |
\item{segment_keys}{fields in \code{contig_tbl} that identify a cluster} |
17 | 17 |
|
18 |
-\item{cluster_name}{name of cluster to be added to \code{cluster_tbl}} |
|
18 |
+\item{cluster_pk}{name of cluster to be added to \code{cluster_tbl}} |
|
19 | 19 |
} |
20 | 20 |
\value{ |
21 | 21 |
\code{\link[=ContigCellDB]{ContigCellDB()}} |
... | ... |
@@ -30,7 +30,7 @@ test_that("Don't need to be sorted", { |
30 | 30 |
data(ccdb_ex) |
31 | 31 |
|
32 | 32 |
test_that("cdhit_ccdb do update", { |
33 |
- res = cdhit_ccdb(ccdb_ex, 'cdr3_nt', type = 'DNA', cluster_name = 'DNA97', identity = .965, min_length = 12, G = 1) |
|
33 |
+ res = cdhit_ccdb(ccdb_ex, 'cdr3_nt', type = 'DNA', cluster_pk = 'DNA97', identity = .965, min_length = 12, G = 1) |
|
34 | 34 |
expect_is(res, 'ContigCellDB') |
35 | 35 |
expect_equal(res$cluster_pk, 'DNA97') |
36 | 36 |
expect_equal(ncol(ccdb_ex$contig_tbl)+1,ncol(res$contig_tbl)) |
... | ... |
@@ -6,7 +6,7 @@ test_that("Cluster contigs by germline properties",{ |
6 | 6 |
expect_is(cdb,'ContigCellDB') |
7 | 7 |
expect_gt(ncol(cdb$cluster_tbl),ncol(ccdb_ex$cluster_tbl)) |
8 | 8 |
expect_equal(names(cdb$cluster_tbl),c('cluster_idx','v_gene','j_gene','chain')) |
9 |
- cdb1 <- cluster_germline(ccdb_ex,segment_keys = c('v_gene', 'j_gene'),cluster_name = 'clusterID') |
|
9 |
+ cdb1 <- cluster_germline(ccdb_ex,segment_keys = c('v_gene', 'j_gene'),cluster_pk = 'clusterID') |
|
10 | 10 |
expect_gt(ncol(cdb$cluster_tbl),ncol(cdb1$cluster_tbl)) |
11 | 11 |
expect_equal(names(cdb1$cluster_tbl)[1],'clusterID') |
12 | 12 |
expect_error(cluster_germline(ccdb_ex,segment_keys = c('v_gene', 'm_gene'))) |
... | ... |
@@ -15,7 +15,7 @@ test_that("Cluster contigs by germline properties",{ |
15 | 15 |
ccdb_ex_small <- ccdb_ex |
16 | 16 |
ccdb_ex_small$cell_tbl <- ccdb_ex_small$cell_tbl[1:200,] |
17 | 17 |
ccdb_ex_small <- cdhit_ccdb(ccdb_ex_small, |
18 |
- sequence_key = 'cdr3_nt', type = 'DNA', cluster_name = 'DNA97', |
|
18 |
+ sequence_key = 'cdr3_nt', type = 'DNA', cluster_pk = 'DNA97', |
|
19 | 19 |
identity = .965, min_length = 12, G = 1) |
20 | 20 |
ccdb_ex_small_fine <- fine_clustering(ccdb_ex_small, sequence_key = 'cdr3_nt', type = 'DNA') |
21 | 21 |
|
... | ... |
@@ -23,8 +23,11 @@ library(stringr) |
23 | 23 |
library(purrr) |
24 | 24 |
``` |
25 | 25 |
|
26 |
+ |
|
26 | 27 |
# Load filtered contig files |
27 | 28 |
|
29 |
+We begin with a `data.frame` of concatenated contig files ('all_contig_annotations.csv'), output from the Cellranger VDJ pipeline. |
|
30 |
+ |
|
28 | 31 |
```{r} |
29 | 32 |
data(contigs_qc) |
30 | 33 |
MIN_CDR3_AA = 6 |
... | ... |
@@ -53,11 +56,9 @@ ggplot(paired_chain, aes(x = interaction(sample, pop), fill = pairing)) + geom_b |
53 | 56 |
|
54 | 57 |
|
55 | 58 |
```{r} |
56 |
- |
|
57 |
-aa80 = cdhit_ccdb(cdb, 'cdr3', type = 'AA', cluster_name = 'aa80', identity = .8) |
|
59 |
+aa80 = cdhit_ccdb(cdb, 'cdr3', type = 'AA', cluster_pk = 'aa80', identity = .8) |
|
58 | 60 |
aa80 = fine_clustering(aa80, sequence_key = 'cdr3', type = 'AA', keep_clustering_details = TRUE) |
59 | 61 |
|
60 |
- |
|
61 | 62 |
# This maybe should be a turned into a function |
62 | 63 |
# Other plots should be considered: |
63 | 64 |
# That show how clusters are split between samples, chains, etc |
... | ... |
@@ -71,7 +72,7 @@ We cluster the CDR3 translated amino acid residues with the program [CD-HIT](htt |
71 | 72 |
# Cluster CDR3 DNA sequences |
72 | 73 |
|
73 | 74 |
```{r, results = 'hide'} |
74 |
-cdb = cdhit_ccdb(cdb, 'cdr3_nt', type = 'DNA', cluster_name = 'DNA97', identity = .965, min_length = MIN_CDR3_AA*3-1, G = 1) |
|
75 |
+cdb = cdhit_ccdb(cdb, 'cdr3_nt', type = 'DNA', cluster_pk = 'DNA97', identity = .965, min_length = MIN_CDR3_AA*3-1, G = 1) |
|
75 | 76 |
cdb = fine_clustering(cdb, sequence_key = 'cdr3_nt', type = 'DNA') |
76 | 77 |
ggplot(cdb$cluster_tbl %>% filter(n_cluster>1) %>% gather(key, value, -DNA97) , aes(x = value))+ facet_wrap(~key, scales = 'free') + geom_histogram() + scale_y_sqrt() |
77 | 78 |
``` |
... | ... |
@@ -81,7 +82,7 @@ We can also cluster by DNA identity. |
81 | 82 |
# Cluster by V-J identity |
82 | 83 |
|
83 | 84 |
```{r} |
84 |
-germline_cluster = cluster_germline(cdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_name = 'segment_idx') |
|
85 |
+germline_cluster = cluster_germline(cdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx') |
|
85 | 86 |
``` |
86 | 87 |
|
87 | 88 |
We can cluster by any other feature of the contigs. Here we cluster each contig based on the chain and V-J genes. This gives us the set of observed V-J pairings: |
... | ... |
@@ -61,7 +61,7 @@ sce = SingleCellExperiment(assay = list(counts = expression), colData = all_bc) |
61 | 61 |
```{r} |
62 | 62 |
ccdb2 = ContigCellDB(ccdb_ex$contig_tbl, contig_pk = ccdb_ex$contig_pk, cell_tbl = colData(sce), cell_pk = ccdb_ex$cell_pk, equalize = FALSE) |
63 | 63 |
|
64 |
-ccdb2 = cdhit_ccdb(ccdb2, 'cdr3', type = 'AA', cluster_name = 'aa80', identity = .8, min_length = 5) |
|
64 |
+ccdb2 = cdhit_ccdb(ccdb2, 'cdr3', type = 'AA', cluster_pk = 'aa80', identity = .8, min_length = 5) |
|
65 | 65 |
ccdb2 = fine_clustering(ccdb2, sequence_key = 'cdr3', type = 'AA', keep_clustering_details = FALSE) |
66 | 66 |
|
67 | 67 |
``` |