Browse code

Rename cluster_name to cluster_pk (since that's what it is).

Andrew McDavid authored on 15/10/2020 14:27:43
Showing 9 changed files

... ...
@@ -14,3 +14,4 @@ manuscript/
14 14
 .ignore
15 15
 R/immcantation-utils.R
16 16
 ^WORDLIST$
17
+^doc$
... ...
@@ -83,15 +83,16 @@ cdhit = function(seqs, identity = NULL, kmerSize = NULL, min_length = 6, s = 1,
83 83
     options = c(uopts, options)
84 84
     options = options[!duplicated(names(options))]
85 85
     options = lapply(options, as.character)
86
-    out = switch(
87
-        class(seqs),
88
-        AAStringSet = cdhitC(options, name, showProgress) + 1,
89
-        DNAStringSet = cdhitestC(options, name, showProgress) + 1,
90
-        stop('seqs must be either AAStringSet or DNAStringSet')
91
-    )
92
-    if(only_index) return(out)
93
-    tibble::tibble(query_name = names(seqs), seq = as.character(seqs), cluster_idx = out) %>%
94
-        dplyr::group_by(cluster_idx) %>% dplyr::mutate(n_cluster = dplyr::n())
86
+    if(type == 'cdhitC'){
87
+        seq_cluster_index = cdhitC(options, name, showProgress) + 1
88
+    } else{
89
+        seq_cluster_index = cdhitestC(options, name, showProgress) + 1
90
+    }
91
+    if(only_index) return(seq_cluster_index)
92
+    tibble::tibble(query_name = names(seqs), seq = as.character(seqs),
93
+                   cluster_idx = seq_cluster_index) %>%
94
+        dplyr::group_by(cluster_idx) %>%
95
+        dplyr::mutate(n_cluster = dplyr::n()) %>% ungroup()
95 96
 }
96 97
 
97 98
 
... ...
@@ -100,7 +101,7 @@ cdhit = function(seqs, identity = NULL, kmerSize = NULL, min_length = 6, s = 1,
100 101
 ##' @param ccdb An object of class [ContigCellDB()]
101 102
 ##' @param sequence_key `character` naming the column in the `contig_tbl` containing the sequence to be clustered
102 103
 ##' @param type one of 'DNA' or 'AA'
103
-##' @param cluster_name `character` specifying key, and name for the clustering.
104
+##' @param cluster_pk `character` specifying key, and name for the clustering.
104 105
 ##' @return [ContigCellDB()]
105 106
 ##' @inheritDotParams cdhit -seqs -only_index
106 107
 ##' @export
... ...
@@ -113,7 +114,7 @@ cdhit = function(seqs, identity = NULL, kmerSize = NULL, min_length = 6, s = 1,
113 114
 ##' res$contig_tbl
114 115
 ##' res$cluster_pk
115 116
 cdhit_ccdb = function(ccdb, sequence_key, type = c('DNA', 'AA'),
116
-                      cluster_name = 'cluster_idx', ...){
117
+                      cluster_pk = 'cluster_idx', ...){
117 118
     seqs = ccdb$contig_tbl[[sequence_key]]
118 119
     if(length(seqs) < 1) stop("No sequences were provided")
119 120
     type = match.arg(type, c('DNA', 'AA'))
... ...
@@ -123,7 +124,8 @@ cdhit_ccdb = function(ccdb, sequence_key, type = c('DNA', 'AA'),
123 124
         seqset = AAStringSet(seqs)
124 125
     }
125 126
     cluster_idx = cdhit(seqset, ..., only_index = TRUE)
126
-    contig_tbl = dplyr::mutate(ccdb$contig_tbl, !!sym(cluster_name) := cluster_idx)
127
-    cluster_tbl = as_tibble(unique(contig_tbl[cluster_name]))
128
-    replace_cluster_tbl(ccdb, cluster_tbl, contig_tbl, cluster_pk = cluster_name)
127
+    contig_tbl = ccdb$contig_tbl
128
+    contig_tbl[[cluster_pk]] = cluster_idx
129
+    cluster_tbl = as_tibble(unique(contig_tbl[cluster_pk]))
130
+    replace_cluster_tbl(ccdb, cluster_tbl, contig_tbl, cluster_pk = cluster_pk)
129 131
 }
... ...
@@ -4,7 +4,7 @@ globalVariables('ngrp')
4 4
 #'
5 5
 #' @param ccdb [ContigCellDB()]
6 6
 #' @param segment_keys fields in `contig_tbl` that identify a cluster
7
-#' @param cluster_name name of cluster to be added to `cluster_tbl`
7
+#' @param cluster_pk name of cluster to be added to `cluster_tbl`
8 8
 #'
9 9
 #' @return [ContigCellDB()]
10 10
 #' @export
... ...
@@ -13,13 +13,13 @@ globalVariables('ngrp')
13 13
 #' data(ccdb_ex)
14 14
 #' ccdb_ex = cluster_germline(ccdb_ex)
15 15
 #' ccdb_ex$cluster_tbl
16
-cluster_germline = function(ccdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_name = 'cluster_idx'){
16
+cluster_germline = function(ccdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'cluster_idx'){
17 17
     contig_tbl = ccdb$contig_tbl
18 18
     seg_types = contig_tbl %>% group_by(!!!syms(segment_keys)) %>% summarize() %>% ungroup()
19
-    seg_types[[cluster_name]] = seq_len(nrow(seg_types))
19
+    seg_types[[cluster_pk]] = seq_len(nrow(seg_types))
20 20
     cl_con_tbl = left_join_warn(seg_types, contig_tbl, by = segment_keys)
21
-    cluster_tbl = as_tibble(unique(cl_con_tbl[union(cluster_name, segment_keys)]))
22
-    replace_cluster_tbl(ccdb, cluster_tbl, cl_con_tbl, cluster_pk = cluster_name)
21
+    cluster_tbl = as_tibble(unique(cl_con_tbl[union(cluster_pk, segment_keys)]))
22
+    replace_cluster_tbl(ccdb, cluster_tbl, cl_con_tbl, cluster_pk = cluster_pk)
23 23
 }
24 24
 
25 25
 globalVariables(c('fc', 'd(medoid)', 'is_medoid', 'n_cluster'))
... ...
@@ -8,7 +8,7 @@ cdhit_ccdb(
8 8
   ccdb,
9 9
   sequence_key,
10 10
   type = c("DNA", "AA"),
11
-  cluster_name = "cluster_idx",
11
+  cluster_pk = "cluster_idx",
12 12
   ...
13 13
 )
14 14
 }
... ...
@@ -19,7 +19,7 @@ cdhit_ccdb(
19 19
 
20 20
 \item{type}{one of 'DNA' or 'AA'}
21 21
 
22
-\item{cluster_name}{\code{character} specifying key, and name for the clustering.}
22
+\item{cluster_pk}{\code{character} specifying key, and name for the clustering.}
23 23
 
24 24
 \item{...}{
25 25
   Arguments passed on to \code{\link[=cdhit]{cdhit}}
... ...
@@ -7,7 +7,7 @@
7 7
 cluster_germline(
8 8
   ccdb,
9 9
   segment_keys = c("v_gene", "j_gene", "chain"),
10
-  cluster_name = "cluster_idx"
10
+  cluster_pk = "cluster_idx"
11 11
 )
12 12
 }
13 13
 \arguments{
... ...
@@ -15,7 +15,7 @@ cluster_germline(
15 15
 
16 16
 \item{segment_keys}{fields in \code{contig_tbl} that identify a cluster}
17 17
 
18
-\item{cluster_name}{name of cluster to be added to \code{cluster_tbl}}
18
+\item{cluster_pk}{name of cluster to be added to \code{cluster_tbl}}
19 19
 }
20 20
 \value{
21 21
 \code{\link[=ContigCellDB]{ContigCellDB()}}
... ...
@@ -30,7 +30,7 @@ test_that("Don't need to be sorted", {
30 30
 data(ccdb_ex)
31 31
 
32 32
 test_that("cdhit_ccdb do update", {
33
-  res = cdhit_ccdb(ccdb_ex, 'cdr3_nt', type = 'DNA', cluster_name = 'DNA97', identity = .965, min_length = 12, G = 1)
33
+  res = cdhit_ccdb(ccdb_ex, 'cdr3_nt', type = 'DNA', cluster_pk = 'DNA97', identity = .965, min_length = 12, G = 1)
34 34
   expect_is(res, 'ContigCellDB')
35 35
   expect_equal(res$cluster_pk, 'DNA97')
36 36
   expect_equal(ncol(ccdb_ex$contig_tbl)+1,ncol(res$contig_tbl))
... ...
@@ -6,7 +6,7 @@ test_that("Cluster contigs by germline properties",{
6 6
   expect_is(cdb,'ContigCellDB')
7 7
   expect_gt(ncol(cdb$cluster_tbl),ncol(ccdb_ex$cluster_tbl))
8 8
   expect_equal(names(cdb$cluster_tbl),c('cluster_idx','v_gene','j_gene','chain'))
9
-  cdb1 <- cluster_germline(ccdb_ex,segment_keys = c('v_gene', 'j_gene'),cluster_name = 'clusterID')
9
+  cdb1 <- cluster_germline(ccdb_ex,segment_keys = c('v_gene', 'j_gene'),cluster_pk = 'clusterID')
10 10
   expect_gt(ncol(cdb$cluster_tbl),ncol(cdb1$cluster_tbl))
11 11
   expect_equal(names(cdb1$cluster_tbl)[1],'clusterID')
12 12
   expect_error(cluster_germline(ccdb_ex,segment_keys = c('v_gene', 'm_gene')))
... ...
@@ -15,7 +15,7 @@ test_that("Cluster contigs by germline properties",{
15 15
 ccdb_ex_small <- ccdb_ex
16 16
 ccdb_ex_small$cell_tbl <- ccdb_ex_small$cell_tbl[1:200,]
17 17
 ccdb_ex_small <- cdhit_ccdb(ccdb_ex_small,
18
-                           sequence_key = 'cdr3_nt', type = 'DNA', cluster_name = 'DNA97',
18
+                           sequence_key = 'cdr3_nt', type = 'DNA', cluster_pk = 'DNA97',
19 19
                            identity = .965, min_length = 12, G = 1)
20 20
 ccdb_ex_small_fine <- fine_clustering(ccdb_ex_small, sequence_key = 'cdr3_nt', type = 'DNA')
21 21
 
... ...
@@ -23,8 +23,11 @@ library(stringr)
23 23
 library(purrr)
24 24
 ```
25 25
 
26
+
26 27
 # Load filtered contig files
27 28
 
29
+We begin with a `data.frame` of concatenated contig files ('all_contig_annotations.csv'), output from the Cellranger VDJ pipeline.
30
+
28 31
 ```{r}
29 32
 data(contigs_qc)
30 33
 MIN_CDR3_AA = 6
... ...
@@ -53,11 +56,9 @@ ggplot(paired_chain, aes(x = interaction(sample, pop), fill = pairing)) + geom_b
53 56
 
54 57
 
55 58
 ```{r}
56
-
57
-aa80 = cdhit_ccdb(cdb, 'cdr3', type = 'AA', cluster_name = 'aa80', identity = .8)
59
+aa80 = cdhit_ccdb(cdb, 'cdr3', type = 'AA', cluster_pk = 'aa80', identity = .8)
58 60
 aa80 = fine_clustering(aa80, sequence_key = 'cdr3', type = 'AA', keep_clustering_details = TRUE)
59 61
 
60
-
61 62
 # This maybe should be a turned into a function 
62 63
 # Other plots should be considered:
63 64
 # That show how clusters are split between samples, chains, etc
... ...
@@ -71,7 +72,7 @@ We cluster the CDR3 translated amino acid residues with the program [CD-HIT](htt
71 72
 # Cluster CDR3 DNA sequences
72 73
 
73 74
 ```{r, results = 'hide'}
74
-cdb = cdhit_ccdb(cdb, 'cdr3_nt', type = 'DNA', cluster_name = 'DNA97', identity = .965, min_length = MIN_CDR3_AA*3-1, G = 1)
75
+cdb = cdhit_ccdb(cdb, 'cdr3_nt', type = 'DNA', cluster_pk = 'DNA97', identity = .965, min_length = MIN_CDR3_AA*3-1, G = 1)
75 76
 cdb = fine_clustering(cdb, sequence_key = 'cdr3_nt', type = 'DNA')
76 77
 ggplot(cdb$cluster_tbl %>% filter(n_cluster>1) %>% gather(key, value, -DNA97) , aes(x = value))+ facet_wrap(~key, scales = 'free') + geom_histogram() + scale_y_sqrt()
77 78
 ```
... ...
@@ -81,7 +82,7 @@ We can also cluster by DNA identity.
81 82
 # Cluster by V-J identity
82 83
 
83 84
 ```{r}
84
-germline_cluster = cluster_germline(cdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_name = 'segment_idx')
85
+germline_cluster = cluster_germline(cdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx')
85 86
 ```
86 87
 
87 88
 We can cluster by any other feature of the contigs. Here we cluster each contig based on the chain and V-J genes.  This gives us the set of observed V-J pairings:
... ...
@@ -61,7 +61,7 @@ sce = SingleCellExperiment(assay = list(counts = expression), colData = all_bc)
61 61
 ```{r}
62 62
 ccdb2 = ContigCellDB(ccdb_ex$contig_tbl, contig_pk = ccdb_ex$contig_pk, cell_tbl = colData(sce), cell_pk = ccdb_ex$cell_pk, equalize = FALSE)
63 63
 
64
-ccdb2 = cdhit_ccdb(ccdb2, 'cdr3', type = 'AA', cluster_name = 'aa80', identity = .8, min_length = 5)
64
+ccdb2 = cdhit_ccdb(ccdb2, 'cdr3', type = 'AA', cluster_pk = 'aa80', identity = .8, min_length = 5)
65 65
 ccdb2 = fine_clustering(ccdb2, sequence_key = 'cdr3', type = 'AA', keep_clustering_details = FALSE)
66 66
 
67 67
 ```