Browse code

Expand/cleanup cdr3 clustering vignette Add cluster_plot function Add a (somewhat fragile) function to test if fine_clustering has been run.

Andrew McDavid authored on 19/10/2020 20:32:40
Showing 8 changed files

... ...
@@ -2,7 +2,7 @@ Type: Package
2 2
 Package: CellaRepertorium
3 3
 Title: Data structures, clustering and testing for single 
4 4
     cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)
5
-Version: 0.99.4
5
+Version: 0.99.5
6 6
 Authors@R: 
7 7
     c(person(given = "Andrew",
8 8
              family = "McDavid",
... ...
@@ -59,7 +59,8 @@ Suggests:
59 59
     RColorBrewer,
60 60
     SingleCellExperiment,
61 61
     scater,
62
-    broom.mixed
62
+    broom.mixed,
63
+    cowplot
63 64
 LinkingTo: 
64 65
     Rcpp
65 66
 VignetteBuilder: 
... ...
@@ -11,6 +11,7 @@ export(cluster_filterset)
11 11
 export(cluster_germline)
12 12
 export(cluster_logistic_test)
13 13
 export(cluster_permute_test)
14
+export(cluster_plot)
14 15
 export(cluster_test_by)
15 16
 export(crosstab_by_celltype)
16 17
 export(entropy)
... ...
@@ -112,6 +112,10 @@ left_join_warn = function(x, y, by, overwrite = FALSE, join = left_join, ...){
112 112
     join(x  = x, y = y, by = by, suffix = c('', '.y'), ...)
113 113
 }
114 114
 
115
+has_fineclustering = function(ccdb){
116
+    ('is_medoid' %in% names(ccdb$contig_tbl))
117
+}
118
+
115 119
 #' Find a canonical contig to represent a cluster
116 120
 #'
117 121
 #' @param ccdb [ContigCellDB()]
... ...
@@ -141,8 +145,8 @@ tie_break_keys = character(), order = 1, representative = ccdb$cluster_pk[1], co
141 145
         message("Filtering `contig_tbl` by `is_medoid`, override by setting `contig_filter_args == TRUE`")
142 146
         contig_filter_args = quote(is_medoid)
143 147
     }
144
-    if(!('is_medoid' %in% names(ccdb$contig_tbl))){
145
-        stop('Run `fine_clustering` first.')
148
+    if(!has_fineclustering(ccdb)){
149
+        stop('Run `fine_clustering(ccdb)` first.')
146 150
     }
147 151
     req_contig_fields = unique(c(contig_fields, representative, tie_break_keys))
148 152
     if(length(missing_contig <- setdiff(req_contig_fields, names(ccdb$contig_tbl))) > 0) stop('`contig_tbl` is missing fields, ', paste(missing_contig, collapse = ', '), '.')
... ...
@@ -239,7 +243,6 @@ fine_cluster_seqs = function(seqs, type = 'AA', big_memory_brute = FALSE, method
239 243
     list(cluster = hc, distance_mat = sd, distance = distance, medoid = medoid, max_dist = max(sd))
240 244
 }
241 245
 
242
-
243 246
 #' Calculate the entropy of a vector
244 247
 #'
245 248
 #' @param v categorical vector
... ...
@@ -1,5 +1,5 @@
1 1
 get_axis_labels = function(plt){
2
-    if(!requireNamespace('ggplot2')) stop("Install ggplot2 >= 3.0.0")
2
+    check_plot_infra()
3 3
 
4 4
     b = ggplot2::ggplot_build(plt)$layout$panel_params[[1]]
5 5
     if(utils::packageVersion('ggplot2') >='3.3.0'){
... ...
@@ -12,10 +12,6 @@ get_axis_labels = function(plt){
12 12
 }
13 13
 
14 14
 
15
-if(requireNamespace('ggplot2')) {
16
-    ScaleAxisColor = ggplot2::ggproto('ScaleAxisColor', ggplot2::ScaleDiscrete)
17
-}
18
-
19 15
 replace_empty = function(x, y) if(length(x) == 0) y else x
20 16
 
21 17
 #' Color axis labels
... ...
@@ -35,20 +31,27 @@ replace_empty = function(x, y) if(length(x) == 0) y else x
35 31
 #' require(dplyr)
36 32
 #' plt = ggplot(mpg, aes(x = manufacturer, y = drv)) + geom_jitter()
37 33
 #' label_data = mpg %>% select(manufacturer) %>% unique() %>%
38
-#' mutate(euro = manufacturer %in% c('audi', 'volkswagen'), drv = NA_character_)
39
-#' map_axis_labels(plt, label_data, label_data, euro)
40
-map_axis_labels = function(plt, label_data_x,  label_data_y, aes_label, scale = ggplot2::scale_color_hue(aesthetics = 'axis_color')){
34
+#' mutate(euro = manufacturer %in% c('audi', 'volkswagen'))
35
+#' map_axis_labels(plt, label_data_x = label_data, aes_label = euro)
36
+map_axis_labels = function(plt, label_data_x = NULL,  label_data_y = NULL, aes_label, scale = ggplot2::scale_color_hue(aesthetics = 'axis_color')){
41 37
     actual_labs = get_axis_labels(plt)
42 38
     aes_label_str = rlang::as_name(rlang::enquo(aes_label))
43 39
     label_data = bind_rows(label_data_x, label_data_y)
44
-    data_x = hushWarning(left_join(tibble(X_ = actual_labs$xlabels), label_data_x, by = c(X_ = rlang::as_name(plt$mapping$x))), 'coercing into character')
45
-    if(nrow(data_x) != length(actual_labs$xlabels)) stop("Bad join on xlabels!")
46
-    data_y = hushWarning(left_join(tibble(Y_ := actual_labs$ylabels), label_data_y,  by = c(Y_ = rlang::as_name(plt$mapping$y))), 'coercing into character')
47
-    if(nrow(data_y) != length(actual_labs$ylabels)) stop("Bad join on ylabels!")
48 40
     scale$train(label_data[[aes_label_str]])
49
-    data_x$COLOR = replace_empty(scale$map(data_x[[aes_label_str]]),  'black')
50
-    data_y$COLOR = replace_empty(scale$map(data_y[[aes_label_str]]), 'black')
51
-    plt + hushWarning(ggplot2::theme(axis.text.x = ggplot2::element_text(color = data_x$COLOR), axis.text.y = ggplot2::element_text(color = data_y$COLOR)), 'Vectorized input')
41
+    thm = ggplot2::theme()
42
+    if(!is.null(label_data_x)){
43
+        data_x = hushWarning(left_join(tibble(X_ = actual_labs$xlabels), label_data_x, by = c(X_ = rlang::as_name(plt$mapping$x))), 'coercing into character')
44
+        if(nrow(data_x) != length(actual_labs$xlabels)) stop("Bad join on xlabels!")
45
+        data_x$COLOR = replace_empty(scale$map(data_x[[aes_label_str]]),  'black')
46
+        thm = hushWarning(thm + ggplot2::theme(axis.text.x = ggplot2::element_text(color = data_x$COLOR)), 'Vectorized input')
47
+    }
48
+    if(!is.null(label_data_y)){
49
+        data_y = hushWarning(left_join(tibble(Y_ := actual_labs$ylabels), label_data_y,  by = c(Y_ = rlang::as_name(plt$mapping$y))), 'coercing into character')
50
+        if(nrow(data_y) != length(actual_labs$ylabels)) stop("Bad join on ylabels!")
51
+        data_y$COLOR = replace_empty(scale$map(data_y[[aes_label_str]]), 'black')
52
+        thm = hushWarning(thm + ggplot2::theme(axis.text.y = ggplot2::element_text(color = data_y$COLOR)), 'Vectorized input')
53
+    }
54
+    plt + thm
52 55
 }
53 56
 
54 57
 globalVariables('Y_')
55 58
new file mode 100644
... ...
@@ -0,0 +1,29 @@
1
+check_plot_infra = function(){
2
+    if(!requireNamespace('ggplot2')) stop("Install ggplot2 >= 3.0.0.")
3
+    if(!requireNamespace('cowplot')) stop("Install cowplot.")
4
+}
5
+
6
+#' Make a plot showing properties of the clustering
7
+#'
8
+#' The number of elements per cluster and the average distance between the medoid and other elements are plotted.
9
+#' @param cdb A `fine_clustering` `ContigCellDB` object
10
+#' @param return_plotlist should  a list of `ggplot2` plots be returned.  If FALSE, a `cowplot` composite is retuned.
11
+#'
12
+#' @return  a `cowplot` composite or a list of plots.
13
+#' @export
14
+#'
15
+#' @example inst/examples/small_cluster_example.R
16
+#' @examples
17
+#' cluster_plot(ccdb_ex_small)
18
+cluster_plot = function(cdb, return_plotlist = FALSE){
19
+    check_plot_infra()
20
+    if(!has_fineclustering(cdb)) stop("Run `cdhit_cdb(cdb)` and/or `fine_clustering(cdb)` first.")
21
+    dist_expanded = dplyr::filter(cdb$cluster_tbl, .data$n_cluster>1)
22
+    n_cluster = cdb$cluster_tbl
23
+    plts = list(
24
+        ggplot2::ggplot(dist_expanded, ggplot2::aes(x = .data$avg_distance)) + ggplot2::geom_histogram() + ggplot2::xlab('Intra distance') + ggplot2::ggtitle(' Distance (non-singletons)', subtitle = cdb$cluster_pk),
25
+        ggplot2::ggplot(cdb$cluster_tbl, ggplot2::aes(x = .data$n_cluster)) + ggplot2::geom_histogram() + ggplot2::xlab('Number of members') + ggplot2::ggtitle('Cluster sizes', subtitle =  cdb$cluster_pk))
26
+    if(return_plotlist) return(plts)
27
+
28
+    cowplot::plot_grid(plotlist = plts)
29
+}
0 30
new file mode 100644
... ...
@@ -0,0 +1,40 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/plotting.R
3
+\name{cluster_plot}
4
+\alias{cluster_plot}
5
+\title{Make a plot showing properties of the clustering}
6
+\usage{
7
+cluster_plot(cdb, return_plotlist = FALSE)
8
+}
9
+\arguments{
10
+\item{cdb}{A \code{fine_clustering} \code{ContigCellDB} object}
11
+
12
+\item{return_plotlist}{should  a list of \code{ggplot2} plots be returned.  If FALSE, a \code{cowplot} composite is retuned.}
13
+}
14
+\value{
15
+a \code{cowplot} composite or a list of plots.
16
+}
17
+\description{
18
+The number of elements per cluster and the average distance between the medoid and other elements are plotted.
19
+}
20
+\examples{
21
+library(dplyr)
22
+data(ccdb_ex)
23
+ccdb_ex_small = ccdb_ex
24
+ccdb_ex_small$cell_tbl = ccdb_ex_small$cell_tbl[1:200,]
25
+ccdb_ex_small = cdhit_ccdb(ccdb_ex_small,
26
+sequence_key = 'cdr3_nt', type = 'DNA', cluster_name = 'DNA97',
27
+identity = .965, min_length = 12, G = 1)
28
+ccdb_ex_small = fine_clustering(ccdb_ex_small, sequence_key = 'cdr3_nt', type = 'DNA')
29
+
30
+# Canonicalize with the medoid contig is probably what is most common
31
+ccdb_medoid = canonicalize_cluster(ccdb_ex_small)
32
+
33
+# But there are other possibilities.
34
+# To pass multiple "AND" filter arguments must use &
35
+ccdb_umi = canonicalize_cluster(ccdb_ex_small,
36
+contig_filter_args = chain == 'TRA' & length > 500, tie_break_keys = 'umis',
37
+contig_fields = c('chain', 'length'))
38
+ccdb_umi$cluster_tbl \%>\% dplyr::select(chain, length) \%>\% summary()
39
+cluster_plot(ccdb_ex_small)
40
+}
... ...
@@ -6,8 +6,8 @@
6 6
 \usage{
7 7
 map_axis_labels(
8 8
   plt,
9
-  label_data_x,
10
-  label_data_y,
9
+  label_data_x = NULL,
10
+  label_data_y = NULL,
11 11
   aes_label,
12 12
   scale = ggplot2::scale_color_hue(aesthetics = "axis_color")
13 13
 )
... ...
@@ -36,6 +36,6 @@ require(ggplot2)
36 36
 require(dplyr)
37 37
 plt = ggplot(mpg, aes(x = manufacturer, y = drv)) + geom_jitter()
38 38
 label_data = mpg \%>\% select(manufacturer) \%>\% unique() \%>\%
39
-mutate(euro = manufacturer \%in\% c('audi', 'volkswagen'), drv = NA_character_)
40
-map_axis_labels(plt, label_data, label_data, euro)
39
+mutate(euro = manufacturer \%in\% c('audi', 'volkswagen'))
40
+map_axis_labels(plt, label_data_x = label_data, aes_label = euro)
41 41
 }
... ...
@@ -1,12 +1,18 @@
1 1
 ---
2
-title: "Clustering repertoire via CDR3 sequences"
2
+title: "Clustering and differential usage of repertoire CDR3 sequences"
3 3
 output: BiocStyle::html_document
4
+author:
5
+- name: Andrew McDavid
6
+  affiliation: University of Rochester, Department of Biostatistics and Computational Biology
7
+  email: Andrew_McDavid@urmc.rochester.edu
4 8
 vignette: >
5
-  %\VignetteIndexEntry{Clustering repertoire via CDR3 sequences}
9
+  %\VignetteIndexEntry{Clustering and differential usage of repertoire CDR3 sequences}
6 10
   %\VignetteEngine{knitr::rmarkdown}
7 11
   %\VignetteEncoding{UTF-8}
8 12
 ---
9 13
 
14
+In this vignette we demonstrate clustering of 3rd complementary determining region sequence (CDR3) and V-J gene identity of mouse T cells,  ways to visualize and explore clusters that are expanded, pairing of alpha-beta clusters, tests of differential CDR3 usage, and permutation tests for overall clonal properties.
15
+
10 16
 ```{r, include = FALSE}
11 17
 knitr::opts_chunk$set(
12 18
   collapse = TRUE,
... ...
@@ -24,6 +30,7 @@ library(purrr)
24 30
 ```
25 31
 
26 32
 
33
+
27 34
 # Load filtered contig files
28 35
 
29 36
 We begin with a `data.frame` of concatenated contig files ('all_contig_annotations.csv'), output from the Cellranger VDJ pipeline.
... ...
@@ -34,52 +41,67 @@ MIN_CDR3_AA = 6
34 41
 
35 42
 
36 43
 cdb = ContigCellDB_10XVDJ(contigs_qc, contig_pk = c('barcode', 'pop', 'sample', 'contig_id'), cell_pk = c('barcode', 'pop', 'sample'))
37
-
38
-cdb$contig_tbl = dplyr::filter(cdb$contig_tbl, full_length, productive == 'True', high_confidence, chain != 'Multi', str_length(cdr3) > MIN_CDR3_AA) %>% mutate( fancy_name = fancy_name_contigs(., str_c(pop, '_', sample)))
39
-
44
+cdb
40 45
 ```
41 46
 
42
-`r nrow(cdb)` good chains (either TRA or TRB); each cell can appear more than once.
47
+Initially we start with  `r nrow(cdb)` cells and `r nrow(cdb$contig_tbl)` contigs.  We keep contigs that are 
43 48
 
49
+* full - length 
50
+* productive
51
+* high-confidence
52
+* only from T cells
53
+* and with CDR3 sufficiently long.
44 54
 
45
-# Chain pairings
55
+Then we add a descriptive readable name for each contig.
46 56
 
47 57
 ```{r}
48
-paired_chain = enumerate_pairing(cdb, chain_recode_fun = 'guess')
49
-
50
-ggplot(paired_chain, aes(x = interaction(sample, pop), fill = pairing)) + geom_bar() + facet_wrap(~canonical, scale = 'free_x') + coord_flip() + theme_minimal()
58
+cdb$contig_tbl = dplyr::filter(cdb$contig_tbl, full_length, productive == 'True', high_confidence, chain != 'Multi', str_length(cdr3) > MIN_CDR3_AA) %>% mutate( fancy_name = fancy_name_contigs(., str_c(pop, '_', sample)))
51 59
 
52 60
 ```
53 61
 
62
+After filtering, there are `r nrow(cdb)` cells and `r nrow(cdb$contig_tbl)` contigs.
63
+
64
+
54 65
 
55
-# Cluster CDR3 protein sequences
66
+# Clustering contigs by sequence characteristics
56 67
 
68
+As a first step to define clonotypes, we will first find equivalence classes of CDR3 sequences with the program [CD-HIT](http://weizhongli-lab.org/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit). In this case, we use the translated amino acid residues, but often one might prefer to use the DNA sequences, by setting the `sequence_key` accordingly and `type = 'DNA'`. Additionally, a higher identity threshold might be appropriate  (see below).
57 69
 
58 70
 ```{r}
59
-aa80 = cdhit_ccdb(cdb, 'cdr3', type = 'AA', cluster_pk = 'aa80', identity = .8)
71
+aa80 = cdhit_ccdb(cdb, sequence_key = 'cdr3', type = 'AA', cluster_pk = 'aa80', 
72
+                  identity = .8, min_length = 5, G = 1)
60 73
 aa80 = fine_clustering(aa80, sequence_key = 'cdr3', type = 'AA', keep_clustering_details = TRUE)
74
+```
75
+
76
+This partitions sequences into sets with >80% mutual similarity in the amino acid sequence,  adds some additional information about the clustering, and returns it as a `ContigCellDB` object named `aa80`.  The primary key for the clusters is `r aa80$cluster_pk`.  The `min_length` can be set somewhat smaller, but there is a lower limit for the cdhit algorithm.  `G=1`, the default, specifies a global alignment.  This is almost always what is desired, but local alignment is available if `G=0`.
77
+
78
+```{r}
79
+head(aa80$cluster_tbl)
80
+head(aa80$contig_tbl) %>% select(contig_id, aa80, is_medoid, `d(medoid)`)
81
+```
82
+
83
+The `cluster_tbl`  lists the `r nrow(aa80$cluster_tbl)` 80% identity groups found, including the number of contigs in the cluster, and the average distance between elements in the group.
84
+In the `contig_tbl`, there are two columns specifying if the contig `is_medoid`, that is, is the most representative element of the set and the distance to the medoid element `d(medoid)`.
61 85
 
62
-# This maybe should be a turned into a function 
63
-# Other plots should be considered:
64
-# That show how clusters are split between samples, chains, etc
65
-ggplot(aa80$cluster_tbl %>% filter(n_cluster>1) %>% gather(key, value, -aa80, -fc) , aes(x = value))+ facet_wrap(~key, scales = 'free') + geom_histogram() + scale_y_sqrt()
66 86
 
87
+```{r}
88
+cluster_plot(aa80)
67 89
 ```
68 90
 
69
-We cluster the CDR3 translated amino acid residues with the program [CD-HIT](http://weizhongli-lab.org/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit).  A sequence is included in a cluster if it matches by 100% similarity and has the same CDR3 length.  Note that this can and should be relaxed -- especially in the beta chain we see "near clones" that only differ by a residue or two, seemingly in stylized places.
70 91
 
71 92
 
72
-# Cluster CDR3 DNA sequences
93
+## Cluster CDR3 DNA sequences
73 94
 
74 95
 ```{r, results = 'hide'}
75 96
 cdb = cdhit_ccdb(cdb, 'cdr3_nt', type = 'DNA', cluster_pk = 'DNA97', identity = .965, min_length = MIN_CDR3_AA*3-1, G = 1)
76 97
 cdb = fine_clustering(cdb, sequence_key = 'cdr3_nt', type = 'DNA')
77
-ggplot(cdb$cluster_tbl %>% filter(n_cluster>1) %>% gather(key, value, -DNA97) , aes(x = value))+ facet_wrap(~key, scales = 'free') + geom_histogram() + scale_y_sqrt()
98
+
99
+cluster_plot(cdb)
78 100
 ```
79 101
 
80 102
 We can also cluster by DNA identity.
81 103
 
82
-# Cluster by V-J identity
104
+## Cluster by V-J identity
83 105
 
84 106
 ```{r}
85 107
 germline_cluster = cluster_germline(cdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx')
... ...
@@ -102,8 +124,7 @@ Average Levenshtein distance of CDR3 within each pair.  This might be turned int
102 124
 
103 125
 
104 126
 
105
-
106
-# Oligo clusters
127
+## Expanded clusters
107 128
 
108 129
 Next, we will examine the clusters that are found in many contigs.  First we will get a canonical contig to represent each cluster.  This will be the medoid contig, by default.
109 130
 
... ...
@@ -131,7 +152,7 @@ knitr::kable(oligo_clusters %>% select(aa80:cdr3, chain:j_gene, avg_distance, n_
131 152
 
132 153
 ```
133 154
 
134
-Report some statistics about these expanded clusters.
155
+Report some statistics about these expanded clusters, such as how often they are found, how many subjects, etc.
135 156
 
136 157
 ```{r}
137 158
 oligo_plot = ggplot(oligo_contigs$contig_tbl, aes(x = representative, fill = chain)) + geom_bar() + coord_flip() + scale_fill_brewer(type = 'qual') + theme_minimal()
... ...
@@ -149,33 +170,40 @@ But come from multiple populations and samples.
149 170
 
150 171
 ## Some simple phylogenetic relationships
151 172
 
173
+By using the within-cluster distances, some rudamentory plots attempting to show phylogenetic associations are possible.  (These are most biologically appropriate for B cells that undergo somatic hypermutation.)  
174
+
152 175
 ```{r}
153 176
 library(ggdendro)
154 177
 
155
-# This should be turned into a function in the package somehow
156
-# But plot arguments will be super-variable
157
-# Maybe just return the `hc` object?
158 178
 dendro_plot = function(ccdb, idx, method = 'complete'){
159 179
     h = filter(ccdb$cluster_tbl, !!sym(ccdb$cluster_pk) == idx) %>% pull(fc) %>% .[[1]]
160 180
     quer = filter(ccdb$contig_tbl, !!sym(ccdb$cluster_pk) == idx)
161 181
     hc = hclust(as.dist(h$distance_mat), method = method) %>% dendro_data(type = "rectangle")
162 182
     hc$labels = cbind(hc$labels, quer)
163 183
    ggplot(hc$segments, aes(x=x, y=y)) + geom_segment(aes(xend=xend, yend=yend)) + 
164
-  theme_classic() + geom_text(data = hc$labels, aes(color = sample, label = fancy_name), size = 3, angle = 60) + scale_x_continuous(breaks = NULL) + ylab('AA Distance') + xlab('')
184
+  theme_classic() + geom_text(data = hc$labels, aes(color = sample, label = fancy_name), size = 3, angle = 60, hjust =0, vjust = 0) + scale_x_continuous(breaks = NULL) + ylab('AA Distance') + xlab('')
165 185
 }
166 186
 
167
-to_plot = filter(aa80$cluster_tbl, n_cluster >= MIN_OLIGO)
187
+to_plot = aa80$cluster_tbl %>% filter(min_rank(-n_cluster) == 1)
168 188
 
169 189
 map(to_plot$aa80, ~ dendro_plot(aa80, .))
170 190
 
171 191
 ```
172 192
 
193
+A full-blown generative model of clonal generation and selection would be recommended for any actual analysis, but these plots may suffice to get a quick idea of the phylogenetic structure.
194
+
173 195
 ## Formal testing for frequency differences
174 196
 
197
+We can test for differential usage of a clone, or cluster with `cluster_logistic_test` and `cluster_test_by`.  The latter splits the `cluster_tbl` by `field = 'chain'`, thereby adjusting the number of cell trials included in the "denominator" of the logistic regression. 
198
+The formula tests for differences between populations, including the sample as a random effect, and only tests clusters that are included in the `oligo_clusters` set.
199
+
200
+
175 201
 ```{r,  results = 'hide'}
176
-mm_out = cluster_test_by(aa80, fields = 'chain', 'cluster_tbl', ~ pop + (1|sample), filterset = cluster_filterset(white_list = oligo_clusters)) %>% left_join(oligo_clusters)
202
+mm_out = cluster_test_by(aa80, fields = 'chain', tbl = 'cluster_tbl', formula = ~ pop + (1|sample), filterset = cluster_filterset(white_list = oligo_clusters)) %>%
203
+  left_join(oligo_clusters)
177 204
 
178
-mm_out = mutate(mm_out, conf.low = estimate-1.96*std.error, conf.high = estimate + 1.96*std.error)
205
+mm_out = mutate(mm_out, conf.low = estimate-1.96*std.error, 
206
+                conf.high = estimate + 1.96*std.error)
179 207
 
180 208
 ```
181 209
 
... ...
@@ -186,9 +214,32 @@ ggplot(mm_outj, aes(x = representative, ymin = conf.low, ymax = conf.high, y = e
186 214
 ```
187 215
 
188 216
 
217
+We test if the binomial rate of clone expression differs between balbc and b6, for the selected clones.  None appear to be different.
218
+
219
+## Length of CDR3
220
+
221
+```{r}
222
+aa80$contig_tbl = aa80$contig_tbl %>% mutate(cdr3_length = str_length(cdr3_nt))
223
+ggplot(aa80$contig_tbl, aes(fill = pop, x= cdr3_length)) +
224
+  geom_histogram(binwidth = 1, mapping = aes(y = ..density..)) + 
225
+  theme_minimal() + scale_fill_brewer(type = 'qual') + 
226
+  facet_grid(sample ~chain) + theme(strip.text.y = element_text(angle = 0)) + coord_cartesian(xlim = c(25, 55))
227
+
228
+```
229
+
230
+Some authors have noted that the length of the CDR3 region can be predictive of T cell differentiation. In our study, there doesn't appear to be a noticeable difference between BALB/c and C57BL/6J (b6) mice, but if we needed to make sure, an appropriate procedure would be to run a mixed model with a random `sample` effect (assumed to represent a biological replicate).
189 231
 
232
+```{r cdr3_len, fig.width = 3, fig.height = 3}
233
+cdr_len = aa80$contig_tbl %>% group_by(chain) %>% do(broom::tidy(lme4::lmer(cdr3_length ~ pop + (1|sample), data = .), conf.int = TRUE))
234
+ggplot(cdr_len %>% filter(term == 'popbalbc'), aes(x = interaction(chain, term), y = estimate, ymin = conf.low, ymax = conf.high)) + 
235
+  geom_pointrange() + theme_minimal() + coord_flip() + 
236
+  ylab('Length(CDR3 Nt)') + xlab('Term/Chain') + geom_hline(yintercept = 0, lty = 2)
237
+
238
+```
239
+
240
+We end up with a (harmless) convergence warning about a singular fit.  This is expected, because the `samples` aren't actually replicates -- they are just subsamples drawn for illustrative purposes.
241
+The Balbc mice  have .5 fewer nucleotides per contig, on average,  and this is not significant.
190 242
 
191
-We test if the binomial rate of clone expression differs between CD31+/- or term, for each clone.
192 243
 
193 244
 # Clonal pairs
194 245
 
... ...
@@ -224,23 +275,78 @@ pairing_list = pairing_tables(aa80, table_order = 2, orphan_level = 1, min_expan
224 275
 
225 276
 By setting `min_expansion = Inf, cluster_whitelist = whitelist` we can examine any pairings for a set of cluster_idx, in this case the ones that were seen multiple times.  Interestingly (and unlike some human samples) the expanded clusters are  $\beta$-chain, and their $\alpha$ chains are sprinkled quite evenly across clusters.
226 277
 
227
-# Length of CDR3
278
+# Permutation tests
279
+
280
+Permutation tests allow tests of independence between cluster assignments and other cell-level covariates (such as the sample from which the cell was derived).  The cluster label is permuted to break the link between cell and cluster, and an arbitrary statistic of both cluster label, and cell covariate is evaluated.
228 281
 
229 282
 ```{r}
230
-aa80$contig_tbl = aa80$contig_tbl %>% mutate(cdr3_length = str_length(cdr3_nt))
231
-ggplot(aa80$contig_tbl, aes(fill = pop, x= cdr3_length)) + geom_histogram(binwidth = 1, mapping = aes(y = ..density..)) + theme_minimal() + scale_fill_brewer(type = 'qual') + facet_grid(sample ~chain) + theme(strip.text.y = element_text(angle = 0)) + coord_cartesian(xlim = c(25, 55))
283
+aa80_chain = split_cdb(aa80, 'chain') %>% lapply(canonicalize_cell, contig_fields = 'aa80')
284
+
285
+compare_expanded = function(cluster_idx, grp){
286
+  # cluster_idx contains the permuted cluster assignments
287
+  # grp the cell_covariate_keys.
288
+  # NB: this is always a data.frame even if it is just a single column
289
+  # cross tab by pop
290
+  tab = table(cluster_idx, grp[[1]])
291
+  # count number of times an aa80 class was expanded
292
+  expanded = colSums(tab>=2)
293
+  # compare difference
294
+  expanded['b6'] - expanded['balbc']
295
+}
296
+```
297
+
298
+The signature of the statistic should be of a vector `cluster_idx` and `data.frame`.
232 299
 
300
+```{r}
301
+set.seed(1234)
302
+perm1 = cluster_permute_test(aa80_chain$TRB, cell_covariate_keys = 'pop', cell_label_key = 'aa80', n_perm = 100, statistic = compare_expanded)
303
+
304
+perm1
233 305
 ```
234 306
 
235
-Plot the CDR3 length distribution for each sample and pop.  There doesn't appear to be a noticeable difference between BALB/c and C57BL/6J (b6) mice, but if we needed to make sure, an appropriate procedure would be to run a mixed model with a random `sample` effect (assumed to represent a biological replicate).
307
+Although b6 mice had `r perm1$observed` more clones observed to be expanded (occuring >=2 times) than balbc, this is not signficant under a null model where cells were permuted between mouse types (populations), where b6 are expected to have about `r round(perm1$expected)` more expanded clones, just due to the additional number of cells sampled in b6 and the particular spectrum of clonal frequencies in this experiment:
308
+
309
+```{r}
310
+knitr::kable(table(pop = aa80_chain$TRB$pop))
311
+```
236 312
 
313
+Indeed if we resample in a way that fixes each group to have the same number of cells:
314
+```{r}
315
+rarify = aa80_chain$TRB$cell_tbl %>% group_by(pop) %>% do(slice_sample(., n = 377))
237 316
 
238
-```{r cdr3_len, fig.width = 3, fig.height = 3}
239
-cdr_len = aa80$contig_tbl %>% group_by(chain) %>% do(broom::tidy(lme4::lmer(cdr3_length ~ pop + (1|sample), data = .), conf.int = TRUE))
240
-ggplot(cdr_len %>% filter(group == 'fixed', term != '(Intercept)'), aes(x = interaction(chain, term), y = estimate, ymin = conf.low, ymax = conf.high)) + geom_pointrange() + theme_minimal() + coord_flip() + ylab('Length(CDR3 Nt)') + xlab('Term/Chain')
317
+aa80_chain$TRB$cell_tbl = semi_join(aa80_chain$TRB$cell_tbl, rarify)
241 318
 
319
+cluster_permute_test(aa80_chain$TRB, cell_covariate_keys = 'pop', cell_label_key = 'aa80', n_perm = 500, statistic = compare_expanded)
242 320
 ```
243 321
 
244
-We end up with a (harmless) convergence warning about a singular fit.  This is expected, because the `samples` aren't actually replicates -- they are just subsamples drawn for illustrative purposes.
245
-The Balbc mice  have .5 fewer nucleotides per contig, on average,  and this is not significant.
322
+We see that this discrepacy between the number of expanded clones between subpopulations is  mostly explained by a greater number of cells sampled in b6, but also random variability plays a role.
323
+
324
+We can also test for oligoclonality, eg, how often is a beta chain expanded in a sample:
325
+
326
+```{r}
327
+count_expanded = function(cluster_idx, grp){
328
+  # clusters x sample contigency table
329
+  tab = table(cluster_idx, grp[[1]])
330
+  # number of cluster x samples that occured more than once
331
+  expanded = sum(tab>1)
332
+  expanded
333
+}
334
+
335
+perm3 = cluster_permute_test(aa80_chain$TRB,  cell_covariate_keys = 'sample', cell_label_key = 'aa80', n_perm = 500, statistic = count_expanded)
336
+perm3
337
+```
246 338
 
339
+`r perm3$observed` expanded clones were observed in each of the two populations vs `r round(perm3$expected)` expected, and this discrepancy would be significant at $p<$ `r ceiling(perm3$p.value*100)/100`.  This is indicating that there is underdispersion -- fewer clusters are expanded than expected, given the spectrum of clonal frequencies and the number of cells per sample.
340
+
341
+To further elucidate this, we can restrict the permutations to maintain certain margins of the table by specifying `cell_stratify_keys.`  This doesn't effect the observed values of the statistics, but will change the expected values (since these are now conditional expectations.)  Here we restrict the permutations within levels of `pop` (eg, only permuting within balbc, and within b6).
342
+
343
+```{r}
344
+cluster_permute_test(aa80_chain$TRB,   cell_covariate_keys = 'sample', cell_stratify_keys = 'pop', cell_label_key = 'aa80', n_perm = 500, statistic = count_expanded)
345
+```
346
+
347
+In the restricted permutations, the expected number of expanded clusters is even greater.  Both of these effects are due to the fact that the "sample" replicates, within each population actually are not biological replicates, which inflates the `cluster_idx` margin of the table.
348
+
349
+# Colophone
350
+```{r}
351
+sessionInfo()
352
+```