Browse code

Pairing / canonicalization methods now in separate file

Provide a chain-canonicalization method which is now used by default in pairing_tables

Some doc updates, too.

Andrew McDavid authored on 02/04/2019 16:08:49
Showing 11 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,3 @@
1
+^.*\.Rproj$
2
+^\.Rproj\.user$
3
+extdata/.*\.json$
... ...
@@ -1,70 +1,20 @@
1
-#' For each cell, return a single, canonical chain-cluster
2
-#'
3
-#' In single cell data, multiple chains (heavy-light or alpha-beta) are expected.  In some cases, there could be more than two (eg multiple alpha alleles for T cells).
4
-#' This picks a cluster id for each cell based on the overall prevalence of cluster ids over all cells in `tbl`.
5
-#' If order = 1 then the canonical chain-cluster will be the most prevalent, and if order = 2, it will be the 2nd most prevalent, and so on.  Ties are broken arbitrarily (possibly by lexicographic order of `cluster_idx`).
6
-#' @param tbl `data.frame` containing columns specified in `cell_identifiers` and `cluster_idx`
7
-#' @param cell_identifiers `character` vector specifying columns in `tbl` that identify a cell
8
-#' @param cluster_idx `character` specifying the column in `tbl` that identifies a
9
-#' @param order return the 1st, 2nd, 3rd, etc, most common chain-cluster
10
-#'
11
-#'
12
-#' @return `data.frame` with columns from `cell_identifiers` and a single `cluster_idx` for each cell
13
-#' @export
14
-get_canonical_chain = function(tbl, cell_identifiers = 'barcode', cluster_idx = 'cluster_idx', order = 1){
15
-    prevalence = tbl %>% group_by(!!sym(cluster_idx)) %>% summarize(prev = n())
16
-    tbl_prevalence = left_join(tbl, prevalence, by = cluster_idx)
17
-    tbl_order = tbl_prevalence %>% group_by(!!!syms(cell_identifiers)) %>% summarize(!!cluster_idx := dplyr::nth(!!sym(cluster_idx), -order, prev))
18
-    ungroup(tbl_order)
1
+cluster_germline = function(tbl, segment_identifiers = c('v_gene', 'j_gene', 'chain')){
2
+    seg_types = tbl %>% group_by(!!!syms(segment_identifiers)) %>% summarize() %>% ungroup() %>% mutate(germline_idx = seq_len(nrow(.)))
3
+    tbl %>% left_join(seg_types, by = segment_identifiers)
19 4
 }
20 5
 
21
-# Define canonical chain types per cell
22
-
23
-#' Given a family of similar sequences, return a "representative"
24
-#'
25
-#' @param seqs character vector
26
-#' @param medoid_idx optional index into seqs
27
-#' @param warn_if_distinct Should a warning be emitted if there are distinct elements in seqs?
28
-#'
29
-#' If medoid_idx is supplied, the medoid sequence is returned, otherwise the longest
30
-#' sequence is returned
31
-#'
32
-#' @return character vector
33
-#' @export
34
-#'
35
-#' @examples
36
-#' get_canonical_representative(c('apple', 'manzana', 'pomme'))
37
-get_canonical_representative = function(seqs, medoid_idx, warn_if_distinct = FALSE){
38
-    if(!missing(medoid_idx)){
39
-        if(!is.integer(medoid_idx) || medoid_idx < 1 || medoid_idx > length(seqs)) stop("Illegal `medoid_idx`")
40
-        ii = medoid_idx
41
-    } else{
42
-        len = str_length(seqs)
43
-        ii = which.max(len)
44
-    }
45
-    rep = seqs[ii]
46
-
47
-    if(warn_if_distinct && length(nomatch <- which(seqs != rep)) > 0) warning("At indices ", nomatch, " sequences did not match the representative")
48
-
49
-    return(rep)
50
-}
51
-
52
-
53
-
54
-
55
-
56 6
 fine_cluster_by = function(seqs, by, max_dist = NULL, ...){
57 7
     seq_list = split(seqs, by)
58 8
     fc_list = lapply(seq_list, fine_cluster, ...)
59 9
     if(is.null(max_dist)){
60
-        max_max = max(map_dbl(fc_list, 'max_dist'))
10
+        max_max = max(purrr::map_dbl(fc_list, 'max_dist'))
61 11
     } else {
62 12
         max_max = max_dist
63 13
     }
64
-    affinities = map(fc_list, ~ max_max - .[['distance']])
14
+    affinities = purrr::map(fc_list, ~ max_max - .[['distance']])
65 15
     aff_matrix = Matrix::.bdiag(affinities)
66
-    homologies = unlist(map(fc_list, 'homology'), use.names = FALSE)
67
-    medoid = map_dbl(fc_list, 'medoid')
16
+    homologies = unlist(purrr::map(fc_list, 'homology'), use.names = FALSE)
17
+    medoid = purrr::map_dbl(fc_list, 'medoid')
68 18
     list(affinities, aff_matrix, homologies, medoid)
69 19
 }
70 20
 
... ...
@@ -72,12 +22,15 @@ fine_cluster_by = function(seqs, by, max_dist = NULL, ...){
72 22
 #'
73 23
 #' The distances between AA sequences is defined to be 1-score/max(score) times the median length of the input sequences.
74 24
 #' The distances between nucleotide sequences is defined to be edit_distance/max(edit_distance) times the median length of input sequences.
25
+#'
75 26
 #' @param seqs character vector, DNAStringSet or AAStringSet
76 27
 #' @param type character either `AA` or `DNA` specifying type of `seqs`
77 28
 #' @param big_memory_brute attempt to cluster more than 4000 sequences?  Clustering is quadratic, so this will take a long time and might exhaust memory
78 29
 #' @param method one of 'substitutionMatrix' or 'levenshtein'
79 30
 #' @param substitution_matrix a character vector naming a substition matrix available in Biostrings, or a substitution matrix itself
31
+#' @param cluster_fun `character`, one of "hclust" or "none", determining if distance matrices should also be clustered with `hclust`
80 32
 #' @param cluster_method character passed to `hclust`
33
+#'
81 34
 #' @seealso hclust, stringDist
82 35
 #' @return dendrogram of class `hclust`
83 36
 #' @export
... ...
@@ -87,10 +40,10 @@ fine_cluster_by = function(seqs, by, max_dist = NULL, ...){
87 40
 #' aaseq = Biostrings::readAAStringSet(fasta_path)[1:100]
88 41
 #' cls = fine_cluster(aaseq)
89 42
 #' plot(cls$cluster)
90
-fine_cluster = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'levenshtein', substitution_matrix = 'BLOSUM100', cluster = 'hclust', cluster_method = 'complete'){
43
+fine_cluster = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'levenshtein', substitution_matrix = 'BLOSUM100', cluster_fun = 'hclust', cluster_method = 'complete'){
91 44
     if(length(seqs) > 4000 & !big_memory_brute) stop("Won't try to cluster ", length(seqs), " sequences unless `big_memory_brute` = TRUE.  (Make sure you actually have lots of memory!)")
92 45
     type = match.arg(type, choices = c('AA', 'DNA'))
93
-    cluster = match.arg(cluster, c('hclust', 'none'))
46
+    cluster_fun = match.arg(cluster_fun, c('hclust', 'none'))
94 47
     method = match.arg(method, c('substitutionMatrix', 'levenshtein'))
95 48
 
96 49
     if(is.character(seqs)){
... ...
@@ -122,7 +75,7 @@ fine_cluster = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'l
122 75
         #if(length(ss)>1) sd = sd/max(sd)*median(Biostrings::nchar(ss))
123 76
     }
124 77
 
125
-    if(length(seqs)>1 && cluster != 'none'){
78
+    if(length(seqs)>1 && cluster_fun != 'none'){
126 79
         hc = stats::hclust(as.dist(sd), method = cluster_method)
127 80
         hc$labels = names(ss)
128 81
     } else{
... ...
@@ -169,6 +122,7 @@ entropy = function(v, pseudo_count = length(v)/1000, na.action = na.fail){
169 122
 
170 123
 #' @export
171 124
 #' @describeIn entropy The number of categories exceeding `p` proportion of the total
125
+#' @param p proportion threshold
172 126
 np = function(v, p = .05, pseudo_count = p/5, na.action = na.fail){
173 127
     v = na.action(v)
174 128
     if(length(v) == 0) return(0)
175 129
new file mode 100644
... ...
@@ -0,0 +1,209 @@
1
+#' For each cell, return a single, canonical chain-cluster
2
+#'
3
+#' In single cell data, multiple chains (heavy-light or alpha-beta) are expected.  In some cases, there could be more than two (eg multiple alpha alleles for T cells).
4
+#' This picks a cluster id for each cell based on the overall prevalence of cluster ids over all cells in `tbl`.
5
+#' If order = 1 then the canonical chain-cluster will be the most prevalent, and if order = 2, it will be the 2nd most prevalent, and so on.  Ties are broken arbitrarily (possibly by lexicographic order of `cluster_idx`).
6
+#' @param tbl `data.frame` containing columns specified in `cell_identifiers`, `cluster_idx` and optionally `chain_identifiers`
7
+#' @param cell_identifiers `character` vector specifying columns in `tbl` that identify a cell
8
+#' @param cluster_idx `character` specifying the column in `tbl` that identifies a clsuter
9
+#' @param order return the 1st, 2nd, 3rd, etc, most common chain-cluster
10
+#'
11
+#' @return `data.frame` with columns from `cell_identifiers` and a single `cluster_idx` for each cell
12
+#' @export
13
+canonicalize_by_prevalence = function(tbl, cell_identifiers = 'barcode', cluster_idx = 'cluster_idx', order = 1){
14
+    prevalence = tbl %>% group_by(!!sym(cluster_idx)) %>% summarize(prev = dplyr::n())
15
+    tbl_prevalence = left_join(tbl, prevalence, by = cluster_idx)
16
+    tbl_order = tbl_prevalence %>% group_by(!!!syms(cell_identifiers)) %>% summarize(!!cluster_idx := dplyr::nth(!!sym(cluster_idx), -order, prev))
17
+    ungroup(tbl_order)
18
+}
19
+
20
+
21
+
22
+
23
+#' @param sort_factors `character` vector naming columns in `tbl` to sorted on, within  `cell_identifier`. Sorted by first element first, then ties broken by subsequent elements.  Sorted in decreasing order for each element.
24
+#' @param chain_levels an optional `character` vector providing the sort order of the `chain` column in `tbl`.  Set to length zero to disable.
25
+#' @export
26
+#' @describeIn canonicalize_by_prevalence return a canonical contig by chain type, with TRB/IGH returned first. By default, ties are broken by umis and reads.
27
+canonicalize_by_chain = function(tbl,  cell_identifiers = 'barcode', sort_factors = c('chain', 'umis', 'reads'), cluster_idx = 'cluster_idx', order = 1, chain_levels = c('IGL', 'IGK', 'TRA', 'TRB', 'IGH')){
28
+    #ochain = tbl[[sort_factors[1]]]
29
+    if(length(chain_levels) > 0){
30
+        if(!('chain' %in% names(tbl))) stop('`tbl` must contain a `chain` column to set `chain_levels`')
31
+        tbl = tbl %>% mutate(chain = factor(chain, levels = chain_levels, ordered = TRUE))
32
+    }
33
+    arranging = purrr::map(sort_factors, ~ rlang::quo(desc(!!sym(.x))))
34
+    tbl %>% group_by(!!!syms(cell_identifiers)) %>% dplyr::arrange(!!!arranging) %>% mutate(rank = seq_along(!!sym(sort_factors[[1]]))) %>% dplyr::filter(rank == order) %>% dplyr::select(-rank)
35
+
36
+}
37
+
38
+# Define canonical chain types per cell
39
+
40
+#' Given a family of similar sequences, return a "representative"
41
+#'
42
+#' @param seqs character vector
43
+#' @param medoid_idx optional index into seqs
44
+#' @param warn_if_distinct Should a warning be emitted if there are distinct elements in seqs?
45
+#'
46
+#' If medoid_idx is supplied, the medoid sequence is returned, otherwise the longest
47
+#' sequence is returned
48
+#'
49
+#' @return character vector
50
+#' @export
51
+#'
52
+#' @examples
53
+#' get_canonical_representative(c('apple', 'manzana', 'pomme'))
54
+get_canonical_representative = function(seqs, medoid_idx, warn_if_distinct = FALSE){
55
+    if(!missing(medoid_idx)){
56
+        if(!is.integer(medoid_idx) || medoid_idx < 1 || medoid_idx > length(seqs)) stop("Illegal `medoid_idx`")
57
+        ii = medoid_idx
58
+    } else{
59
+        len = str_length(seqs)
60
+        ii = which.max(len)
61
+    }
62
+    rep = seqs[ii]
63
+
64
+    if(warn_if_distinct && length(nomatch <- which(seqs != rep)) > 0) warning("At indices ", nomatch, " sequences did not match the representative")
65
+
66
+    return(rep)
67
+}
68
+
69
+#' Generate a list of tables representing clusters paired in cells
70
+#'
71
+#' A contingency table of every combination of `cluster_idx` up to `table_order` is generated.
72
+#' Combinations that are found in at least `min_expansion` number of cells are reported.  All cells that have these combinations are returned, as well as cells that only have `orphan_level` of matching `cluster_idx`.
73
+#'
74
+#' For example, if `table_order=2` and `min_expansion=2` then heavy/light or alpha/beta pairs found two or more times will be returned (as well as alpha-alpha pairs, etc, if those are present).
75
+#' If `orphan_level=1` then all cells that share just a single chain with an expanded clone will be returned.
76
+#'
77
+#' The `cluster_idx.1_fct` and `cluster_idx.2_fct` fields in `cell_tbl`, `idx1_tbl`, `idx2_tbl` are cast to factors and ordered such that pairings will tend to occur along the diagonal when they are cross-tabulated.
78
+#' This facilitates plotting.
79
+#'
80
+#' @section Caveats and warnings:
81
+#'  The cell_idx -> cluster_idx map is generally one-to-many, and is resolved by `canonicalize_fun`.  For `table_order>1`, few collisions are expected as most cells will contain no more than 2 clusters.  Any collisions are resolved by returning the most prevalent cluster, across samples.  When two clusters are tied for most prevalent within a cell, the `cluster_idx` returned is arbitrary. Therefore, when `table_order=1`, it is strongly recommended to subset the `cluster_tbl` to just a single chain.
82
+#'
83
+#' @param cluster_tbl a table with all combinations of clusters in all cells
84
+#' @param cell_identifiers character vector naming fields that key a cell
85
+#' @param cluster_idx character naming a single field IDing the clusters
86
+#' @param min_expansion the minimal number of times a pairing needs to occur for it to be reported
87
+#' @param cell_tbl optional, ancillary table with additional cell features.  Must also be keyed by `cell_identifiers`
88
+#' @param feature_tbl optional, ancillary table with additional cluster features.  Must also be keyed by `cluster_idx`
89
+#' @param table_order Integer larger than 1. What order of cluster_idx will be paired, eg, order = 2 means that the most common and second most common cluster_idx will be sought for each cell
90
+#' @param orphan_level Integer larger than 0 and less than or equal to `table_order`.  Given that at least `min_expansion` cells are found that have `table_order` chains identical, how many `cluster_idx` pairs will we match on to select other cells.  Example: `ophan_level=1` means that cells that share just a single chain with the
91
+#' @param cluster_whitelist a table of "cluster_idx" that should always be reported.  In contrast to the `cluster_tbl`, here the clusters must be named "cluster_idx.1", "cluster_idx.2" (if order-2 pairs are being selected).
92
+#' @param cluster_blacklist a table of "cluster_idx" that will never be reported.  Must be named as per `cluster_whitelist`.
93
+#' @param canonicalize_fun a function with signature `canonicalize_fun(cluster_tbl, cell_identifiers, cluster_idx, order = i)` that for each `cell_identifier` returns a single contig that depends on the `order`.  For instance \link{canonicalize_by_prevalence} or \link{canonicalize_by_chain}.
94
+#'
95
+#' @return list of tables.  The `cell_tbl` is keyed by the `cell_identifiers`, with fields "cluster_idx.1", "cluster_idx.2", etc, IDing the contigs present in each cell. "cluster_idx.1_fct" and "cluster_idx.2_fct" cast these fields to factors and are reordered to maximize the number of pairs along the diagonal. The `idx1_tbl` and `idx2_tbl` report information (passed in about the `cluster_idx` by `feature_tbl`.)  The `cluster_pair_tbl` reports all pairings found of contigs, and the number of times observed.
96
+#' @export
97
+#'
98
+#' @seealso canonicalize_by_prevalence, canonicalize_by_chain
99
+#' @importFrom tibble as_data_frame
100
+#' @importFrom dplyr bind_rows left_join ungroup summarize anti_join
101
+#' @importFrom stringr str_length str_c
102
+#' @importFrom rlang sym syms :=
103
+#' @examples
104
+#' library(dplyr)
105
+#' cluster_tbl = data_frame(clust_idx = gl(3, 2), cell_idx = rep(1:3, times = 2))
106
+#' # no pairs found twice
107
+#' pt1 = pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx', canonicalize_by_prevalence)
108
+#' # all pairs found, found once.
109
+#' pt2 = pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx',
110
+#'     canonicalize_by_prevalence, min_expansion = 1)
111
+#' pt2$cell_tbl
112
+#' cluster_tbl2 = bind_rows(cluster_tbl, cluster_tbl %>% mutate(cell_idx = rep(4:6, times = 2)))
113
+#' #all pairs found twice
114
+#' pt3 = pairing_tables(cluster_tbl2, 'cell_idx', 'clust_idx', canonicalize_by_prevalence, min_expansion = 1)
115
+#' pt3$cell_tbl
116
+#' # `canonicalize_by_chain` expects fields `umis`, `reads`
117
+#' # to break ties,  wrap the function to change this
118
+#' cluster_tbl3 = cluster_tbl2 %>%
119
+#'     mutate(umis = 1, reads = 1, chain = rep(c('TRA', 'TRB'), times = 6))
120
+#' pt4 = pairing_tables(cluster_tbl3, 'cell_idx', 'clust_idx',
121
+#'     canonicalize_by_chain, min_expansion = 1, table_order = 2)
122
+pairing_tables = function(cluster_tbl, cell_identifiers = 'barcode', cluster_idx = 'cluster_idx', canonicalize_fun = canonicalize_by_chain, table_order = 2, min_expansion = 2,  orphan_level = 1, cluster_whitelist = NULL, cluster_blacklist = NULL, cell_tbl = NULL, feature_tbl = NULL ){
123
+
124
+    if(orphan_level > table_order) stop('`ophan_level` must be less than or equal to `table_order`')
125
+    if(table_order < 1) stop('Table order must be at least 1')
126
+
127
+    # get `table_order` most common clusters for each cell
128
+    # forcibly rename cluster_idx -> "cluster_idx"
129
+    bar_chain_tbls = purrr::map(seq_len(table_order), function(i) canonicalize_fun(cluster_tbl, cell_identifiers = cell_identifiers, cluster_idx = cluster_idx, order = i) %>% dplyr::select(!!!c(syms(cell_identifiers), rlang::quo(cluster_idx))) %>% dplyr::rename( !!paste0('cluster_idx.', i) := !!cluster_idx))
130
+    # for each cell, what clusters are present
131
+    oligo_cluster_pairs = purrr::reduce(bar_chain_tbls, left_join, by = cell_identifiers)
132
+
133
+    # Set up cluster_ids for indexing into the pairing tables
134
+    cluster_ids = str_c('cluster_idx.', seq_len(table_order))
135
+    cluster_ids_to_select = cluster_ids[seq_len(orphan_level)]
136
+
137
+    # In how many cells do each cluster pairing appear?
138
+    cluster_pair_tbl = oligo_cluster_pairs %>% group_by(!!!syms(cluster_ids)) %>% summarize(n_clone_pairs = dplyr::n())
139
+    # which clusters are expanded
140
+    expanded_cluster = cluster_pair_tbl %>% dplyr::filter(n_clone_pairs >= min_expansion) %>% dplyr::filter_at(.vars = cluster_ids, .vars_predicate = all_vars(!is.na(.)))
141
+    expanded_cluster = ungroup(expanded_cluster) %>% dplyr::select(!!!syms(cluster_ids_to_select), max_pairs = n_clone_pairs)
142
+    if(!is.null(cluster_whitelist)){
143
+        expanded_cluster = bind_rows(expanded_cluster, cluster_whitelist)
144
+    }
145
+    if(!is.null(cluster_blacklist)) expanded_cluster = anti_join(expanded_cluster, cluster_blacklist)
146
+
147
+    # Could have duplicated cluster_ids after binding to the whitelist or from considering orphans
148
+    expanded_cluster = expanded_cluster[!duplicated(expanded_cluster %>% dplyr::select(-max_pairs)),]
149
+    expanded_c1 = oligo_cluster_pairs %>% dplyr::inner_join(expanded_cluster, by = cluster_ids_to_select)
150
+    if(anyDuplicated(expanded_c1 %>% dplyr::select(!!!syms(cell_identifiers)))) stop("Ruhoh, duplicated cell identifiers, this is a bug!")
151
+
152
+    # cross-tab pairings in order to figure out how to order the cluster_idx to put most common pairing on diagonal
153
+    if(nrow(expanded_c1)>0 && table_order > 1){
154
+        expanded_counts = reshape2::dcast(expanded_c1, cluster_idx.1 ~ cluster_idx.2, fun.aggregate = length, value.var = 'max_pairs')
155
+        mat = expanded_counts[,-1]
156
+        #rownames(mat) = rowid[['cluster_idx']]
157
+        ro = hclust(dist(mat))$order
158
+        co = hclust(dist(t(mat)))$order
159
+    } else if (nrow(expanded_c1) == 0) {
160
+        warning('No pairs found')
161
+        return()
162
+    } else{
163
+        expanded_counts = reshape2::dcast(expanded_c1, cluster_idx.1 ~ ., fun.aggregate = length, value.var = 'max_pairs')
164
+        ro = order(expanded_counts[[2]])
165
+        co = NA_integer_
166
+    }
167
+
168
+    ci_class = class(cluster_tbl[[cluster_idx]])
169
+    as_method = if(ci_class == 'factor') as.factor else function(x) as(x, ci_class)
170
+    rowid = data_frame(cluster_idx = expanded_counts$cluster_idx.1 %>% as_method, plot_order = ro)
171
+    colid = suppressWarnings(data_frame(cluster_idx = colnames(expanded_counts)[-1] %>% as_method, plot_order = co))
172
+    rowid[['cluster_idx.1_fct']] = factor(rowid[['cluster_idx']], levels = rowid[['cluster_idx']][ro])
173
+    colid[['cluster_idx.2_fct']] = factor(colid[['cluster_idx']], levels = colid[['cluster_idx']][co])
174
+
175
+
176
+    # also fix levels of this tbl
177
+    expanded_c1 = expanded_c1 %>% mutate(cluster_idx.1_fct = factor(cluster_idx.1, levels = levels(rowid[['cluster_idx.1_fct']])))
178
+    if(table_order>1) expanded_c1 = expanded_c1 %>% mutate(cluster_idx.2_fct = factor(cluster_idx.2, levels = levels(colid[['cluster_idx.2_fct']])))
179
+
180
+    if(!is.null(cell_tbl)){
181
+        if(anyDuplicated(cell_tbl %>% select(!!!syms(cell_identifiers)))) stop('`cell_tbl` must not have duplicate `cell_identifiers`.')
182
+        expanded_c1 = left_join(expanded_c1, cell_tbl, by = cell_identifiers)
183
+    }
184
+
185
+    idx1_tbl = rowid %>% dplyr::rename(!!cluster_idx := cluster_idx)
186
+    idx2_tbl = colid %>% dplyr::rename(!!cluster_idx := cluster_idx)
187
+    if(!is.null(feature_tbl)){
188
+        if(anyDuplicated(feature_tbl %>% select(!!sym(cluster_idx)))) stop('`feature_tbl` must not have duplicate `cluster_idx`.')
189
+        idx1_tbl = left_join(idx1_tbl, feature_tbl, by = cluster_idx)
190
+        idx2_tbl = left_join(idx2_tbl, feature_tbl, by = cluster_idx)
191
+    }
192
+
193
+    list(cell_tbl = expanded_c1, idx1_tbl = idx1_tbl, idx2_tbl = idx2_tbl, cluster_pair_tbl = cluster_pair_tbl)
194
+
195
+}
196
+
197
+plot_pairing = function(pairing_list, color_labels_by){
198
+    pl = pairing_list
199
+    pairs_plt = ggplot(pairing_list$cell_tbl, aes(x = cluster_idx.1_fct, y = cluster_idx.2_fct, color = sample, shape = pop)) + geom_jitter(width = .3, height = .3)
200
+
201
+    ylab = data_frame(!!color_labels_by :=  ggplot_build(pairs_plt)$layout$panel_params[[1]]$y.label) %>% left_join(feature_tbl) %>% mutate(class_color = ifelse(is.na(class_color), '#E41A1C', class_color))
202
+
203
+    xlab = data_frame(!!color_labels_by :=  ggplot_build(pairs_plt)$layout$panel_params[[1]]$x.label) %>% left_join(feature_tbl) %>% mutate(class_color = ifelse(is.na(class_color), '#E41A1C', class_color))
204
+
205
+    pairs_plt = pairs_plt + theme(axis.text.x = element_text(angle = 90, color = xlab$class_color, size = 8), axis.text.y = element_text(color = ylab$class_color, size = 8))
206
+
207
+
208
+}
209
+
... ...
@@ -12,9 +12,11 @@
12 12
 #'
13 13
 #' @examples
14 14
 #' library(dplyr)
15
-#' contig_anno_path = system.file('extdata', 'cellranger_contig_annotation.csv', package = 'CellaRepertorium')
15
+#' contig_anno_path = system.file('extdata', 'cellranger_contig_annotation.csv',
16
+#'     package = 'CellaRepertorium')
16 17
 #' contig_anno = readr::read_csv(contig_anno_path)
17
-#' contig_anno = contig_anno %>% mutate(fancy_name = fancy_name_contigs(., prefix = paste(sample, pop, sep = '_')))
18
+#' contig_anno = contig_anno %>% mutate(fancy_name =
19
+#'     fancy_name_contigs(., prefix = paste(sample, pop, sep = '_')))
18 20
 #' stopifnot(!any(duplicated(contig_anno$fancy_name)))
19 21
 fancy_name_contigs = function(contig_frame, prefix){
20 22
     notin = setdiff(c('chain', 'v_gene', 'd_gene', 'j_gene'), names(contig_frame))
... ...
@@ -31,138 +33,6 @@ variable_genes = function(ref = '10X'){
31 33
 }
32 34
 
33 35
 
34
-#' Generate a list of tables representing clusters paired in cells
35
-#'
36
-#' A contingency table of every combination of `cluster_idx` up to `table_order` is generated.
37
-#' Combinations that are found in at least `min_expansion` number of cells are reported.  All cells that have these combinations are returned, as well as cells that only have `orphan_level` of matching `cluster_idx`.
38
-#'
39
-#' For example, if `table_order=2` and `min_expansion=2` then heavy/light or alpha/beta pairs found two or more times will be returned (as well as alpha-alpha pairs, etc, if those are present).
40
-#' If `orphan_level=1` then all cells that share just a single chain with an expanded clone will be returned.
41
-#'
42
-#' The `cluster_idx.1_fct` and `cluster_idx.2_fct` fields in `cell_tbl`, `idx1_tbl`, `idx2_tbl` are cast to factors and ordered such that pairings will tend to occur along the diagonal when they are cross-tabulated.
43
-#' This facilitates plotting.
44
-#'
45
-#' @section Caveats and warnings:
46
-#'  The cell_idx -> cluster_idx map is generally one-to-many.  For `table_order>1`, few collisions are expected as most cells will contain no more than 2 clusters.  Any collisions are resolved by returning the most prevalent cluster, across samples.  When two clusters are tied for most prevalent within a cell, the `cluster_idx` returned is arbitrary. Therefore, when `table_order=1`, it is strongly recommended to subset the `cluster_tbl` to just a single chain.
47
-#' @param cluster_tbl a table with all combinations of clusters in all cells
48
-#' @param cell_identifiers character vector naming fields that key a cell
49
-#' @param cluster_idx character naming a single field IDing the clusters
50
-#' @param min_expansion the minimal number of times a pairing needs to occur for it to be reported
51
-#' @param cell_tbl optional, ancillary table with additional cell features.  Must also be keyed by `cell_identifiers`
52
-#' @param feature_tbl optional, ancillary table with additional cluster features.  Must also be keyed by `cluster_idx`
53
-#' @param table_order Integer larger than 1. What order of cluster_idx will be paired, eg, order = 2 means that the most common and second most common cluster_idx will be sought for each cell
54
-#' @param orphan_level Integer larger than 0 and less than or equal to `table_order`.  Given that at least `min_expansion` cells are found that have `table_order` chains identical, how many `cluster_idx` pairs will we match on to select other cells.  Example: `ophan_level=1` means that cells that share just a single chain with the
55
-#' @param cluster_whitelist a table of "cluster_idx" that should always be reported.  In contrast to the `cluster_tbl`, here the clusters must be named "cluster_idx.1", "cluster_idx.2" (if order-2 pairs are being selected).
56
-#' @param cluster_blacklist a table of "cluster_idx" that will never be reported.  Must be named as per `cluster_whitelist`.
57
-#'
58
-#' @return list of tables.  The `cell_tbl` is keyed by the `cell_identifiers`, with fields "cluster_idx.1", "cluster_idx.2", etc, IDing the contigs present in each cell. "cluster_idx.1_fct" and "cluster_idx.2_fct" cast these fields to factors and are reordered to maximize the number of pairs along the diagonal. The `idx1_tbl` and `idx2_tbl` report information (passed in about the `cluster_idx` by `feature_tbl`.)  The `cluster_pair_tbl` reports all pairings found of contigs, and the number of times observed.
59
-#' @export
60
-#'
61
-#' @seealso get_canonical_chain
62
-#' @importFrom tibble as_data_frame
63
-#' @importFrom dplyr bind_rows left_join ungroup summarize anti_join
64
-#' @importFrom stringr str_length str_c
65
-#' @importFrom rlang sym syms :=
66
-#' @examples
67
-#' library(dplyr)
68
-#' cluster_tbl = data_frame(clust_idx = gl(3, 2), cell_idx = rep(1:3, times = 2))
69
-#' # no pairs found twice
70
-#' pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx')
71
-#' # all pairs found, found once.
72
-#' pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx', min_expansion = 1)
73
-#' cluster_tbl2 = bind_rows(cluster_tbl, cluster_tbl %>% mutate(cell_idx = rep(4:6, times = 2)))
74
-#' #all pairs found twice
75
-#' pairing_tables(cluster_tbl2, 'cell_idx', 'clust_idx', min_expansion = 1)
76
-#' # A warning if `table_order = 1` -- best to stratify by chain
77
-#' # (could be accomplished by setting the `cell_identifiers` to include a field that identifies the chain).
78
-#' pairing_tables(cluster_tbl2, 'cell_idx', 'clust_idx', min_expansion = 1, table_order = 1)
79
-pairing_tables = function(cluster_tbl, cell_identifiers = 'barcode', cluster_idx = 'cluster_idx', table_order = 2, min_expansion = 2,  orphan_level = 1, cluster_whitelist = NULL, cluster_blacklist = NULL, cell_tbl = NULL, feature_tbl = NULL ){
80
-
81
-    if(orphan_level > table_order) stop('`ophan_level` must be less than or equal to `table_order`')
82
-    if(table_order < 1) stop('Table order must be at least 1')
83
-
84
-    # get `table_order` most common clusters for each cell
85
-    # forcibly rename cluster_idx -> "cluster_idx"
86
-    bar_chain_tbls = purrr::map(seq_len(table_order), function(i) get_canonical_chain(cluster_tbl, cell_identifiers = cell_identifiers, cluster_idx = cluster_idx, order = i) %>% dplyr::rename( !!paste0('cluster_idx.', i) := !!cluster_idx))
87
-    # for each cell, what clusters are present
88
-    oligo_cluster_pairs = purrr::reduce(bar_chain_tbls, left_join, by = cell_identifiers)
89
-
90
-    # Set up cluster_ids for indexing into the pairing tables
91
-    cluster_ids = str_c('cluster_idx.', seq_len(table_order))
92
-    cluster_ids_to_select = cluster_ids[seq_len(orphan_level)]
93
-
94
-    # In how many cells do each cluster pairing appear?
95
-    cluster_pair_tbl = oligo_cluster_pairs %>% group_by(!!!syms(cluster_ids)) %>% summarize(n_clone_pairs = n())
96
-    # which clusters are expanded
97
-    expanded_cluster = cluster_pair_tbl %>% dplyr::filter(n_clone_pairs >= min_expansion) %>% dplyr::filter_at(.vars = cluster_ids_to_select, .vars_predicate = all_vars(!is.na(.)))
98
-    expanded_cluster = ungroup(expanded_cluster) %>% dplyr::select(!!!syms(cluster_ids_to_select), max_pairs = n_clone_pairs)
99
-    if(!is.null(cluster_whitelist)){
100
-        expanded_cluster = bind_rows(expanded_cluster, cluster_whitelist)
101
-    }
102
-    if(!is.null(cluster_blacklist)) expanded_cluster = anti_join(expanded_cluster, cluster_blacklist)
103
-
104
-    # Could have duplicated cluster_ids after binding to the whitelist or from considering orphans
105
-    expanded_cluster = expanded_cluster[!duplicated(expanded_cluster %>% dplyr::select(-max_pairs)),]
106
-    expanded_c1 = oligo_cluster_pairs %>% right_join(expanded_cluster, by = cluster_ids_to_select)
107
-    if(anyDuplicated(expanded_c1 %>% dplyr::select(!!!syms(cell_identifiers)))) stop("Ruhoh, duplicated cell identifiers, this is a bug!")
108
-
109
-    # cross-tab pairings in order to figure out how to order the cluster_idx to put most common pairing on diagonal
110
-    if(nrow(expanded_c1)>0 && table_order > 1){
111
-        expanded_counts = reshape2::dcast(expanded_c1, cluster_idx.1 ~ cluster_idx.2, fun.aggregate = length, value.var = 'max_pairs')
112
-        mat = expanded_counts[,-1]
113
-        #rownames(mat) = rowid[['cluster_idx']]
114
-        ro = hclust(dist(mat))$order
115
-        co = hclust(dist(t(mat)))$order
116
-    } else if (nrow(expanded_c1) == 0) {
117
-       warning('No pairs found')
118
-        return()
119
-    } else{
120
-            expanded_counts = reshape2::dcast(expanded_c1, cluster_idx.1 ~ ., fun.aggregate = length, value.var = 'max_pairs')
121
-            ro = order(expanded_counts[[2]])
122
-            co = NA_integer_
123
-    }
124
-
125
-    ci_class = class(cluster_tbl[[cluster_idx]])
126
-    as_method = if(ci_class == 'factor') as.factor else function(x) as(x, ci_class)
127
-    rowid = data_frame(cluster_idx = expanded_counts$cluster_idx.1 %>% as_method, plot_order = ro)
128
-    colid = suppressWarnings(data_frame(cluster_idx = colnames(expanded_counts)[-1] %>% as_method, plot_order = co))
129
-    rowid[['cluster_idx.1_fct']] = factor(rowid[['cluster_idx']], levels = rowid[['cluster_idx']][ro])
130
-    colid[['cluster_idx.2_fct']] = factor(colid[['cluster_idx']], levels = colid[['cluster_idx']][co])
131
-
132
-
133
-    # also fix levels of this tbl
134
-    expanded_c1 = expanded_c1 %>% mutate(cluster_idx.1_fct = factor(cluster_idx.1, levels = levels(rowid[['cluster_idx.1_fct']])))
135
-    if(table_order>1) expanded_c1 = expanded_c1 %>% mutate(cluster_idx.2_fct = factor(cluster_idx.2, levels = levels(colid[['cluster_idx.2_fct']])))
136
-
137
-    if(!is.null(cell_tbl)){
138
-        if(anyDuplicated(cell_tbl %>% select(!!!syms(cell_identifiers)))) stop('`cell_tbl` must not have duplicate `cell_identifiers`.')
139
-        expanded_c1 = left_join(expanded_c1, cell_tbl, by = cell_identifiers)
140
-    }
141
-
142
-    idx1_tbl = rowid %>% dplyr::rename(!!cluster_idx := cluster_idx)
143
-    idx2_tbl = colid %>% dplyr::rename(!!cluster_idx := cluster_idx)
144
-    if(!is.null(feature_tbl)){
145
-        if(anyDuplicated(feature_tbl %>% select(!!sym(cluster_idx)))) stop('`feature_tbl` must not have duplicate `cluster_idx`.')
146
-        idx1_tbl = left_join(idx1_tbl, feature_tbl, by = cluster_idx)
147
-        idx2_tbl = left_join(idx2_tbl, feature_tbl, by = cluster_idx)
148
-    }
149
-
150
-    list(cell_tbl = expanded_c1, idx1_tbl = idx1_tbl, idx2_tbl = idx2_tbl, cluster_pair_tbl = cluster_pair_tbl)
151
-
152
-}
153
-
154
-plot_pairing = function(pairing_list, color_labels_by){
155
-    pl = pairing_list
156
-    pairs_plt = ggplot(pairing_list$cell_tbl, aes(x = cluster_idx.1_fct, y = cluster_idx.2_fct, color = sample, shape = pop)) + geom_jitter(width = .3, height = .3)
157
-
158
-    ylab = data_frame(!!color_labels_by :=  ggplot_build(pairs_plt)$layout$panel_params[[1]]$y.label) %>% left_join(feature_tbl) %>% mutate(class_color = ifelse(is.na(class_color), '#E41A1C', class_color))
159
-
160
-    xlab = data_frame(!!color_labels_by :=  ggplot_build(pairs_plt)$layout$panel_params[[1]]$x.label) %>% left_join(feature_tbl) %>% mutate(class_color = ifelse(is.na(class_color), '#E41A1C', class_color))
161
-
162
-    pairs_plt = pairs_plt + theme(axis.text.x = element_text(angle = 90, color = xlab$class_color, size = 8), axis.text.y = element_text(color = ylab$class_color, size = 8))
163
-
164
-
165
-}
166 36
 
167 37
 cleanup_annotations = function(json){
168 38
     anno = json[['annotations']]
169 39
new file mode 100644
... ...
@@ -0,0 +1,4 @@
1
+# External data
2
+T and B cells from healthy PBMC donor, V3 chemistry
3
+https://support.10xgenomics.com/single-cell-vdj/datasets/3.0.0/vdj_v1_hs_pbmc2_t
4
+
0 5
new file mode 100644
... ...
@@ -0,0 +1,41 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/pairing-methods.R
3
+\name{canonicalize_by_prevalence}
4
+\alias{canonicalize_by_prevalence}
5
+\alias{canonicalize_by_chain}
6
+\title{For each cell, return a single, canonical chain-cluster}
7
+\usage{
8
+canonicalize_by_prevalence(tbl, cell_identifiers = "barcode",
9
+  cluster_idx = "cluster_idx", order = 1)
10
+
11
+canonicalize_by_chain(tbl, cell_identifiers = "barcode",
12
+  sort_factors = c("chain", "umis", "reads"),
13
+  cluster_idx = "cluster_idx", order = 1, chain_levels = c("IGL",
14
+  "IGK", "TRA", "TRB", "IGH"))
15
+}
16
+\arguments{
17
+\item{tbl}{`data.frame` containing columns specified in `cell_identifiers`, `cluster_idx` and optionally `chain_identifiers`}
18
+
19
+\item{cell_identifiers}{`character` vector specifying columns in `tbl` that identify a cell}
20
+
21
+\item{cluster_idx}{`character` specifying the column in `tbl` that identifies a clsuter}
22
+
23
+\item{order}{return the 1st, 2nd, 3rd, etc, most common chain-cluster}
24
+
25
+\item{sort_factors}{`character` vector naming columns in `tbl` to sorted on, within  `cell_identifier`. Sorted by first element first, then ties broken by subsequent elements.  Sorted in decreasing order for each element.}
26
+
27
+\item{chain_levels}{an optional `character` vector providing the sort order of the `chain` column in `tbl`.  Set to length zero to disable.}
28
+}
29
+\value{
30
+`data.frame` with columns from `cell_identifiers` and a single `cluster_idx` for each cell
31
+}
32
+\description{
33
+In single cell data, multiple chains (heavy-light or alpha-beta) are expected.  In some cases, there could be more than two (eg multiple alpha alleles for T cells).
34
+This picks a cluster id for each cell based on the overall prevalence of cluster ids over all cells in `tbl`.
35
+If order = 1 then the canonical chain-cluster will be the most prevalent, and if order = 2, it will be the 2nd most prevalent, and so on.  Ties are broken arbitrarily (possibly by lexicographic order of `cluster_idx`).
36
+}
37
+\section{Functions}{
38
+\itemize{
39
+\item \code{canonicalize_by_chain}: return a canonical contig by chain type, with TRB/IGH returned first. By default, ties are broken by umis and reads.
40
+}}
41
+
... ...
@@ -34,7 +34,8 @@ For example, \code{clusters} could be a clonal ID, while \code{labels} is a subj
34 34
 \examples{
35 35
 library(dplyr)
36 36
 purity = function(clusters, labels, covariates){
37
-n_label_cluster = data_frame(labels = labels, clusters = clusters) \%>\% group_by(clusters, labels) \%>\% summarize(n = n()) \%>\% ungroup()
37
+n_label_cluster = data_frame(labels = labels, clusters = clusters) \%>\%
38
+    group_by(clusters, labels) \%>\% summarize(n = n()) \%>\% ungroup()
38 39
 singletons = mean(n_label_cluster$n == 1)
39 40
 singletons
40 41
 }
... ...
@@ -18,6 +18,8 @@ modal_category(v, na.action = na.fail)
18 18
 \item{pseudo_count}{number of pseudo counts to add on, to stablize empty categories}
19 19
 
20 20
 \item{na.action}{how to handle NA values}
21
+
22
+\item{p}{proportion threshold}
21 23
 }
22 24
 \value{
23 25
 the sample entropy
... ...
@@ -21,6 +21,7 @@ Generate a legible name for a series of contigs
21 21
 library(dplyr)
22 22
 contig_anno_path = system.file('extdata', 'cellranger_contig_annotation.csv', package = 'CellaRepertorium')
23 23
 contig_anno = readr::read_csv(contig_anno_path)
24
-contig_anno = contig_anno \%>\% mutate(fancy_name = fancy_name_contigs(., prefix = paste(sample, pop, sep = '_')))
24
+contig_anno = contig_anno \%>\% mutate(fancy_name =
25
+    fancy_name_contigs(., prefix = paste(sample, pop, sep = '_')))
25 26
 stopifnot(!any(duplicated(contig_anno$fancy_name)))
26 27
 }
27 28
new file mode 100644
... ...
@@ -0,0 +1,27 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/pairing-methods.R
3
+\name{get_canonical_representative}
4
+\alias{get_canonical_representative}
5
+\title{Given a family of similar sequences, return a "representative"}
6
+\usage{
7
+get_canonical_representative(seqs, medoid_idx, warn_if_distinct = FALSE)
8
+}
9
+\arguments{
10
+\item{seqs}{character vector}
11
+
12
+\item{medoid_idx}{optional index into seqs}
13
+
14
+\item{warn_if_distinct}{Should a warning be emitted if there are distinct elements in seqs?
15
+
16
+If medoid_idx is supplied, the medoid sequence is returned, otherwise the longest
17
+sequence is returned}
18
+}
19
+\value{
20
+character vector
21
+}
22
+\description{
23
+Given a family of similar sequences, return a "representative"
24
+}
25
+\examples{
26
+get_canonical_representative(c('apple', 'manzana', 'pomme'))
27
+}
0 28
new file mode 100644
... ...
@@ -0,0 +1,76 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/pairing-methods.R
3
+\name{pairing_tables}
4
+\alias{pairing_tables}
5
+\title{Generate a list of tables representing clusters paired in cells}
6
+\usage{
7
+pairing_tables(cluster_tbl, cell_identifiers = "barcode",
8
+  cluster_idx = "cluster_idx",
9
+  canonicalize_fun = canonicalize_by_chain, table_order = 2,
10
+  min_expansion = 2, orphan_level = 1, cluster_whitelist = NULL,
11
+  cluster_blacklist = NULL, cell_tbl = NULL, feature_tbl = NULL)
12
+}
13
+\arguments{
14
+\item{cluster_tbl}{a table with all combinations of clusters in all cells}
15
+
16
+\item{cell_identifiers}{character vector naming fields that key a cell}
17
+
18
+\item{cluster_idx}{character naming a single field IDing the clusters}
19
+
20
+\item{canonicalize_fun}{a function with signature `canonicalize_fun(cluster_tbl, cell_identifiers, cluster_idx, order = i)` that for each `cell_identifier` returns a single contig that depends on the `order`.  For instance \link{canonicalize_by_prevalence} or \link{canonicalize_by_chain}.}
21
+
22
+\item{table_order}{Integer larger than 1. What order of cluster_idx will be paired, eg, order = 2 means that the most common and second most common cluster_idx will be sought for each cell}
23
+
24
+\item{min_expansion}{the minimal number of times a pairing needs to occur for it to be reported}
25
+
26
+\item{orphan_level}{Integer larger than 0 and less than or equal to `table_order`.  Given that at least `min_expansion` cells are found that have `table_order` chains identical, how many `cluster_idx` pairs will we match on to select other cells.  Example: `ophan_level=1` means that cells that share just a single chain with the}
27
+
28
+\item{cluster_whitelist}{a table of "cluster_idx" that should always be reported.  In contrast to the `cluster_tbl`, here the clusters must be named "cluster_idx.1", "cluster_idx.2" (if order-2 pairs are being selected).}
29
+
30
+\item{cluster_blacklist}{a table of "cluster_idx" that will never be reported.  Must be named as per `cluster_whitelist`.}
31
+
32
+\item{cell_tbl}{optional, ancillary table with additional cell features.  Must also be keyed by `cell_identifiers`}
33
+
34
+\item{feature_tbl}{optional, ancillary table with additional cluster features.  Must also be keyed by `cluster_idx`}
35
+}
36
+\value{
37
+list of tables.  The `cell_tbl` is keyed by the `cell_identifiers`, with fields "cluster_idx.1", "cluster_idx.2", etc, IDing the contigs present in each cell. "cluster_idx.1_fct" and "cluster_idx.2_fct" cast these fields to factors and are reordered to maximize the number of pairs along the diagonal. The `idx1_tbl` and `idx2_tbl` report information (passed in about the `cluster_idx` by `feature_tbl`.)  The `cluster_pair_tbl` reports all pairings found of contigs, and the number of times observed.
38
+}
39
+\description{
40
+A contingency table of every combination of `cluster_idx` up to `table_order` is generated.
41
+Combinations that are found in at least `min_expansion` number of cells are reported.  All cells that have these combinations are returned, as well as cells that only have `orphan_level` of matching `cluster_idx`.
42
+}
43
+\details{
44
+For example, if `table_order=2` and `min_expansion=2` then heavy/light or alpha/beta pairs found two or more times will be returned (as well as alpha-alpha pairs, etc, if those are present).
45
+If `orphan_level=1` then all cells that share just a single chain with an expanded clone will be returned.
46
+
47
+The `cluster_idx.1_fct` and `cluster_idx.2_fct` fields in `cell_tbl`, `idx1_tbl`, `idx2_tbl` are cast to factors and ordered such that pairings will tend to occur along the diagonal when they are cross-tabulated.
48
+This facilitates plotting.
49
+}
50
+\section{Caveats and warnings}{
51
+
52
+ The cell_idx -> cluster_idx map is generally one-to-many, and is resolved by `canonicalize_fun`.  For `table_order>1`, few collisions are expected as most cells will contain no more than 2 clusters.  Any collisions are resolved by returning the most prevalent cluster, across samples.  When two clusters are tied for most prevalent within a cell, the `cluster_idx` returned is arbitrary. Therefore, when `table_order=1`, it is strongly recommended to subset the `cluster_tbl` to just a single chain.
53
+}
54
+
55
+\examples{
56
+library(dplyr)
57
+cluster_tbl = data_frame(clust_idx = gl(3, 2), cell_idx = rep(1:3, times = 2))
58
+# no pairs found twice
59
+pt1 = pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx', canonicalize_by_prevalence)
60
+# all pairs found, found once.
61
+pt2 = pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx',
62
+    canonicalize_by_prevalence, min_expansion = 1)
63
+pt2$cell_tbl
64
+cluster_tbl2 = bind_rows(cluster_tbl, cluster_tbl \%>\% mutate(cell_idx = rep(4:6, times = 2)))
65
+#all pairs found twice
66
+pt3 = pairing_tables(cluster_tbl2, 'cell_idx', 'clust_idx', canonicalize_by_prevalence, min_expansion = 1)
67
+pt3$cell_tbl
68
+# canonicalize_by_chain by default expects fields umis, reads to break ties, could wrap the function to change this
69
+cluster_tbl3 = cluster_tbl2 \%>\%
70
+    mutate(umis = 1, reads = 1, chain = rep(c('TRA', 'TRB'), times = 6))
71
+pt4 = pairing_tables(cluster_tbl3, 'cell_idx', 'clust_idx',
72
+    canonicalize_by_chain, min_expansion = 1, table_order = 2)
73
+}
74
+\seealso{
75
+canonicalize_by_prevalence, canonicalize_by_chain
76
+}