Provide a chain-canonicalization method which is now used by default in pairing_tables
Some doc updates, too.
... | ... |
@@ -1,70 +1,20 @@ |
1 |
-#' For each cell, return a single, canonical chain-cluster |
|
2 |
-#' |
|
3 |
-#' In single cell data, multiple chains (heavy-light or alpha-beta) are expected. In some cases, there could be more than two (eg multiple alpha alleles for T cells). |
|
4 |
-#' This picks a cluster id for each cell based on the overall prevalence of cluster ids over all cells in `tbl`. |
|
5 |
-#' If order = 1 then the canonical chain-cluster will be the most prevalent, and if order = 2, it will be the 2nd most prevalent, and so on. Ties are broken arbitrarily (possibly by lexicographic order of `cluster_idx`). |
|
6 |
-#' @param tbl `data.frame` containing columns specified in `cell_identifiers` and `cluster_idx` |
|
7 |
-#' @param cell_identifiers `character` vector specifying columns in `tbl` that identify a cell |
|
8 |
-#' @param cluster_idx `character` specifying the column in `tbl` that identifies a |
|
9 |
-#' @param order return the 1st, 2nd, 3rd, etc, most common chain-cluster |
|
10 |
-#' |
|
11 |
-#' |
|
12 |
-#' @return `data.frame` with columns from `cell_identifiers` and a single `cluster_idx` for each cell |
|
13 |
-#' @export |
|
14 |
-get_canonical_chain = function(tbl, cell_identifiers = 'barcode', cluster_idx = 'cluster_idx', order = 1){ |
|
15 |
- prevalence = tbl %>% group_by(!!sym(cluster_idx)) %>% summarize(prev = n()) |
|
16 |
- tbl_prevalence = left_join(tbl, prevalence, by = cluster_idx) |
|
17 |
- tbl_order = tbl_prevalence %>% group_by(!!!syms(cell_identifiers)) %>% summarize(!!cluster_idx := dplyr::nth(!!sym(cluster_idx), -order, prev)) |
|
18 |
- ungroup(tbl_order) |
|
1 |
+cluster_germline = function(tbl, segment_identifiers = c('v_gene', 'j_gene', 'chain')){ |
|
2 |
+ seg_types = tbl %>% group_by(!!!syms(segment_identifiers)) %>% summarize() %>% ungroup() %>% mutate(germline_idx = seq_len(nrow(.))) |
|
3 |
+ tbl %>% left_join(seg_types, by = segment_identifiers) |
|
19 | 4 |
} |
20 | 5 |
|
21 |
-# Define canonical chain types per cell |
|
22 |
- |
|
23 |
-#' Given a family of similar sequences, return a "representative" |
|
24 |
-#' |
|
25 |
-#' @param seqs character vector |
|
26 |
-#' @param medoid_idx optional index into seqs |
|
27 |
-#' @param warn_if_distinct Should a warning be emitted if there are distinct elements in seqs? |
|
28 |
-#' |
|
29 |
-#' If medoid_idx is supplied, the medoid sequence is returned, otherwise the longest |
|
30 |
-#' sequence is returned |
|
31 |
-#' |
|
32 |
-#' @return character vector |
|
33 |
-#' @export |
|
34 |
-#' |
|
35 |
-#' @examples |
|
36 |
-#' get_canonical_representative(c('apple', 'manzana', 'pomme')) |
|
37 |
-get_canonical_representative = function(seqs, medoid_idx, warn_if_distinct = FALSE){ |
|
38 |
- if(!missing(medoid_idx)){ |
|
39 |
- if(!is.integer(medoid_idx) || medoid_idx < 1 || medoid_idx > length(seqs)) stop("Illegal `medoid_idx`") |
|
40 |
- ii = medoid_idx |
|
41 |
- } else{ |
|
42 |
- len = str_length(seqs) |
|
43 |
- ii = which.max(len) |
|
44 |
- } |
|
45 |
- rep = seqs[ii] |
|
46 |
- |
|
47 |
- if(warn_if_distinct && length(nomatch <- which(seqs != rep)) > 0) warning("At indices ", nomatch, " sequences did not match the representative") |
|
48 |
- |
|
49 |
- return(rep) |
|
50 |
-} |
|
51 |
- |
|
52 |
- |
|
53 |
- |
|
54 |
- |
|
55 |
- |
|
56 | 6 |
fine_cluster_by = function(seqs, by, max_dist = NULL, ...){ |
57 | 7 |
seq_list = split(seqs, by) |
58 | 8 |
fc_list = lapply(seq_list, fine_cluster, ...) |
59 | 9 |
if(is.null(max_dist)){ |
60 |
- max_max = max(map_dbl(fc_list, 'max_dist')) |
|
10 |
+ max_max = max(purrr::map_dbl(fc_list, 'max_dist')) |
|
61 | 11 |
} else { |
62 | 12 |
max_max = max_dist |
63 | 13 |
} |
64 |
- affinities = map(fc_list, ~ max_max - .[['distance']]) |
|
14 |
+ affinities = purrr::map(fc_list, ~ max_max - .[['distance']]) |
|
65 | 15 |
aff_matrix = Matrix::.bdiag(affinities) |
66 |
- homologies = unlist(map(fc_list, 'homology'), use.names = FALSE) |
|
67 |
- medoid = map_dbl(fc_list, 'medoid') |
|
16 |
+ homologies = unlist(purrr::map(fc_list, 'homology'), use.names = FALSE) |
|
17 |
+ medoid = purrr::map_dbl(fc_list, 'medoid') |
|
68 | 18 |
list(affinities, aff_matrix, homologies, medoid) |
69 | 19 |
} |
70 | 20 |
|
... | ... |
@@ -72,12 +22,15 @@ fine_cluster_by = function(seqs, by, max_dist = NULL, ...){ |
72 | 22 |
#' |
73 | 23 |
#' The distances between AA sequences is defined to be 1-score/max(score) times the median length of the input sequences. |
74 | 24 |
#' The distances between nucleotide sequences is defined to be edit_distance/max(edit_distance) times the median length of input sequences. |
25 |
+#' |
|
75 | 26 |
#' @param seqs character vector, DNAStringSet or AAStringSet |
76 | 27 |
#' @param type character either `AA` or `DNA` specifying type of `seqs` |
77 | 28 |
#' @param big_memory_brute attempt to cluster more than 4000 sequences? Clustering is quadratic, so this will take a long time and might exhaust memory |
78 | 29 |
#' @param method one of 'substitutionMatrix' or 'levenshtein' |
79 | 30 |
#' @param substitution_matrix a character vector naming a substition matrix available in Biostrings, or a substitution matrix itself |
31 |
+#' @param cluster_fun `character`, one of "hclust" or "none", determining if distance matrices should also be clustered with `hclust` |
|
80 | 32 |
#' @param cluster_method character passed to `hclust` |
33 |
+#' |
|
81 | 34 |
#' @seealso hclust, stringDist |
82 | 35 |
#' @return dendrogram of class `hclust` |
83 | 36 |
#' @export |
... | ... |
@@ -87,10 +40,10 @@ fine_cluster_by = function(seqs, by, max_dist = NULL, ...){ |
87 | 40 |
#' aaseq = Biostrings::readAAStringSet(fasta_path)[1:100] |
88 | 41 |
#' cls = fine_cluster(aaseq) |
89 | 42 |
#' plot(cls$cluster) |
90 |
-fine_cluster = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'levenshtein', substitution_matrix = 'BLOSUM100', cluster = 'hclust', cluster_method = 'complete'){ |
|
43 |
+fine_cluster = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'levenshtein', substitution_matrix = 'BLOSUM100', cluster_fun = 'hclust', cluster_method = 'complete'){ |
|
91 | 44 |
if(length(seqs) > 4000 & !big_memory_brute) stop("Won't try to cluster ", length(seqs), " sequences unless `big_memory_brute` = TRUE. (Make sure you actually have lots of memory!)") |
92 | 45 |
type = match.arg(type, choices = c('AA', 'DNA')) |
93 |
- cluster = match.arg(cluster, c('hclust', 'none')) |
|
46 |
+ cluster_fun = match.arg(cluster_fun, c('hclust', 'none')) |
|
94 | 47 |
method = match.arg(method, c('substitutionMatrix', 'levenshtein')) |
95 | 48 |
|
96 | 49 |
if(is.character(seqs)){ |
... | ... |
@@ -122,7 +75,7 @@ fine_cluster = function(seqs, type = 'AA', big_memory_brute = FALSE, method = 'l |
122 | 75 |
#if(length(ss)>1) sd = sd/max(sd)*median(Biostrings::nchar(ss)) |
123 | 76 |
} |
124 | 77 |
|
125 |
- if(length(seqs)>1 && cluster != 'none'){ |
|
78 |
+ if(length(seqs)>1 && cluster_fun != 'none'){ |
|
126 | 79 |
hc = stats::hclust(as.dist(sd), method = cluster_method) |
127 | 80 |
hc$labels = names(ss) |
128 | 81 |
} else{ |
... | ... |
@@ -169,6 +122,7 @@ entropy = function(v, pseudo_count = length(v)/1000, na.action = na.fail){ |
169 | 122 |
|
170 | 123 |
#' @export |
171 | 124 |
#' @describeIn entropy The number of categories exceeding `p` proportion of the total |
125 |
+#' @param p proportion threshold |
|
172 | 126 |
np = function(v, p = .05, pseudo_count = p/5, na.action = na.fail){ |
173 | 127 |
v = na.action(v) |
174 | 128 |
if(length(v) == 0) return(0) |
175 | 129 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,209 @@ |
1 |
+#' For each cell, return a single, canonical chain-cluster |
|
2 |
+#' |
|
3 |
+#' In single cell data, multiple chains (heavy-light or alpha-beta) are expected. In some cases, there could be more than two (eg multiple alpha alleles for T cells). |
|
4 |
+#' This picks a cluster id for each cell based on the overall prevalence of cluster ids over all cells in `tbl`. |
|
5 |
+#' If order = 1 then the canonical chain-cluster will be the most prevalent, and if order = 2, it will be the 2nd most prevalent, and so on. Ties are broken arbitrarily (possibly by lexicographic order of `cluster_idx`). |
|
6 |
+#' @param tbl `data.frame` containing columns specified in `cell_identifiers`, `cluster_idx` and optionally `chain_identifiers` |
|
7 |
+#' @param cell_identifiers `character` vector specifying columns in `tbl` that identify a cell |
|
8 |
+#' @param cluster_idx `character` specifying the column in `tbl` that identifies a clsuter |
|
9 |
+#' @param order return the 1st, 2nd, 3rd, etc, most common chain-cluster |
|
10 |
+#' |
|
11 |
+#' @return `data.frame` with columns from `cell_identifiers` and a single `cluster_idx` for each cell |
|
12 |
+#' @export |
|
13 |
+canonicalize_by_prevalence = function(tbl, cell_identifiers = 'barcode', cluster_idx = 'cluster_idx', order = 1){ |
|
14 |
+ prevalence = tbl %>% group_by(!!sym(cluster_idx)) %>% summarize(prev = dplyr::n()) |
|
15 |
+ tbl_prevalence = left_join(tbl, prevalence, by = cluster_idx) |
|
16 |
+ tbl_order = tbl_prevalence %>% group_by(!!!syms(cell_identifiers)) %>% summarize(!!cluster_idx := dplyr::nth(!!sym(cluster_idx), -order, prev)) |
|
17 |
+ ungroup(tbl_order) |
|
18 |
+} |
|
19 |
+ |
|
20 |
+ |
|
21 |
+ |
|
22 |
+ |
|
23 |
+#' @param sort_factors `character` vector naming columns in `tbl` to sorted on, within `cell_identifier`. Sorted by first element first, then ties broken by subsequent elements. Sorted in decreasing order for each element. |
|
24 |
+#' @param chain_levels an optional `character` vector providing the sort order of the `chain` column in `tbl`. Set to length zero to disable. |
|
25 |
+#' @export |
|
26 |
+#' @describeIn canonicalize_by_prevalence return a canonical contig by chain type, with TRB/IGH returned first. By default, ties are broken by umis and reads. |
|
27 |
+canonicalize_by_chain = function(tbl, cell_identifiers = 'barcode', sort_factors = c('chain', 'umis', 'reads'), cluster_idx = 'cluster_idx', order = 1, chain_levels = c('IGL', 'IGK', 'TRA', 'TRB', 'IGH')){ |
|
28 |
+ #ochain = tbl[[sort_factors[1]]] |
|
29 |
+ if(length(chain_levels) > 0){ |
|
30 |
+ if(!('chain' %in% names(tbl))) stop('`tbl` must contain a `chain` column to set `chain_levels`') |
|
31 |
+ tbl = tbl %>% mutate(chain = factor(chain, levels = chain_levels, ordered = TRUE)) |
|
32 |
+ } |
|
33 |
+ arranging = purrr::map(sort_factors, ~ rlang::quo(desc(!!sym(.x)))) |
|
34 |
+ tbl %>% group_by(!!!syms(cell_identifiers)) %>% dplyr::arrange(!!!arranging) %>% mutate(rank = seq_along(!!sym(sort_factors[[1]]))) %>% dplyr::filter(rank == order) %>% dplyr::select(-rank) |
|
35 |
+ |
|
36 |
+} |
|
37 |
+ |
|
38 |
+# Define canonical chain types per cell |
|
39 |
+ |
|
40 |
+#' Given a family of similar sequences, return a "representative" |
|
41 |
+#' |
|
42 |
+#' @param seqs character vector |
|
43 |
+#' @param medoid_idx optional index into seqs |
|
44 |
+#' @param warn_if_distinct Should a warning be emitted if there are distinct elements in seqs? |
|
45 |
+#' |
|
46 |
+#' If medoid_idx is supplied, the medoid sequence is returned, otherwise the longest |
|
47 |
+#' sequence is returned |
|
48 |
+#' |
|
49 |
+#' @return character vector |
|
50 |
+#' @export |
|
51 |
+#' |
|
52 |
+#' @examples |
|
53 |
+#' get_canonical_representative(c('apple', 'manzana', 'pomme')) |
|
54 |
+get_canonical_representative = function(seqs, medoid_idx, warn_if_distinct = FALSE){ |
|
55 |
+ if(!missing(medoid_idx)){ |
|
56 |
+ if(!is.integer(medoid_idx) || medoid_idx < 1 || medoid_idx > length(seqs)) stop("Illegal `medoid_idx`") |
|
57 |
+ ii = medoid_idx |
|
58 |
+ } else{ |
|
59 |
+ len = str_length(seqs) |
|
60 |
+ ii = which.max(len) |
|
61 |
+ } |
|
62 |
+ rep = seqs[ii] |
|
63 |
+ |
|
64 |
+ if(warn_if_distinct && length(nomatch <- which(seqs != rep)) > 0) warning("At indices ", nomatch, " sequences did not match the representative") |
|
65 |
+ |
|
66 |
+ return(rep) |
|
67 |
+} |
|
68 |
+ |
|
69 |
+#' Generate a list of tables representing clusters paired in cells |
|
70 |
+#' |
|
71 |
+#' A contingency table of every combination of `cluster_idx` up to `table_order` is generated. |
|
72 |
+#' Combinations that are found in at least `min_expansion` number of cells are reported. All cells that have these combinations are returned, as well as cells that only have `orphan_level` of matching `cluster_idx`. |
|
73 |
+#' |
|
74 |
+#' For example, if `table_order=2` and `min_expansion=2` then heavy/light or alpha/beta pairs found two or more times will be returned (as well as alpha-alpha pairs, etc, if those are present). |
|
75 |
+#' If `orphan_level=1` then all cells that share just a single chain with an expanded clone will be returned. |
|
76 |
+#' |
|
77 |
+#' The `cluster_idx.1_fct` and `cluster_idx.2_fct` fields in `cell_tbl`, `idx1_tbl`, `idx2_tbl` are cast to factors and ordered such that pairings will tend to occur along the diagonal when they are cross-tabulated. |
|
78 |
+#' This facilitates plotting. |
|
79 |
+#' |
|
80 |
+#' @section Caveats and warnings: |
|
81 |
+#' The cell_idx -> cluster_idx map is generally one-to-many, and is resolved by `canonicalize_fun`. For `table_order>1`, few collisions are expected as most cells will contain no more than 2 clusters. Any collisions are resolved by returning the most prevalent cluster, across samples. When two clusters are tied for most prevalent within a cell, the `cluster_idx` returned is arbitrary. Therefore, when `table_order=1`, it is strongly recommended to subset the `cluster_tbl` to just a single chain. |
|
82 |
+#' |
|
83 |
+#' @param cluster_tbl a table with all combinations of clusters in all cells |
|
84 |
+#' @param cell_identifiers character vector naming fields that key a cell |
|
85 |
+#' @param cluster_idx character naming a single field IDing the clusters |
|
86 |
+#' @param min_expansion the minimal number of times a pairing needs to occur for it to be reported |
|
87 |
+#' @param cell_tbl optional, ancillary table with additional cell features. Must also be keyed by `cell_identifiers` |
|
88 |
+#' @param feature_tbl optional, ancillary table with additional cluster features. Must also be keyed by `cluster_idx` |
|
89 |
+#' @param table_order Integer larger than 1. What order of cluster_idx will be paired, eg, order = 2 means that the most common and second most common cluster_idx will be sought for each cell |
|
90 |
+#' @param orphan_level Integer larger than 0 and less than or equal to `table_order`. Given that at least `min_expansion` cells are found that have `table_order` chains identical, how many `cluster_idx` pairs will we match on to select other cells. Example: `ophan_level=1` means that cells that share just a single chain with the |
|
91 |
+#' @param cluster_whitelist a table of "cluster_idx" that should always be reported. In contrast to the `cluster_tbl`, here the clusters must be named "cluster_idx.1", "cluster_idx.2" (if order-2 pairs are being selected). |
|
92 |
+#' @param cluster_blacklist a table of "cluster_idx" that will never be reported. Must be named as per `cluster_whitelist`. |
|
93 |
+#' @param canonicalize_fun a function with signature `canonicalize_fun(cluster_tbl, cell_identifiers, cluster_idx, order = i)` that for each `cell_identifier` returns a single contig that depends on the `order`. For instance \link{canonicalize_by_prevalence} or \link{canonicalize_by_chain}. |
|
94 |
+#' |
|
95 |
+#' @return list of tables. The `cell_tbl` is keyed by the `cell_identifiers`, with fields "cluster_idx.1", "cluster_idx.2", etc, IDing the contigs present in each cell. "cluster_idx.1_fct" and "cluster_idx.2_fct" cast these fields to factors and are reordered to maximize the number of pairs along the diagonal. The `idx1_tbl` and `idx2_tbl` report information (passed in about the `cluster_idx` by `feature_tbl`.) The `cluster_pair_tbl` reports all pairings found of contigs, and the number of times observed. |
|
96 |
+#' @export |
|
97 |
+#' |
|
98 |
+#' @seealso canonicalize_by_prevalence, canonicalize_by_chain |
|
99 |
+#' @importFrom tibble as_data_frame |
|
100 |
+#' @importFrom dplyr bind_rows left_join ungroup summarize anti_join |
|
101 |
+#' @importFrom stringr str_length str_c |
|
102 |
+#' @importFrom rlang sym syms := |
|
103 |
+#' @examples |
|
104 |
+#' library(dplyr) |
|
105 |
+#' cluster_tbl = data_frame(clust_idx = gl(3, 2), cell_idx = rep(1:3, times = 2)) |
|
106 |
+#' # no pairs found twice |
|
107 |
+#' pt1 = pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx', canonicalize_by_prevalence) |
|
108 |
+#' # all pairs found, found once. |
|
109 |
+#' pt2 = pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx', |
|
110 |
+#' canonicalize_by_prevalence, min_expansion = 1) |
|
111 |
+#' pt2$cell_tbl |
|
112 |
+#' cluster_tbl2 = bind_rows(cluster_tbl, cluster_tbl %>% mutate(cell_idx = rep(4:6, times = 2))) |
|
113 |
+#' #all pairs found twice |
|
114 |
+#' pt3 = pairing_tables(cluster_tbl2, 'cell_idx', 'clust_idx', canonicalize_by_prevalence, min_expansion = 1) |
|
115 |
+#' pt3$cell_tbl |
|
116 |
+#' # `canonicalize_by_chain` expects fields `umis`, `reads` |
|
117 |
+#' # to break ties, wrap the function to change this |
|
118 |
+#' cluster_tbl3 = cluster_tbl2 %>% |
|
119 |
+#' mutate(umis = 1, reads = 1, chain = rep(c('TRA', 'TRB'), times = 6)) |
|
120 |
+#' pt4 = pairing_tables(cluster_tbl3, 'cell_idx', 'clust_idx', |
|
121 |
+#' canonicalize_by_chain, min_expansion = 1, table_order = 2) |
|
122 |
+pairing_tables = function(cluster_tbl, cell_identifiers = 'barcode', cluster_idx = 'cluster_idx', canonicalize_fun = canonicalize_by_chain, table_order = 2, min_expansion = 2, orphan_level = 1, cluster_whitelist = NULL, cluster_blacklist = NULL, cell_tbl = NULL, feature_tbl = NULL ){ |
|
123 |
+ |
|
124 |
+ if(orphan_level > table_order) stop('`ophan_level` must be less than or equal to `table_order`') |
|
125 |
+ if(table_order < 1) stop('Table order must be at least 1') |
|
126 |
+ |
|
127 |
+ # get `table_order` most common clusters for each cell |
|
128 |
+ # forcibly rename cluster_idx -> "cluster_idx" |
|
129 |
+ bar_chain_tbls = purrr::map(seq_len(table_order), function(i) canonicalize_fun(cluster_tbl, cell_identifiers = cell_identifiers, cluster_idx = cluster_idx, order = i) %>% dplyr::select(!!!c(syms(cell_identifiers), rlang::quo(cluster_idx))) %>% dplyr::rename( !!paste0('cluster_idx.', i) := !!cluster_idx)) |
|
130 |
+ # for each cell, what clusters are present |
|
131 |
+ oligo_cluster_pairs = purrr::reduce(bar_chain_tbls, left_join, by = cell_identifiers) |
|
132 |
+ |
|
133 |
+ # Set up cluster_ids for indexing into the pairing tables |
|
134 |
+ cluster_ids = str_c('cluster_idx.', seq_len(table_order)) |
|
135 |
+ cluster_ids_to_select = cluster_ids[seq_len(orphan_level)] |
|
136 |
+ |
|
137 |
+ # In how many cells do each cluster pairing appear? |
|
138 |
+ cluster_pair_tbl = oligo_cluster_pairs %>% group_by(!!!syms(cluster_ids)) %>% summarize(n_clone_pairs = dplyr::n()) |
|
139 |
+ # which clusters are expanded |
|
140 |
+ expanded_cluster = cluster_pair_tbl %>% dplyr::filter(n_clone_pairs >= min_expansion) %>% dplyr::filter_at(.vars = cluster_ids, .vars_predicate = all_vars(!is.na(.))) |
|
141 |
+ expanded_cluster = ungroup(expanded_cluster) %>% dplyr::select(!!!syms(cluster_ids_to_select), max_pairs = n_clone_pairs) |
|
142 |
+ if(!is.null(cluster_whitelist)){ |
|
143 |
+ expanded_cluster = bind_rows(expanded_cluster, cluster_whitelist) |
|
144 |
+ } |
|
145 |
+ if(!is.null(cluster_blacklist)) expanded_cluster = anti_join(expanded_cluster, cluster_blacklist) |
|
146 |
+ |
|
147 |
+ # Could have duplicated cluster_ids after binding to the whitelist or from considering orphans |
|
148 |
+ expanded_cluster = expanded_cluster[!duplicated(expanded_cluster %>% dplyr::select(-max_pairs)),] |
|
149 |
+ expanded_c1 = oligo_cluster_pairs %>% dplyr::inner_join(expanded_cluster, by = cluster_ids_to_select) |
|
150 |
+ if(anyDuplicated(expanded_c1 %>% dplyr::select(!!!syms(cell_identifiers)))) stop("Ruhoh, duplicated cell identifiers, this is a bug!") |
|
151 |
+ |
|
152 |
+ # cross-tab pairings in order to figure out how to order the cluster_idx to put most common pairing on diagonal |
|
153 |
+ if(nrow(expanded_c1)>0 && table_order > 1){ |
|
154 |
+ expanded_counts = reshape2::dcast(expanded_c1, cluster_idx.1 ~ cluster_idx.2, fun.aggregate = length, value.var = 'max_pairs') |
|
155 |
+ mat = expanded_counts[,-1] |
|
156 |
+ #rownames(mat) = rowid[['cluster_idx']] |
|
157 |
+ ro = hclust(dist(mat))$order |
|
158 |
+ co = hclust(dist(t(mat)))$order |
|
159 |
+ } else if (nrow(expanded_c1) == 0) { |
|
160 |
+ warning('No pairs found') |
|
161 |
+ return() |
|
162 |
+ } else{ |
|
163 |
+ expanded_counts = reshape2::dcast(expanded_c1, cluster_idx.1 ~ ., fun.aggregate = length, value.var = 'max_pairs') |
|
164 |
+ ro = order(expanded_counts[[2]]) |
|
165 |
+ co = NA_integer_ |
|
166 |
+ } |
|
167 |
+ |
|
168 |
+ ci_class = class(cluster_tbl[[cluster_idx]]) |
|
169 |
+ as_method = if(ci_class == 'factor') as.factor else function(x) as(x, ci_class) |
|
170 |
+ rowid = data_frame(cluster_idx = expanded_counts$cluster_idx.1 %>% as_method, plot_order = ro) |
|
171 |
+ colid = suppressWarnings(data_frame(cluster_idx = colnames(expanded_counts)[-1] %>% as_method, plot_order = co)) |
|
172 |
+ rowid[['cluster_idx.1_fct']] = factor(rowid[['cluster_idx']], levels = rowid[['cluster_idx']][ro]) |
|
173 |
+ colid[['cluster_idx.2_fct']] = factor(colid[['cluster_idx']], levels = colid[['cluster_idx']][co]) |
|
174 |
+ |
|
175 |
+ |
|
176 |
+ # also fix levels of this tbl |
|
177 |
+ expanded_c1 = expanded_c1 %>% mutate(cluster_idx.1_fct = factor(cluster_idx.1, levels = levels(rowid[['cluster_idx.1_fct']]))) |
|
178 |
+ if(table_order>1) expanded_c1 = expanded_c1 %>% mutate(cluster_idx.2_fct = factor(cluster_idx.2, levels = levels(colid[['cluster_idx.2_fct']]))) |
|
179 |
+ |
|
180 |
+ if(!is.null(cell_tbl)){ |
|
181 |
+ if(anyDuplicated(cell_tbl %>% select(!!!syms(cell_identifiers)))) stop('`cell_tbl` must not have duplicate `cell_identifiers`.') |
|
182 |
+ expanded_c1 = left_join(expanded_c1, cell_tbl, by = cell_identifiers) |
|
183 |
+ } |
|
184 |
+ |
|
185 |
+ idx1_tbl = rowid %>% dplyr::rename(!!cluster_idx := cluster_idx) |
|
186 |
+ idx2_tbl = colid %>% dplyr::rename(!!cluster_idx := cluster_idx) |
|
187 |
+ if(!is.null(feature_tbl)){ |
|
188 |
+ if(anyDuplicated(feature_tbl %>% select(!!sym(cluster_idx)))) stop('`feature_tbl` must not have duplicate `cluster_idx`.') |
|
189 |
+ idx1_tbl = left_join(idx1_tbl, feature_tbl, by = cluster_idx) |
|
190 |
+ idx2_tbl = left_join(idx2_tbl, feature_tbl, by = cluster_idx) |
|
191 |
+ } |
|
192 |
+ |
|
193 |
+ list(cell_tbl = expanded_c1, idx1_tbl = idx1_tbl, idx2_tbl = idx2_tbl, cluster_pair_tbl = cluster_pair_tbl) |
|
194 |
+ |
|
195 |
+} |
|
196 |
+ |
|
197 |
+plot_pairing = function(pairing_list, color_labels_by){ |
|
198 |
+ pl = pairing_list |
|
199 |
+ pairs_plt = ggplot(pairing_list$cell_tbl, aes(x = cluster_idx.1_fct, y = cluster_idx.2_fct, color = sample, shape = pop)) + geom_jitter(width = .3, height = .3) |
|
200 |
+ |
|
201 |
+ ylab = data_frame(!!color_labels_by := ggplot_build(pairs_plt)$layout$panel_params[[1]]$y.label) %>% left_join(feature_tbl) %>% mutate(class_color = ifelse(is.na(class_color), '#E41A1C', class_color)) |
|
202 |
+ |
|
203 |
+ xlab = data_frame(!!color_labels_by := ggplot_build(pairs_plt)$layout$panel_params[[1]]$x.label) %>% left_join(feature_tbl) %>% mutate(class_color = ifelse(is.na(class_color), '#E41A1C', class_color)) |
|
204 |
+ |
|
205 |
+ pairs_plt = pairs_plt + theme(axis.text.x = element_text(angle = 90, color = xlab$class_color, size = 8), axis.text.y = element_text(color = ylab$class_color, size = 8)) |
|
206 |
+ |
|
207 |
+ |
|
208 |
+} |
|
209 |
+ |
... | ... |
@@ -12,9 +12,11 @@ |
12 | 12 |
#' |
13 | 13 |
#' @examples |
14 | 14 |
#' library(dplyr) |
15 |
-#' contig_anno_path = system.file('extdata', 'cellranger_contig_annotation.csv', package = 'CellaRepertorium') |
|
15 |
+#' contig_anno_path = system.file('extdata', 'cellranger_contig_annotation.csv', |
|
16 |
+#' package = 'CellaRepertorium') |
|
16 | 17 |
#' contig_anno = readr::read_csv(contig_anno_path) |
17 |
-#' contig_anno = contig_anno %>% mutate(fancy_name = fancy_name_contigs(., prefix = paste(sample, pop, sep = '_'))) |
|
18 |
+#' contig_anno = contig_anno %>% mutate(fancy_name = |
|
19 |
+#' fancy_name_contigs(., prefix = paste(sample, pop, sep = '_'))) |
|
18 | 20 |
#' stopifnot(!any(duplicated(contig_anno$fancy_name))) |
19 | 21 |
fancy_name_contigs = function(contig_frame, prefix){ |
20 | 22 |
notin = setdiff(c('chain', 'v_gene', 'd_gene', 'j_gene'), names(contig_frame)) |
... | ... |
@@ -31,138 +33,6 @@ variable_genes = function(ref = '10X'){ |
31 | 33 |
} |
32 | 34 |
|
33 | 35 |
|
34 |
-#' Generate a list of tables representing clusters paired in cells |
|
35 |
-#' |
|
36 |
-#' A contingency table of every combination of `cluster_idx` up to `table_order` is generated. |
|
37 |
-#' Combinations that are found in at least `min_expansion` number of cells are reported. All cells that have these combinations are returned, as well as cells that only have `orphan_level` of matching `cluster_idx`. |
|
38 |
-#' |
|
39 |
-#' For example, if `table_order=2` and `min_expansion=2` then heavy/light or alpha/beta pairs found two or more times will be returned (as well as alpha-alpha pairs, etc, if those are present). |
|
40 |
-#' If `orphan_level=1` then all cells that share just a single chain with an expanded clone will be returned. |
|
41 |
-#' |
|
42 |
-#' The `cluster_idx.1_fct` and `cluster_idx.2_fct` fields in `cell_tbl`, `idx1_tbl`, `idx2_tbl` are cast to factors and ordered such that pairings will tend to occur along the diagonal when they are cross-tabulated. |
|
43 |
-#' This facilitates plotting. |
|
44 |
-#' |
|
45 |
-#' @section Caveats and warnings: |
|
46 |
-#' The cell_idx -> cluster_idx map is generally one-to-many. For `table_order>1`, few collisions are expected as most cells will contain no more than 2 clusters. Any collisions are resolved by returning the most prevalent cluster, across samples. When two clusters are tied for most prevalent within a cell, the `cluster_idx` returned is arbitrary. Therefore, when `table_order=1`, it is strongly recommended to subset the `cluster_tbl` to just a single chain. |
|
47 |
-#' @param cluster_tbl a table with all combinations of clusters in all cells |
|
48 |
-#' @param cell_identifiers character vector naming fields that key a cell |
|
49 |
-#' @param cluster_idx character naming a single field IDing the clusters |
|
50 |
-#' @param min_expansion the minimal number of times a pairing needs to occur for it to be reported |
|
51 |
-#' @param cell_tbl optional, ancillary table with additional cell features. Must also be keyed by `cell_identifiers` |
|
52 |
-#' @param feature_tbl optional, ancillary table with additional cluster features. Must also be keyed by `cluster_idx` |
|
53 |
-#' @param table_order Integer larger than 1. What order of cluster_idx will be paired, eg, order = 2 means that the most common and second most common cluster_idx will be sought for each cell |
|
54 |
-#' @param orphan_level Integer larger than 0 and less than or equal to `table_order`. Given that at least `min_expansion` cells are found that have `table_order` chains identical, how many `cluster_idx` pairs will we match on to select other cells. Example: `ophan_level=1` means that cells that share just a single chain with the |
|
55 |
-#' @param cluster_whitelist a table of "cluster_idx" that should always be reported. In contrast to the `cluster_tbl`, here the clusters must be named "cluster_idx.1", "cluster_idx.2" (if order-2 pairs are being selected). |
|
56 |
-#' @param cluster_blacklist a table of "cluster_idx" that will never be reported. Must be named as per `cluster_whitelist`. |
|
57 |
-#' |
|
58 |
-#' @return list of tables. The `cell_tbl` is keyed by the `cell_identifiers`, with fields "cluster_idx.1", "cluster_idx.2", etc, IDing the contigs present in each cell. "cluster_idx.1_fct" and "cluster_idx.2_fct" cast these fields to factors and are reordered to maximize the number of pairs along the diagonal. The `idx1_tbl` and `idx2_tbl` report information (passed in about the `cluster_idx` by `feature_tbl`.) The `cluster_pair_tbl` reports all pairings found of contigs, and the number of times observed. |
|
59 |
-#' @export |
|
60 |
-#' |
|
61 |
-#' @seealso get_canonical_chain |
|
62 |
-#' @importFrom tibble as_data_frame |
|
63 |
-#' @importFrom dplyr bind_rows left_join ungroup summarize anti_join |
|
64 |
-#' @importFrom stringr str_length str_c |
|
65 |
-#' @importFrom rlang sym syms := |
|
66 |
-#' @examples |
|
67 |
-#' library(dplyr) |
|
68 |
-#' cluster_tbl = data_frame(clust_idx = gl(3, 2), cell_idx = rep(1:3, times = 2)) |
|
69 |
-#' # no pairs found twice |
|
70 |
-#' pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx') |
|
71 |
-#' # all pairs found, found once. |
|
72 |
-#' pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx', min_expansion = 1) |
|
73 |
-#' cluster_tbl2 = bind_rows(cluster_tbl, cluster_tbl %>% mutate(cell_idx = rep(4:6, times = 2))) |
|
74 |
-#' #all pairs found twice |
|
75 |
-#' pairing_tables(cluster_tbl2, 'cell_idx', 'clust_idx', min_expansion = 1) |
|
76 |
-#' # A warning if `table_order = 1` -- best to stratify by chain |
|
77 |
-#' # (could be accomplished by setting the `cell_identifiers` to include a field that identifies the chain). |
|
78 |
-#' pairing_tables(cluster_tbl2, 'cell_idx', 'clust_idx', min_expansion = 1, table_order = 1) |
|
79 |
-pairing_tables = function(cluster_tbl, cell_identifiers = 'barcode', cluster_idx = 'cluster_idx', table_order = 2, min_expansion = 2, orphan_level = 1, cluster_whitelist = NULL, cluster_blacklist = NULL, cell_tbl = NULL, feature_tbl = NULL ){ |
|
80 |
- |
|
81 |
- if(orphan_level > table_order) stop('`ophan_level` must be less than or equal to `table_order`') |
|
82 |
- if(table_order < 1) stop('Table order must be at least 1') |
|
83 |
- |
|
84 |
- # get `table_order` most common clusters for each cell |
|
85 |
- # forcibly rename cluster_idx -> "cluster_idx" |
|
86 |
- bar_chain_tbls = purrr::map(seq_len(table_order), function(i) get_canonical_chain(cluster_tbl, cell_identifiers = cell_identifiers, cluster_idx = cluster_idx, order = i) %>% dplyr::rename( !!paste0('cluster_idx.', i) := !!cluster_idx)) |
|
87 |
- # for each cell, what clusters are present |
|
88 |
- oligo_cluster_pairs = purrr::reduce(bar_chain_tbls, left_join, by = cell_identifiers) |
|
89 |
- |
|
90 |
- # Set up cluster_ids for indexing into the pairing tables |
|
91 |
- cluster_ids = str_c('cluster_idx.', seq_len(table_order)) |
|
92 |
- cluster_ids_to_select = cluster_ids[seq_len(orphan_level)] |
|
93 |
- |
|
94 |
- # In how many cells do each cluster pairing appear? |
|
95 |
- cluster_pair_tbl = oligo_cluster_pairs %>% group_by(!!!syms(cluster_ids)) %>% summarize(n_clone_pairs = n()) |
|
96 |
- # which clusters are expanded |
|
97 |
- expanded_cluster = cluster_pair_tbl %>% dplyr::filter(n_clone_pairs >= min_expansion) %>% dplyr::filter_at(.vars = cluster_ids_to_select, .vars_predicate = all_vars(!is.na(.))) |
|
98 |
- expanded_cluster = ungroup(expanded_cluster) %>% dplyr::select(!!!syms(cluster_ids_to_select), max_pairs = n_clone_pairs) |
|
99 |
- if(!is.null(cluster_whitelist)){ |
|
100 |
- expanded_cluster = bind_rows(expanded_cluster, cluster_whitelist) |
|
101 |
- } |
|
102 |
- if(!is.null(cluster_blacklist)) expanded_cluster = anti_join(expanded_cluster, cluster_blacklist) |
|
103 |
- |
|
104 |
- # Could have duplicated cluster_ids after binding to the whitelist or from considering orphans |
|
105 |
- expanded_cluster = expanded_cluster[!duplicated(expanded_cluster %>% dplyr::select(-max_pairs)),] |
|
106 |
- expanded_c1 = oligo_cluster_pairs %>% right_join(expanded_cluster, by = cluster_ids_to_select) |
|
107 |
- if(anyDuplicated(expanded_c1 %>% dplyr::select(!!!syms(cell_identifiers)))) stop("Ruhoh, duplicated cell identifiers, this is a bug!") |
|
108 |
- |
|
109 |
- # cross-tab pairings in order to figure out how to order the cluster_idx to put most common pairing on diagonal |
|
110 |
- if(nrow(expanded_c1)>0 && table_order > 1){ |
|
111 |
- expanded_counts = reshape2::dcast(expanded_c1, cluster_idx.1 ~ cluster_idx.2, fun.aggregate = length, value.var = 'max_pairs') |
|
112 |
- mat = expanded_counts[,-1] |
|
113 |
- #rownames(mat) = rowid[['cluster_idx']] |
|
114 |
- ro = hclust(dist(mat))$order |
|
115 |
- co = hclust(dist(t(mat)))$order |
|
116 |
- } else if (nrow(expanded_c1) == 0) { |
|
117 |
- warning('No pairs found') |
|
118 |
- return() |
|
119 |
- } else{ |
|
120 |
- expanded_counts = reshape2::dcast(expanded_c1, cluster_idx.1 ~ ., fun.aggregate = length, value.var = 'max_pairs') |
|
121 |
- ro = order(expanded_counts[[2]]) |
|
122 |
- co = NA_integer_ |
|
123 |
- } |
|
124 |
- |
|
125 |
- ci_class = class(cluster_tbl[[cluster_idx]]) |
|
126 |
- as_method = if(ci_class == 'factor') as.factor else function(x) as(x, ci_class) |
|
127 |
- rowid = data_frame(cluster_idx = expanded_counts$cluster_idx.1 %>% as_method, plot_order = ro) |
|
128 |
- colid = suppressWarnings(data_frame(cluster_idx = colnames(expanded_counts)[-1] %>% as_method, plot_order = co)) |
|
129 |
- rowid[['cluster_idx.1_fct']] = factor(rowid[['cluster_idx']], levels = rowid[['cluster_idx']][ro]) |
|
130 |
- colid[['cluster_idx.2_fct']] = factor(colid[['cluster_idx']], levels = colid[['cluster_idx']][co]) |
|
131 |
- |
|
132 |
- |
|
133 |
- # also fix levels of this tbl |
|
134 |
- expanded_c1 = expanded_c1 %>% mutate(cluster_idx.1_fct = factor(cluster_idx.1, levels = levels(rowid[['cluster_idx.1_fct']]))) |
|
135 |
- if(table_order>1) expanded_c1 = expanded_c1 %>% mutate(cluster_idx.2_fct = factor(cluster_idx.2, levels = levels(colid[['cluster_idx.2_fct']]))) |
|
136 |
- |
|
137 |
- if(!is.null(cell_tbl)){ |
|
138 |
- if(anyDuplicated(cell_tbl %>% select(!!!syms(cell_identifiers)))) stop('`cell_tbl` must not have duplicate `cell_identifiers`.') |
|
139 |
- expanded_c1 = left_join(expanded_c1, cell_tbl, by = cell_identifiers) |
|
140 |
- } |
|
141 |
- |
|
142 |
- idx1_tbl = rowid %>% dplyr::rename(!!cluster_idx := cluster_idx) |
|
143 |
- idx2_tbl = colid %>% dplyr::rename(!!cluster_idx := cluster_idx) |
|
144 |
- if(!is.null(feature_tbl)){ |
|
145 |
- if(anyDuplicated(feature_tbl %>% select(!!sym(cluster_idx)))) stop('`feature_tbl` must not have duplicate `cluster_idx`.') |
|
146 |
- idx1_tbl = left_join(idx1_tbl, feature_tbl, by = cluster_idx) |
|
147 |
- idx2_tbl = left_join(idx2_tbl, feature_tbl, by = cluster_idx) |
|
148 |
- } |
|
149 |
- |
|
150 |
- list(cell_tbl = expanded_c1, idx1_tbl = idx1_tbl, idx2_tbl = idx2_tbl, cluster_pair_tbl = cluster_pair_tbl) |
|
151 |
- |
|
152 |
-} |
|
153 |
- |
|
154 |
-plot_pairing = function(pairing_list, color_labels_by){ |
|
155 |
- pl = pairing_list |
|
156 |
- pairs_plt = ggplot(pairing_list$cell_tbl, aes(x = cluster_idx.1_fct, y = cluster_idx.2_fct, color = sample, shape = pop)) + geom_jitter(width = .3, height = .3) |
|
157 |
- |
|
158 |
- ylab = data_frame(!!color_labels_by := ggplot_build(pairs_plt)$layout$panel_params[[1]]$y.label) %>% left_join(feature_tbl) %>% mutate(class_color = ifelse(is.na(class_color), '#E41A1C', class_color)) |
|
159 |
- |
|
160 |
- xlab = data_frame(!!color_labels_by := ggplot_build(pairs_plt)$layout$panel_params[[1]]$x.label) %>% left_join(feature_tbl) %>% mutate(class_color = ifelse(is.na(class_color), '#E41A1C', class_color)) |
|
161 |
- |
|
162 |
- pairs_plt = pairs_plt + theme(axis.text.x = element_text(angle = 90, color = xlab$class_color, size = 8), axis.text.y = element_text(color = ylab$class_color, size = 8)) |
|
163 |
- |
|
164 |
- |
|
165 |
-} |
|
166 | 36 |
|
167 | 37 |
cleanup_annotations = function(json){ |
168 | 38 |
anno = json[['annotations']] |
0 | 5 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,41 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/pairing-methods.R |
|
3 |
+\name{canonicalize_by_prevalence} |
|
4 |
+\alias{canonicalize_by_prevalence} |
|
5 |
+\alias{canonicalize_by_chain} |
|
6 |
+\title{For each cell, return a single, canonical chain-cluster} |
|
7 |
+\usage{ |
|
8 |
+canonicalize_by_prevalence(tbl, cell_identifiers = "barcode", |
|
9 |
+ cluster_idx = "cluster_idx", order = 1) |
|
10 |
+ |
|
11 |
+canonicalize_by_chain(tbl, cell_identifiers = "barcode", |
|
12 |
+ sort_factors = c("chain", "umis", "reads"), |
|
13 |
+ cluster_idx = "cluster_idx", order = 1, chain_levels = c("IGL", |
|
14 |
+ "IGK", "TRA", "TRB", "IGH")) |
|
15 |
+} |
|
16 |
+\arguments{ |
|
17 |
+\item{tbl}{`data.frame` containing columns specified in `cell_identifiers`, `cluster_idx` and optionally `chain_identifiers`} |
|
18 |
+ |
|
19 |
+\item{cell_identifiers}{`character` vector specifying columns in `tbl` that identify a cell} |
|
20 |
+ |
|
21 |
+\item{cluster_idx}{`character` specifying the column in `tbl` that identifies a clsuter} |
|
22 |
+ |
|
23 |
+\item{order}{return the 1st, 2nd, 3rd, etc, most common chain-cluster} |
|
24 |
+ |
|
25 |
+\item{sort_factors}{`character` vector naming columns in `tbl` to sorted on, within `cell_identifier`. Sorted by first element first, then ties broken by subsequent elements. Sorted in decreasing order for each element.} |
|
26 |
+ |
|
27 |
+\item{chain_levels}{an optional `character` vector providing the sort order of the `chain` column in `tbl`. Set to length zero to disable.} |
|
28 |
+} |
|
29 |
+\value{ |
|
30 |
+`data.frame` with columns from `cell_identifiers` and a single `cluster_idx` for each cell |
|
31 |
+} |
|
32 |
+\description{ |
|
33 |
+In single cell data, multiple chains (heavy-light or alpha-beta) are expected. In some cases, there could be more than two (eg multiple alpha alleles for T cells). |
|
34 |
+This picks a cluster id for each cell based on the overall prevalence of cluster ids over all cells in `tbl`. |
|
35 |
+If order = 1 then the canonical chain-cluster will be the most prevalent, and if order = 2, it will be the 2nd most prevalent, and so on. Ties are broken arbitrarily (possibly by lexicographic order of `cluster_idx`). |
|
36 |
+} |
|
37 |
+\section{Functions}{ |
|
38 |
+\itemize{ |
|
39 |
+\item \code{canonicalize_by_chain}: return a canonical contig by chain type, with TRB/IGH returned first. By default, ties are broken by umis and reads. |
|
40 |
+}} |
|
41 |
+ |
... | ... |
@@ -34,7 +34,8 @@ For example, \code{clusters} could be a clonal ID, while \code{labels} is a subj |
34 | 34 |
\examples{ |
35 | 35 |
library(dplyr) |
36 | 36 |
purity = function(clusters, labels, covariates){ |
37 |
-n_label_cluster = data_frame(labels = labels, clusters = clusters) \%>\% group_by(clusters, labels) \%>\% summarize(n = n()) \%>\% ungroup() |
|
37 |
+n_label_cluster = data_frame(labels = labels, clusters = clusters) \%>\% |
|
38 |
+ group_by(clusters, labels) \%>\% summarize(n = n()) \%>\% ungroup() |
|
38 | 39 |
singletons = mean(n_label_cluster$n == 1) |
39 | 40 |
singletons |
40 | 41 |
} |
... | ... |
@@ -21,6 +21,7 @@ Generate a legible name for a series of contigs |
21 | 21 |
library(dplyr) |
22 | 22 |
contig_anno_path = system.file('extdata', 'cellranger_contig_annotation.csv', package = 'CellaRepertorium') |
23 | 23 |
contig_anno = readr::read_csv(contig_anno_path) |
24 |
-contig_anno = contig_anno \%>\% mutate(fancy_name = fancy_name_contigs(., prefix = paste(sample, pop, sep = '_'))) |
|
24 |
+contig_anno = contig_anno \%>\% mutate(fancy_name = |
|
25 |
+ fancy_name_contigs(., prefix = paste(sample, pop, sep = '_'))) |
|
25 | 26 |
stopifnot(!any(duplicated(contig_anno$fancy_name))) |
26 | 27 |
} |
27 | 28 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,27 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/pairing-methods.R |
|
3 |
+\name{get_canonical_representative} |
|
4 |
+\alias{get_canonical_representative} |
|
5 |
+\title{Given a family of similar sequences, return a "representative"} |
|
6 |
+\usage{ |
|
7 |
+get_canonical_representative(seqs, medoid_idx, warn_if_distinct = FALSE) |
|
8 |
+} |
|
9 |
+\arguments{ |
|
10 |
+\item{seqs}{character vector} |
|
11 |
+ |
|
12 |
+\item{medoid_idx}{optional index into seqs} |
|
13 |
+ |
|
14 |
+\item{warn_if_distinct}{Should a warning be emitted if there are distinct elements in seqs? |
|
15 |
+ |
|
16 |
+If medoid_idx is supplied, the medoid sequence is returned, otherwise the longest |
|
17 |
+sequence is returned} |
|
18 |
+} |
|
19 |
+\value{ |
|
20 |
+character vector |
|
21 |
+} |
|
22 |
+\description{ |
|
23 |
+Given a family of similar sequences, return a "representative" |
|
24 |
+} |
|
25 |
+\examples{ |
|
26 |
+get_canonical_representative(c('apple', 'manzana', 'pomme')) |
|
27 |
+} |
0 | 28 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,76 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/pairing-methods.R |
|
3 |
+\name{pairing_tables} |
|
4 |
+\alias{pairing_tables} |
|
5 |
+\title{Generate a list of tables representing clusters paired in cells} |
|
6 |
+\usage{ |
|
7 |
+pairing_tables(cluster_tbl, cell_identifiers = "barcode", |
|
8 |
+ cluster_idx = "cluster_idx", |
|
9 |
+ canonicalize_fun = canonicalize_by_chain, table_order = 2, |
|
10 |
+ min_expansion = 2, orphan_level = 1, cluster_whitelist = NULL, |
|
11 |
+ cluster_blacklist = NULL, cell_tbl = NULL, feature_tbl = NULL) |
|
12 |
+} |
|
13 |
+\arguments{ |
|
14 |
+\item{cluster_tbl}{a table with all combinations of clusters in all cells} |
|
15 |
+ |
|
16 |
+\item{cell_identifiers}{character vector naming fields that key a cell} |
|
17 |
+ |
|
18 |
+\item{cluster_idx}{character naming a single field IDing the clusters} |
|
19 |
+ |
|
20 |
+\item{canonicalize_fun}{a function with signature `canonicalize_fun(cluster_tbl, cell_identifiers, cluster_idx, order = i)` that for each `cell_identifier` returns a single contig that depends on the `order`. For instance \link{canonicalize_by_prevalence} or \link{canonicalize_by_chain}.} |
|
21 |
+ |
|
22 |
+\item{table_order}{Integer larger than 1. What order of cluster_idx will be paired, eg, order = 2 means that the most common and second most common cluster_idx will be sought for each cell} |
|
23 |
+ |
|
24 |
+\item{min_expansion}{the minimal number of times a pairing needs to occur for it to be reported} |
|
25 |
+ |
|
26 |
+\item{orphan_level}{Integer larger than 0 and less than or equal to `table_order`. Given that at least `min_expansion` cells are found that have `table_order` chains identical, how many `cluster_idx` pairs will we match on to select other cells. Example: `ophan_level=1` means that cells that share just a single chain with the} |
|
27 |
+ |
|
28 |
+\item{cluster_whitelist}{a table of "cluster_idx" that should always be reported. In contrast to the `cluster_tbl`, here the clusters must be named "cluster_idx.1", "cluster_idx.2" (if order-2 pairs are being selected).} |
|
29 |
+ |
|
30 |
+\item{cluster_blacklist}{a table of "cluster_idx" that will never be reported. Must be named as per `cluster_whitelist`.} |
|
31 |
+ |
|
32 |
+\item{cell_tbl}{optional, ancillary table with additional cell features. Must also be keyed by `cell_identifiers`} |
|
33 |
+ |
|
34 |
+\item{feature_tbl}{optional, ancillary table with additional cluster features. Must also be keyed by `cluster_idx`} |
|
35 |
+} |
|
36 |
+\value{ |
|
37 |
+list of tables. The `cell_tbl` is keyed by the `cell_identifiers`, with fields "cluster_idx.1", "cluster_idx.2", etc, IDing the contigs present in each cell. "cluster_idx.1_fct" and "cluster_idx.2_fct" cast these fields to factors and are reordered to maximize the number of pairs along the diagonal. The `idx1_tbl` and `idx2_tbl` report information (passed in about the `cluster_idx` by `feature_tbl`.) The `cluster_pair_tbl` reports all pairings found of contigs, and the number of times observed. |
|
38 |
+} |
|
39 |
+\description{ |
|
40 |
+A contingency table of every combination of `cluster_idx` up to `table_order` is generated. |
|
41 |
+Combinations that are found in at least `min_expansion` number of cells are reported. All cells that have these combinations are returned, as well as cells that only have `orphan_level` of matching `cluster_idx`. |
|
42 |
+} |
|
43 |
+\details{ |
|
44 |
+For example, if `table_order=2` and `min_expansion=2` then heavy/light or alpha/beta pairs found two or more times will be returned (as well as alpha-alpha pairs, etc, if those are present). |
|
45 |
+If `orphan_level=1` then all cells that share just a single chain with an expanded clone will be returned. |
|
46 |
+ |
|
47 |
+The `cluster_idx.1_fct` and `cluster_idx.2_fct` fields in `cell_tbl`, `idx1_tbl`, `idx2_tbl` are cast to factors and ordered such that pairings will tend to occur along the diagonal when they are cross-tabulated. |
|
48 |
+This facilitates plotting. |
|
49 |
+} |
|
50 |
+\section{Caveats and warnings}{ |
|
51 |
+ |
|
52 |
+ The cell_idx -> cluster_idx map is generally one-to-many, and is resolved by `canonicalize_fun`. For `table_order>1`, few collisions are expected as most cells will contain no more than 2 clusters. Any collisions are resolved by returning the most prevalent cluster, across samples. When two clusters are tied for most prevalent within a cell, the `cluster_idx` returned is arbitrary. Therefore, when `table_order=1`, it is strongly recommended to subset the `cluster_tbl` to just a single chain. |
|
53 |
+} |
|
54 |
+ |
|
55 |
+\examples{ |
|
56 |
+library(dplyr) |
|
57 |
+cluster_tbl = data_frame(clust_idx = gl(3, 2), cell_idx = rep(1:3, times = 2)) |
|
58 |
+# no pairs found twice |
|
59 |
+pt1 = pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx', canonicalize_by_prevalence) |
|
60 |
+# all pairs found, found once. |
|
61 |
+pt2 = pairing_tables(cluster_tbl, 'cell_idx', 'clust_idx', |
|
62 |
+ canonicalize_by_prevalence, min_expansion = 1) |
|
63 |
+pt2$cell_tbl |
|
64 |
+cluster_tbl2 = bind_rows(cluster_tbl, cluster_tbl \%>\% mutate(cell_idx = rep(4:6, times = 2))) |
|
65 |
+#all pairs found twice |
|
66 |
+pt3 = pairing_tables(cluster_tbl2, 'cell_idx', 'clust_idx', canonicalize_by_prevalence, min_expansion = 1) |
|
67 |
+pt3$cell_tbl |
|
68 |
+# canonicalize_by_chain by default expects fields umis, reads to break ties, could wrap the function to change this |
|
69 |
+cluster_tbl3 = cluster_tbl2 \%>\% |
|
70 |
+ mutate(umis = 1, reads = 1, chain = rep(c('TRA', 'TRB'), times = 6)) |
|
71 |
+pt4 = pairing_tables(cluster_tbl3, 'cell_idx', 'clust_idx', |
|
72 |
+ canonicalize_by_chain, min_expansion = 1, table_order = 2) |
|
73 |
+} |
|
74 |
+\seealso{ |
|
75 |
+canonicalize_by_prevalence, canonicalize_by_chain |
|
76 |
+} |