Browse code

Many changes about equalization to try to preserve length of cell_tbl 1. Add equalize argument to constructor 2. Don't equalize cells if contigs replaced 3. Don't equalize if cluster table is replaced 4. split_cdb gains an equalization argument

Andrew McDavid authored on 28/08/2019 03:25:45
Showing 5 changed files

... ...
@@ -47,7 +47,9 @@ Suggests:
47 47
     ggdendro,
48 48
     broom,
49 49
     lme4,
50
-    RColorBrewer
50
+    RColorBrewer,
51
+    SingleCellExperiment,
52
+    scater
51 53
 LinkingTo: 
52 54
     Rcpp
53 55
 VignetteBuilder: 
... ...
@@ -1,5 +1,6 @@
1
-# Todo
2
-# combine methods
1
+# Contig -- always equalize to cell
2
+# Cluster -- equalize to contig
3
+# Cell -- equalize after calling
3 4
 
4 5
 valid_KeyedTbl = function(tbl, keys){
5 6
     tbl_nm = deparse(substitute(tbl))
... ...
@@ -25,6 +26,7 @@ valid_KeyedTbl = function(tbl, keys){
25 26
 #' @param cell_pk character vector naming fields in `cell_tbl` that uniquely identify a cell barcode
26 27
 #' @param cluster_tbl A data frame that provide cluster assignments for each contig
27 28
 #' @param cluster_pk If `cluster_tbl` was provided, a character vector naming fields in `cluster_tbl` that uniquely identify a cluster
29
+#' @param equalize `logical`. Should the contig, cells and clusters be equalized by taking the intersection of their common keys?
28 30
 #' @return \code{ContigCellDB}
29 31
 #'
30 32
 #' @section Accessors/mutators:
... ...
@@ -82,9 +84,10 @@ ContigCellDB = function(contig_tbl, contig_pk, cell_tbl, cell_pk, cluster_tbl, c
82 84
 }
83 85
 
84 86
 #' @describeIn ContigCellDB-fun provide defaults that correspond to identifiers in 10X VDJ data
87
+#' @param ... passed to [ContigCellDB()]
85 88
 #' @export
86
-ContigCellDB_10XVDJ = function(contig_tbl, contig_pk = c('barcode', 'contig_id'), cell_pk = 'barcode'){
87
-    ContigCellDB(contig_tbl = contig_tbl, contig_pk = contig_pk, cell_pk = cell_pk)
89
+ContigCellDB_10XVDJ = function(contig_tbl, contig_pk = c('barcode', 'contig_id'), cell_pk = 'barcode', ...){
90
+    ContigCellDB(contig_tbl = contig_tbl, contig_pk = contig_pk, cell_pk = cell_pk, ...)
88 91
 }
89 92
 
90 93
 access_cdb = function(x, name){
... ...
@@ -104,7 +107,7 @@ replace_cdb = function(x, name, value){
104 107
     }
105 108
     if(name %in% c('contig_tbl','contig_pk')){
106 109
         valid_KeyedTbl(x$contig_tbl, x$contig_pk)
107
-        x = equalize_ccdb(x)
110
+        x = equalize_ccdb(x, contig = TRUE, cell = FALSE, cluster = TRUE)
108 111
     }
109 112
     if(name %in% c('cell_tbl','cell_pk')){
110 113
         valid_KeyedTbl(x$cell_tbl, x$cell_pk)
... ...
@@ -112,7 +115,7 @@ replace_cdb = function(x, name, value){
112 115
     }
113 116
     if(name %in% c('cluster_tbl','cluster_pk')){
114 117
         valid_KeyedTbl(x$cluster_tbl, x$cluster_pk)
115
-        x = equalize_ccdb(x, contig = TRUE, cell = FALSE, cluster = FALSE)
118
+        x = equalize_ccdb(x, contig = TRUE, cell = FALSE, cluster = TRUE)
116 119
     }
117 120
     invisible(x)
118 121
 }
... ...
@@ -141,7 +144,7 @@ setMethod("$", signature = c(x = 'ContigCellDB'), access_cdb)
141 144
 #' ccdb_ex
142 145
 #' ccdb_ex$contig_tbl = dplyr::filter(ccdb_ex$contig_tbl, pop == 'b6')
143 146
 #' ccdb_ex
144
-setReplaceMethod("$", signature = c(x = 'ContigCellDB'), replace_cdb)
147
+setReplaceMethod("$", signature = c(x = 'ContigCellDB'), function(x, name, value) replace_cdb(x, name, value))
145 148
 
146 149
 setMethod('show', signature = c(object = 'ContigCellDB'), function(object){
147 150
     cat(class(object), "of", nrow(object$contig_tbl), "contigs")
... ...
@@ -200,20 +203,20 @@ equalize_ccdb = function(x, cell = TRUE, contig = TRUE, cluster = TRUE){
200 203
     if(cell) x@cell_tbl = semi_join(x$cell_tbl, x$contig_tbl, by = x$cell_pk)
201 204
     if(contig) x@contig_tbl = semi_join(x$contig_tbl, x$cell_tbl, by = x$cell_pk)
202 205
     if(nrow(x$cluster_tbl) > 0 && cluster) x@cluster_tbl = semi_join(x$cluster_tbl, x$contig_tbl, by = x$cluster_pk)
203
-    x@equalized = TRUE
206
+    x@equalized = (cell & contig & cluster) | x@equalized
204 207
     x
205 208
 }
206 209
 
207 210
 replace_cluster_tbl = function(ccdb, cluster_tbl, contig_tbl, cluster_pk){
208 211
     if(nrow(ccdb$cluster_tbl)>0 && !missing(cluster_pk)){
209
-        warning("Replacing `cluster_tbl` with ", paste(ccdb$cluster_pk, sep = ', ', '.'))
212
+        warning("Replacing `cluster_tbl` with ", paste(ccdb$cluster_pk, sep = ', '), '.')
210 213
     }
211 214
     if(!missing(cluster_pk)) ccdb@cluster_pk = cluster_pk
212 215
     ccdb@cluster_tbl = cluster_tbl
213 216
     if(!missing(contig_tbl)){
217
+        if(nrow(contig_tbl) != nrow(ccdb@contig_tbl)) stop("Length mismatch; this is a bug!")
214 218
         ccdb@contig_tbl = contig_tbl
215 219
         valid_KeyedTbl(ccdb$contig_tbl, ccdb$contig_pk)
216
-        ccdb = equalize_ccdb(ccdb, cell = FALSE, cluster = FALSE, contig = TRUE)
217 220
     }
218 221
     valid_KeyedTbl(ccdb$cluster_tbl, ccdb$cluster_pk)
219 222
     validObject(ccdb)
... ...
@@ -271,7 +274,7 @@ mutate_cdb <- function(ccdb, ..., tbl='contig_tbl'){
271 274
 #' splat = split_cdb(ccdb_ex, 'chain', 'contig_tbl')
272 275
 #' stopifnot(all(splat$TRA$contig_tbl$chain == 'TRA'))
273 276
 #' stopifnot(all(splat$TRB$contig_tbl$chain == 'TRB'))
274
-split_cdb = function(ccdb, fields, tbl = 'contig_tbl', drop = FALSE){
277
+split_cdb = function(ccdb, fields, tbl = 'contig_tbl', drop = FALSE, equalize = TRUE){
275 278
     thetbl = access_cdb(ccdb, tbl)
276 279
     if(!is.character(fields)){
277 280
         stop('`field` must be a character naming fields in `tbl`')
... ...
@@ -281,7 +284,8 @@ split_cdb = function(ccdb, fields, tbl = 'contig_tbl', drop = FALSE){
281 284
     }
282 285
     split_tbl = split(thetbl, thetbl[fields], drop = drop)
283 286
     out = purrr::map(split_tbl, function(tt){
284
-        replace_cdb(ccdb, tbl, tt)
287
+        tmp = replace_cdb(ccdb, tbl, tt)
288
+        if(equalize) equalize_ccdb(tmp)
285 289
     })
286 290
     out
287 291
 }
... ...
@@ -293,7 +297,7 @@ rbind.ContigCellDB <- function(..., deparse.level=1)
293 297
     .bind_rows_ccdb(objects[[1L]], objects[-1L])
294 298
 }
295 299
 
296
-#' #' Combine `ContigCellDB` along rows (contigs, cells or clusters)
300
+#' Combine `ContigCellDB` along rows (contigs, cells or clusters)
297 301
 #'
298 302
 #' The union of the rows in each of the objects is taken,
299 303
 #'  thus removing any rows that has an exact duplicate.  This
... ...
@@ -324,7 +328,5 @@ setMethod('rbind', 'ContigCellDB', rbind.ContigCellDB)
324 328
     for(tt in c('contig_pk', 'cell_pk', 'cluster_pk')){
325 329
         pks[[tt]] = purrr::reduce(purrr::map(all_objs, access_cdb, name = tt), union)
326 330
     }
327
-
328
-
329 331
     ContigCellDB(tbls$contig_tbl, pks$contig_pk, tbls$cell_tbl, pks$cell_pk, tbls$cluster_tbl, pks$cluster_pk)
330 332
 }
... ...
@@ -31,7 +31,7 @@ test_that('Can rbind', {
31 31
    expect_equal(unite, ccdb_ex)
32 32
 
33 33
    ccdb_ex = cluster_germline(ccdb_ex, segment_keys = 'sample')
34
-   splat_cell = split_cdb(ccdb_ex, c('sample', 'pop'), 'cell_tbl', drop = TRUE)
34
+   splat_cell = split_cdb(ccdb_ex, c('sample', 'pop'), 'cell_tbl', drop = TRUE, equalize = TRUE)
35 35
    unite_cell = do.call(rbind, splat_cell)
36 36
    expect_equal(unite_cell, ccdb_ex)
37 37
 
... ...
@@ -3,7 +3,7 @@ context('Cluster Logistic Testing')
3 3
 data(ccdb_ex)
4 4
 ccdb_ex = cluster_germline(ccdb_ex) %>% fine_clustering('cdr3_nt', 'DNA')
5 5
 ccdb_ex = canonicalize_cluster(ccdb_ex)
6
-ccdb_ex_trb = filter_cdb(ccdb_ex, chain == 'TRB') %>% canonicalize_cell(, contig_filter_args = chain == 'TRB', contig_fields = c('chain', 'v_gene', 'j_gene'))
6
+ccdb_ex_trb = filter_cdb(ccdb_ex, chain == 'TRB') %>% equalize_ccdb() %>% canonicalize_cell(, contig_filter_args = chain == 'TRB', contig_fields = c('chain', 'v_gene', 'j_gene'))
7 7
 
8 8
 theclust = filter(ccdb_ex$cluster_tbl, chain == "TRB", v_gene == "TRBV13-2", j_gene == "TRBJ2-7") %>% mutate( x_ = 1)
9 9
 
... ...
@@ -15,16 +15,18 @@ theform = ~ pop + (1|sample)
15 15
 test_that('Manual glmer testing equals cluster_logistic_test', {
16 16
     if(!requireNamespace('lme4')) skip('Install lme4')
17 17
     library(lme4)
18
-    # test against a manually data -- TRUE if the cell was in the cluster, false otherwise
18
+    # test against a manual data -- TRUE if the cell was in the cluster, false otherwise
19 19
     manual_glmer = glmer(update(theform, x_ ~ .) , data = manual_cell,  family = 'binomial')
20 20
     automatic1 = cluster_logistic_test(theform, ccdb = ccdb_ex_trb, cluster_whitelist = theclust, keep_fit = TRUE, silent = TRUE)
21 21
     expect_equal(fixef(automatic1$fit[[1]]), fixef(manual_glmer))
22 22
 
23
+    # Adding additional clusters doesn't change results
23 24
     wl2 = tibble(cluster_idx = c(theclust$cluster_idx, ccdb_ex_trb$cluster_tbl$cluster_idx[1:3]))
24 25
     automatic2 = cluster_logistic_test(theform, ccdb = ccdb_ex_trb, cluster_whitelist = wl2, keep_fit = TRUE, silent = TRUE)
25 26
     expect_equal(unique(automatic2$cluster_idx), wl2$cluster_idx)
26 27
     expect_equal(semi_join(automatic2, theclust, by = 'cluster_idx') %>% select(term:p.value), automatic1 %>% select(term:p.value))
27 28
 
29
+
28 30
     automatic3 = cluster_test_by(ccdb_ex, formula = theform, cluster_whitelist = wl2, silent = TRUE)
29 31
     expect_false(any(automatic3$chain == 'TRA'))
30 32
     expect_equal(semi_join(automatic3, theclust, by = 'cluster_idx') %>% select(term:p.value), automatic1 %>% select(term:p.value))
... ...
@@ -1,8 +1,8 @@
1 1
 ---
2
-title: "Combining Repertoire with Expression Using Nested colData"
2
+title: "Combining Repertoire with Expression with SingleCellExperiment"
3 3
 output: BiocStyle::html_document
4 4
 vignette: >
5
-  %\VignetteIndexEntry{Clustering repertoire via CDR3 sequences}
5
+  %\VignetteIndexEntry{Combining Repertoire with Expression with SingleCellExperiment}
6 6
   %\VignetteEngine{knitr::rmarkdown}
7 7
   %\VignetteEncoding{UTF-8}
8 8
 ---
... ...
@@ -15,7 +15,6 @@ knitr::opts_chunk$set(
15 15
 
16 16
 ```{r setup}
17 17
 library(CellaRepertorium)
18
-library(MultiAssayExperiment)
19 18
 library(SingleCellExperiment)
20 19
 library(dplyr)
21 20
 library(ggplot2)
... ...
@@ -62,7 +61,7 @@ sce = SingleCellExperiment(assay = expression, colData = all_bc)
62 61
 ```{r}
63 62
 ccdb2 = ContigCellDB(ccdb_ex$contig_tbl, contig_pk = ccdb_ex$contig_pk, cell_tbl = colData(sce), cell_pk = ccdb_ex$cell_pk, equalize = FALSE)
64 63
 
65
-ccdb2 = cdhit_ccdb(ccdb2, 'cdr3', type = 'AA', cluster_name = 'aa80', identity = .8)
64
+ccdb2 = cdhit_ccdb(ccdb2, 'cdr3', type = 'AA', cluster_name = 'aa80', identity = .8, min_length = 5)
66 65
 ccdb2 = fine_clustering(ccdb2, sequence_key = 'cdr3', type = 'AA', keep_clustering_details = FALSE)
67 66
 
68 67
 ```
... ...
@@ -73,9 +72,9 @@ Key is to construct with `equalize = FALSE`, which will allow some cells to lack
73 72
 # Chain pairings
74 73
 
75 74
 ```{r}
76
-colData(sce)$alpha =  canonicalize_cell(ccdb2, chain == 'TRA', contig_fields = c('chain', 'v_gene','d_gene', 'j_gene', 'AA97'))
75
+colData(sce)$alpha =  canonicalize_cell(ccdb2, chain == 'TRA', contig_fields = c('chain', 'v_gene','d_gene', 'j_gene', 'aa80'))
77 76
 
78
-colData(sce)$beta =  canonicalize_cell(ccdb2, chain == 'TRB', contig_fields = c('chain', 'v_gene','d_gene', 'j_gene', 'AA97'))
77
+colData(sce)$beta =  canonicalize_cell(ccdb2, chain == 'TRB', contig_fields = c('chain', 'v_gene','d_gene', 'j_gene', 'aa80'))
79 78
 
80 79
 ```
81 80