Browse code

Hopefully much faster canonicalization, and more BiocCheck attempts

Andrew McDavid authored on 02/09/2019 05:02:39
Showing 5 changed files

... ...
@@ -12,3 +12,4 @@ manuscript/
12 12
 .ignore
13 13
 README.Rmd
14 14
 data/ccdb_bcell.rda
15
+R/immcantation-utils.R
... ...
@@ -2,7 +2,7 @@ Type: Package
2 2
 Package: CellaRepertorium
3 3
 Title: Data structures, clustering and testing for single 
4 4
     cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)
5
-Version: 0.4.0
5
+Version: 0.8.0
6 6
 Authors@R: 
7 7
     c(person(given = "Andrew",
8 8
              family = "McDavid",
... ...
@@ -64,6 +64,7 @@ importFrom(rlang,":=")
64 64
 importFrom(rlang,sym)
65 65
 importFrom(rlang,syms)
66 66
 importFrom(stats,as.dist)
67
+importFrom(stats,as.formula)
67 68
 importFrom(stats,dist)
68 69
 importFrom(stats,hclust)
69 70
 importFrom(stats,na.fail)
... ...
@@ -1,3 +1,4 @@
1
+globalVariables('ngrp')
1 2
 
2 3
 #' Cluster contigs by germline properties
3 4
 #'
... ...
@@ -13,7 +14,8 @@
13 14
 #' ccdb_ex$cluster_tbl
14 15
 cluster_germline = function(ccdb, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_name = 'cluster_idx'){
15 16
     contig_tbl = ccdb$contig_tbl
16
-    seg_types = contig_tbl %>% group_by(!!!syms(segment_keys)) %>% summarize() %>% ungroup() %>% mutate(!!sym(cluster_name) := seq_len(nrow(.)))
17
+    seg_types = contig_tbl %>% group_by(!!!syms(segment_keys)) %>% summarize() %>% ungroup()
18
+    seg_types[[cluster_name]] = seq_len(nrow(seg_types))
17 19
     cl_con_tbl = left_join_warn(seg_types, contig_tbl, by = segment_keys)
18 20
     cluster_tbl = as_tibble(unique(cl_con_tbl[union(cluster_name, segment_keys)]))
19 21
     replace_cluster_tbl(ccdb, cluster_tbl, cl_con_tbl, cluster_pk = cluster_name)
... ...
@@ -151,7 +153,9 @@ tie_break_keys = character(), order = 1, representative = ccdb$cluster_pk[1], co
151 153
     arranging = purrr::map(tie_break_keys, ~ rlang::quo(desc(!!sym(.x))))
152 154
 
153 155
     # take the row at position `order` within each cluster (the first row when order = 1)
154
-    cluster_tbl = sub_contig_tbl %>% group_by(!!!syms(ccdb$cluster_pk)) %>% dplyr::arrange(!!!arranging) %>% dplyr::do(dplyr::slice(., order))
156
+    cluster_tbl = sub_contig_tbl %>% group_by(!!!syms(ccdb$cluster_pk)) %>% dplyr::arrange(!!!arranging)
157
+    idx = cluster_tbl %>% transmute(ngrp = dplyr::n(), idx = seq_along(ngrp))
158
+    cluster_tbl = cluster_tbl[idx$idx==order,,drop = FALSE]
155 159
     cluster_tbl = cluster_tbl %>% dplyr::select(!!!syms(unique(c(ccdb$cluster_pk, contig_fields, representative))))
156 160
 
157 161
     # fill any missing clusters after the filtering
... ...
@@ -72,7 +72,9 @@ canonicalize_cell = function(ccdb, contig_filter_args = TRUE,  tie_break_keys =
72 72
     # setup quosures to arrange the data
73 73
     arranging = purrr::map(tie_break_keys, ~ rlang::quo(desc(!!sym(.x))))
74 74
     # take the row at position `order` within each cell (the first row when order = 1)
75
-    ft2 = ft %>% group_by(!!!syms(ccdb$cell_pk)) %>% dplyr::arrange(!!!arranging) %>% dplyr::do(dplyr::slice(., order))
75
+    ft2 = ft %>% group_by(!!!syms(ccdb$cell_pk)) %>% dplyr::arrange(!!!arranging)
76
+    idx = ft2 %>% transmute(ngrp = dplyr::n(), idx = seq_along(ngrp))
77
+    ft2 = ft2[idx$idx==order,,drop = FALSE]
76 78
     cell_tbl = ccdb$cell_tbl
77 79
     # join with cell tbl (so same number of cells)
78 80
     ccdb$cell_tbl = right_join_warn(ft2[unique(c(contig_fields, ccdb$cell_pk))], cell_tbl, by = ccdb$cell_pk, overwrite = overwrite)