... | ... |
@@ -2,7 +2,7 @@ Type: Package |
2 | 2 |
Package: CellaRepertorium |
3 | 3 |
Title: Data structures, clustering and testing for single |
4 | 4 |
cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq) |
5 |
-Version: 1.1.2 |
|
5 |
+Version: 1.1.3 |
|
6 | 6 |
Authors@R: |
7 | 7 |
c(person(given = "Andrew", |
8 | 8 |
family = "McDavid", |
... | ... |
@@ -60,7 +60,9 @@ Suggests: |
60 | 60 |
SingleCellExperiment, |
61 | 61 |
scater, |
62 | 62 |
broom.mixed, |
63 |
- cowplot |
|
63 |
+ cowplot, |
|
64 |
+ igraph, |
|
65 |
+ ggraph |
|
64 | 66 |
LinkingTo: |
65 | 67 |
Rcpp |
66 | 68 |
VignetteBuilder: |
... | ... |
@@ -1,67 +1,115 @@ |
1 |
-#' Visualization of clustering characteristics for a ContigCellDB object with inherited fine_clustering object |
|
2 |
-#' |
|
1 |
+require_igraph = function(){ |
|
2 |
+ if(!requireNamespace('igraph') || !requireNamespace('ggraph')) stop('Install `igraph` and `ggraph`.') |
|
3 |
+ TRUE |
|
4 |
+} |
|
5 |
+ |
|
6 |
+#' Visualization of pairs of cluster factor |
|
7 |
+#' |
|
8 |
+#' With `factors`, a pair of variables present in the `contig_tbl` and the `cluster_tbl`, |
|
9 |
+#' generate and plot cross-tabs of the number of contigs, or its pearson residual. |
|
3 | 10 |
#' @param ccdb A ContigCellDB object. |
4 |
-#' @param cluster_factors A character vector of variables in ccdb$contig table to visualize clusters by |
|
11 |
+#' @param factors `character` length 2 of fields present |
|
5 | 12 |
#' @param type Type of visualization, a heatmap or a node-edge network plot |
6 | 13 |
#' @param statistic Cluster characteristics visualized by pearson residuals or raw contig counts |
7 |
-#' @param ncluster Optional integer. Only plot characteristics that have at least nclusters (for clarity of visualization) |
|
14 |
+#' @param ncluster `integer`. Omit factors that occur less than `nclusters`. For clarity of visualization. |
|
8 | 15 |
#' @param chaintype Character in ccdb$contig_tbl$chain. If passed will subset contigs belonging to specified chain (IGH,IGK,IGL,TRA,TRB) |
9 |
-#' |
|
16 |
+#' @seealso canonicalize_cluster to "roll-up" additional contig variables into the `cluster_tbl`` |
|
10 | 17 |
#' @return A ggraph object if type == 'network', and a ggplot object if type == 'heatmap' |
18 |
+#' @export |
|
11 | 19 |
#' @examples |
20 |
+#' library(ggraph) |
|
12 | 21 |
#' data(ccdb_ex) |
13 |
-#' ccdb_germline_ex = cluster_germline(ccdb_ex, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx') |
|
14 |
-#' ccdb_germline_ex = fine_clustering(germline_cluster, sequence_key = 'cdr3_nt', type = 'DNA') |
|
15 |
-#' plot_clustering(ccdb_germline_ex,cluster_factors = c('v_gene','j_gene'), statistic = 'pearson', type = 'network' ,ncluster = 50, chaintype = 'IGH') |
|
22 |
+#' ccdb_germline_ex = cluster_germline(ccdb_ex, segment_keys = c('v_gene', 'j_gene', 'chain'), |
|
23 |
+#' cluster_pk = 'segment_idx') |
|
24 |
+#' ccdb_germline_ex = fine_clustering(ccdb_germline_ex, sequence_key = 'cdr3_nt', type = 'DNA') |
|
25 |
+#' plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'), |
|
26 |
+#' statistic = 'pearson', type = 'network' ,ncluster = 10, chaintype = 'TRB') |
|
27 |
+#' plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'), |
|
28 |
+#' statistic = 'contigs', type = 'heatmap') |
|
29 |
+#' plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'), |
|
30 |
+#' statistic = 'contigs', type = 'network', ncluster = 10) |
|
31 |
+plot_cluster_factors = function(ccdb, |
|
32 |
+ factors, |
|
33 |
+ type = c('heatmap', 'network'), |
|
34 |
+ statistic = c('pearson', 'contigs'), |
|
35 |
+ ncluster = 0, |
|
36 |
+ chaintype) { |
|
37 |
+ if (!inherits(ccdb, "ContigCellDB") || !has_fineclustering(ccdb)) |
|
38 |
+ stop('ccdb must be `ContigCellDB` with `fine_clustering`.') |
|
39 |
+ check_plot_infra() |
|
40 |
+ statistic = match.arg(statistic, c('pearson', 'contigs')) |
|
41 |
+ type = match.arg(type, c('heatmap', 'network')) |
|
42 |
+ if (length(factors) != 2 || |
|
43 |
+ !is.character(factors)) |
|
44 |
+ stop('`factors` must be character vector that specifies exactly two columns from ccdb') |
|
16 | 45 |
|
17 |
- |
|
18 |
-plot_clustering = function(ccdb,cluster_factors,type = c('heatmap','network'),statistic = c('pearson','contigs'),ncluster,chaintype) { |
|
46 |
+ cluster_tbl = dplyr::filter(ccdb$cluster_tbl, n_cluster > ncluster) |
|
19 | 47 |
|
20 |
-if(!inherits(ccdb, "ContigCellDB")) stop('ccdb must have class ContigCellDb') |
|
21 | 48 |
|
22 |
-if(length(cluster_factors) != 2 | !is.character(cluster_factors)) stop('cluster_factors must be character vector that specifies exactly two columns from ccdb') |
|
23 |
- |
|
24 |
-if(!missing(ncluster)) {tmp = ccdb$cluster_tbl %>% filter(n_cluster > ncluster)} else {tmp = ccdb$cluster_tbl} |
|
49 |
+ if (!missing(chaintype)) { |
|
50 |
+ cluster_tbl = cluster_tbl %>% filter(chain %in% chaintype) |
|
51 |
+ } |
|
25 | 52 |
|
26 |
-if(!missing(chaintype)) {tmp = tmp %>% filter(chain == chaintype)} else {tmp = tmp} |
|
27 |
- |
|
28 |
-if(statistic == 'pearson') { |
|
53 |
+ if(!nrow(cluster_tbl) > 0) stop("No clusters remain after filtering.") |
|
29 | 54 |
|
30 |
- if(!missing(chaintype)) {ccdb$contig_tbl = ccdb$contig_tbl %>% filter(chain == chaintype)} |
|
31 |
- |
|
32 |
- resids = chisq.test(table(ccdb$contig_tbl %>% pull(cluster_factors[1]), ccdb$contig_tbl %>% pull(cluster_factors[2]))) |
|
33 |
- |
|
34 |
- resid_tab = as.data.frame(resids$residuals) |
|
35 |
- |
|
36 |
- colnames(resid_tab) = c(cluster_factors,'residuals') |
|
37 |
- |
|
38 |
- tmp = left_join(tmp,resid_tab,by = cluster_factors) |
|
39 |
- |
|
40 |
- fillby = 'residuals' |
|
41 |
- |
|
42 |
-} |
|
43 |
- |
|
44 |
-else {fillby = 'n_cluster'} |
|
45 |
- |
|
46 |
-tmp = tmp %>% relocate(cluster_factors) |
|
55 |
+ if (statistic == 'pearson') { |
|
56 |
+ if (!missing(chaintype)) { |
|
57 |
+ ccdb$contig_tbl = ccdb$contig_tbl %>% filter(chain %in% chaintype) |
|
58 |
+ } |
|
59 |
+ if(!nrow(ccdb$contig_tbl)>0) stop("No contigs left after filtering.") |
|
60 |
+ resids = stats::chisq.test(table( |
|
61 |
+ ccdb$contig_tbl[[factors[1]]], |
|
62 |
+ ccdb$contig_tbl[[factors[2]]] |
|
63 |
+ )) |
|
64 |
+ resid_tab = as.data.frame(resids$residuals) |
|
65 |
+ colnames(resid_tab) = c(factors, 'residuals') |
|
66 |
+ cluster_tbl = left_join_warn(cluster_tbl, resid_tab, by = factors) |
|
67 |
+ fillby = 'residuals' |
|
68 |
+ } else { |
|
69 |
+ fillby = 'n_cluster' |
|
70 |
+ } |
|
71 |
+ cluster_tbl = cluster_tbl %>% dplyr::relocate(factors) |
|
47 | 72 |
|
48 |
-if (type == 'network') { |
|
49 |
- |
|
50 |
- nodes = data.frame(names = c(unique(tmp %>% pull(cluster_factors[1])),unique(tmp %>% pull(cluster_factors[2])))) |
|
51 |
- |
|
52 |
- graph = graph_from_data_frame(tmp,vertices = nodes) |
|
73 |
+ if (type == 'network') { |
|
74 |
+ require_igraph() |
|
75 |
+ nodes = data.frame(names = c(unique(cluster_tbl[[factors[1]]]),unique(cluster_tbl[[factors[2]]]))) |
|
76 |
+ graph = igraph::graph_from_data_frame(cluster_tbl, vertices = nodes) |
|
53 | 77 |
|
54 |
- plot = ggraph(graph,layout = 'circle') + geom_node_point() + geom_edge_fan(aes_string(colour = fillby)) + geom_node_label(aes(label = V(graph)$name), size = 2) + ggtitle(paste('Network Plot for factors', cluster_factors[1],'and',cluster_factors[2], 'using',statistic)) |
|
55 |
- |
|
56 |
- if(statistic == 'pearson') {plot = plot + scale_edge_color_gradient2()} |
|
57 |
- |
|
58 |
- return(plot) } |
|
59 |
- |
|
60 |
-else { |
|
61 |
- plot = ggplot(tmp) + aes_string(x = cluster_factors[1],y = cluster_factors[2],fill = fillby) + geom_tile() + theme(axis.text.x = element_text(angle = 90)) + ggtitle(paste('Heatmap for factors', cluster_factors[1],'and',cluster_factors[2], 'using',statistic)) |
|
62 |
- |
|
63 |
- if (statistic == 'pearson') {plot = plot + scale_fill_gradient2()} |
|
78 |
+ plot = ggraph::ggraph(graph, layout = 'circle') + ggraph::geom_node_point() + ggraph::geom_edge_fan(ggplot2::aes_string(colour = fillby)) + ggraph::geom_node_label(ggplot2::aes(label = igraph::V(graph)$name), size = 2) + ggplot2::ggtitle( |
|
79 |
+ paste( |
|
80 |
+ 'Network Plot for factors', |
|
81 |
+ factors[1], |
|
82 |
+ 'and', |
|
83 |
+ factors[2], |
|
84 |
+ 'using', |
|
85 |
+ statistic |
|
86 |
+ ) |
|
87 |
+ ) |
|
64 | 88 |
|
65 |
-return(plot) } |
|
89 |
+ if (statistic == 'pearson') { |
|
90 |
+ plot = plot + ggraph::scale_edge_color_gradient2() |
|
91 |
+ } |
|
92 |
+ |
|
93 |
+ return(plot) |
|
94 |
+ } |
|
95 |
+ |
|
96 |
+ else { |
|
97 |
+ plot = ggplot2::ggplot(cluster_tbl) + ggplot2::aes_string(x = factors[1], y = factors[2], fill = fillby) + ggplot2::geom_tile() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90)) + ggplot2::ggtitle( |
|
98 |
+ paste( |
|
99 |
+ 'Heatmap for factors', |
|
100 |
+ factors[1], |
|
101 |
+ 'and', |
|
102 |
+ factors[2], |
|
103 |
+ 'using', |
|
104 |
+ statistic |
|
105 |
+ ) |
|
106 |
+ ) |
|
107 |
+ |
|
108 |
+ if (statistic == 'pearson') { |
|
109 |
+ plot = plot + ggplot2::scale_fill_gradient2() |
|
110 |
+ } |
|
111 |
+ |
|
112 |
+ return(plot) |
|
113 |
+ } |
|
66 | 114 |
} |
67 |
- |
|
115 |
+ |
68 | 116 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,50 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/plot_clustering.R |
|
3 |
+\name{plot_cluster_factors} |
|
4 |
+\alias{plot_cluster_factors} |
|
5 |
+\title{Visualization of pairs of cluster factor} |
|
6 |
+\usage{ |
|
7 |
+plot_cluster_factors( |
|
8 |
+ ccdb, |
|
9 |
+ factors, |
|
10 |
+ type = c("heatmap", "network"), |
|
11 |
+ statistic = c("pearson", "contigs"), |
|
12 |
+ ncluster = 0, |
|
13 |
+ chaintype |
|
14 |
+) |
|
15 |
+} |
|
16 |
+\arguments{ |
|
17 |
+\item{ccdb}{A ContigCellDB object.} |
|
18 |
+ |
|
19 |
+\item{factors}{\code{character} length 2 of fields present} |
|
20 |
+ |
|
21 |
+\item{type}{Type of visualization, a heatmap or a node-edge network plot} |
|
22 |
+ |
|
23 |
+\item{statistic}{Cluster characteristics visualized by pearson residuals or raw contig counts} |
|
24 |
+ |
|
25 |
+\item{ncluster}{\code{integer}. Omit factors that occur less than \code{nclusters}. For clarity of visualization.} |
|
26 |
+ |
|
27 |
+\item{chaintype}{Character in ccdb$contig_tbl$chain. If passed will subset contigs belonging to specified chain (IGH,IGK,IGL,TRA,TRB)} |
|
28 |
+} |
|
29 |
+\value{ |
|
30 |
+A ggraph object if type == 'network', and a ggplot object if type == 'heatmap' |
|
31 |
+} |
|
32 |
+\description{ |
|
33 |
+With \code{factors}, a pair of variables present in the \code{contig_tbl} and the \code{cluster_tbl}, |
|
34 |
+generate and plot cross-tabs of the number of contigs, or its pearson residual. |
|
35 |
+} |
|
36 |
+\examples{ |
|
37 |
+library(ggraph) |
|
38 |
+data(ccdb_ex) |
|
39 |
+ccdb_germline_ex = cluster_germline(ccdb_ex, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx') |
|
40 |
+ccdb_germline_ex = fine_clustering(ccdb_germline_ex, sequence_key = 'cdr3_nt', type = 'DNA') |
|
41 |
+plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'), |
|
42 |
+statistic = 'pearson', type = 'network' ,ncluster = 10, chaintype = 'TRB') |
|
43 |
+plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'), |
|
44 |
+statistic = 'contigs', type = 'heatmap') |
|
45 |
+plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'), |
|
46 |
+statistic = 'contigs', type = 'network', ncluster = 10) |
|
47 |
+} |
|
48 |
+\seealso{ |
|
49 |
+canonicalize_cluster to "roll-up" additional contig variables into the `cluster_tbl`` |
|
50 |
+} |
0 | 51 |
deleted file mode 100644 |
... | ... |
@@ -1,40 +0,0 @@ |
1 |
-% Generated by roxygen2: do not edit by hand |
|
2 |
-% Please edit documentation in R/plot_clustering.R |
|
3 |
-\name{plot_clustering} |
|
4 |
-\alias{plot_clustering} |
|
5 |
-\title{Visualization of clustering characteristics for a ContigCellDB object with inherited fine_clustering object} |
|
6 |
-\usage{ |
|
7 |
-plot_clustering( |
|
8 |
- ccdb, |
|
9 |
- cluster_factors, |
|
10 |
- type = c("heatmap", "network"), |
|
11 |
- statistic = c("pearson", "contigs"), |
|
12 |
- ncluster, |
|
13 |
- chaintype |
|
14 |
-) |
|
15 |
-} |
|
16 |
-\arguments{ |
|
17 |
-\item{ccdb}{A ContigCellDB object.} |
|
18 |
- |
|
19 |
-\item{cluster_factors}{A character vector of variables in ccdb$contig table to visualize clusters by} |
|
20 |
- |
|
21 |
-\item{type}{Type of visualization, a heatmap or a node-edge network plot} |
|
22 |
- |
|
23 |
-\item{statistic}{Cluster characteristics visualized by pearson residuals or raw contig counts} |
|
24 |
- |
|
25 |
-\item{ncluster}{Optional integer. Only plot characteristics that have at least nclusters (for clarity of visualization)} |
|
26 |
- |
|
27 |
-\item{chaintype}{Character in ccdb$contig_tbl$chain. If passed will subset contigs belonging to specified chain (IGH,IGK,IGL,TRA,TRB)} |
|
28 |
-} |
|
29 |
-\value{ |
|
30 |
-A ggraph object if type == 'network', and a ggplot object if type == 'heatmap' |
|
31 |
-} |
|
32 |
-\description{ |
|
33 |
-Visualization of clustering characteristics for a ContigCellDB object with inherited fine_clustering object |
|
34 |
-} |
|
35 |
-\examples{ |
|
36 |
-data(ccdb_ex) |
|
37 |
-ccdb_germline_ex = cluster_germline(ccdb_ex, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx') |
|
38 |
-ccdb_germline_ex = fine_clustering(germline_cluster, sequence_key = 'cdr3_nt', type = 'DNA') |
|
39 |
-plot_clustering(ccdb_germline_ex,cluster_factors = c('v_gene','j_gene'), statistic = 'pearson', type = 'network' ,ncluster = 50, chaintype = 'IGH') |
|
40 |
-} |
... | ... |
@@ -111,16 +111,16 @@ We can cluster by any other feature of the contigs. Here we cluster each contig |
111 | 111 |
|
112 | 112 |
```{r} |
113 | 113 |
germline_cluster = fine_clustering(germline_cluster, sequence_key = 'cdr3_nt', type = 'DNA') |
114 |
-ggplot(germline_cluster$cluster_tbl %>% filter(chain == 'TRB'), aes(x = v_gene, y = j_gene, fill = n_cluster)) + geom_tile() + theme(axis.text.x = element_text(angle = 90)) |
|
114 |
+filter_cdb(germline_cluster, chain == 'TRB') %>% plot_cluster_factors(factors = c('v_gene','j_gene'), statistic = 'contigs', type = 'heatmap') |
|
115 | 115 |
``` |
116 | 116 |
|
117 |
-Number of pairs |
|
117 |
+Number of pairs. The pearson residual (showing the difference from expected counts given marginals) is probably more informative, set `statistic = 'residual'` for this. |
|
118 | 118 |
|
119 | 119 |
```{r} |
120 | 120 |
ggplot(germline_cluster$cluster_tbl %>% filter(chain == 'TRB'), aes(x = v_gene, y = j_gene, fill = avg_distance)) + geom_tile() + theme(axis.text.x = element_text(angle = 90)) |
121 | 121 |
``` |
122 | 122 |
|
123 |
-Average Levenshtein distance of CDR3 within each pair. This might be turned into a z-score by fitting a weighted linear model with sum-to-zero contrasts and returning the pearson residuals. This could determine if a pairing has an unexpected small, or large, within cluster distance. |
|
123 |
+Average Levenshtein distance of CDR3 within each pair. This might be turned into a z-score by fitting a weighted linear model with sum-to-zero contrasts and returning the studentized residuals. This could determine if a pairing has an unexpected small, or large, within cluster distance. |
|
124 | 124 |
|
125 | 125 |
|
126 | 126 |
|