Browse code

Cleanup plot_clustering. More sanity checks, add packages to Suggests

Andrew McDavid authored on 07/04/2021 05:12:34
Showing 6 changed files

... ...
@@ -2,7 +2,7 @@ Type: Package
2 2
 Package: CellaRepertorium
3 3
 Title: Data structures, clustering and testing for single 
4 4
     cell immune receptor repertoires (scRNAseq RepSeq/AIRR-seq)
5
-Version: 1.1.2
5
+Version: 1.1.3
6 6
 Authors@R: 
7 7
     c(person(given = "Andrew",
8 8
              family = "McDavid",
... ...
@@ -60,7 +60,9 @@ Suggests:
60 60
     SingleCellExperiment,
61 61
     scater,
62 62
     broom.mixed,
63
-    cowplot
63
+    cowplot,
64
+    igraph,
65
+    ggraph
64 66
 LinkingTo: 
65 67
     Rcpp
66 68
 VignetteBuilder: 
... ...
@@ -31,6 +31,7 @@ export(modal_category)
31 31
 export(mutate_cdb)
32 32
 export(np)
33 33
 export(pairing_tables)
34
+export(plot_cluster_factors)
34 35
 export(purity)
35 36
 export(rank_chain_ccdb)
36 37
 export(rank_prevalence_ccdb)
... ...
@@ -1,67 +1,115 @@
1
-#' Visualization of clustering characteristics for a ContigCellDB object with inherited fine_clustering object
2
-#' 
1
+require_igraph = function(){
2
+  if(!requireNamespace('igraph') || !requireNamespace('ggraph')) stop('Install `igraph` and `ggraph`.')
3
+  TRUE
4
+}
5
+
6
+#' Visualization of pairs of cluster factor
7
+#'
8
+#' With `factors`, a pair of variables present in the `contig_tbl` and the `cluster_tbl`,
9
+#' generate and plot cross-tabs of the number of contigs, or its pearson residual.
3 10
 #' @param ccdb A ContigCellDB object.
4
-#' @param cluster_factors A character vector of variables in ccdb$contig table to visualize clusters by
11
+#' @param factors `character` length 2 of fields present
5 12
 #' @param type Type of visualization, a heatmap or a node-edge network plot
6 13
 #' @param statistic Cluster characteristics visualized by pearson residuals or raw contig counts
7
-#' @param ncluster Optional integer. Only plot characteristics that have at least nclusters (for clarity of visualization)
14
+#' @param ncluster `integer`.  Omit factors that occur less than `nclusters`. For clarity of visualization.
8 15
 #' @param chaintype Character in ccdb$contig_tbl$chain. If passed will subset contigs belonging to specified chain (IGH,IGK,IGL,TRA,TRB)
9
-#'
16
+#' @seealso canonicalize_cluster to "roll-up" additional contig variables into the `cluster_tbl``
10 17
 #' @return A ggraph object if type == 'network', and a ggplot object if type == 'heatmap'
18
+#' @export
11 19
 #' @examples
20
+#' library(ggraph)
12 21
 #' data(ccdb_ex)
13
-#' ccdb_germline_ex = cluster_germline(ccdb_ex, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx')
14
-#' ccdb_germline_ex = fine_clustering(germline_cluster, sequence_key = 'cdr3_nt', type = 'DNA')
15
-#' plot_clustering(ccdb_germline_ex,cluster_factors = c('v_gene','j_gene'), statistic = 'pearson', type = 'network' ,ncluster = 50, chaintype = 'IGH')
22
+#' ccdb_germline_ex = cluster_germline(ccdb_ex, segment_keys = c('v_gene', 'j_gene', 'chain'),
23
+#' cluster_pk = 'segment_idx')
24
+#' ccdb_germline_ex = fine_clustering(ccdb_germline_ex, sequence_key = 'cdr3_nt', type = 'DNA')
25
+#' plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'),
26
+#' statistic = 'pearson', type = 'network' ,ncluster = 10, chaintype = 'TRB')
27
+#' plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'),
28
+#' statistic = 'contigs', type = 'heatmap')
29
+#' plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'),
30
+#' statistic = 'contigs', type = 'network', ncluster = 10)
31
+plot_cluster_factors = function(ccdb,
32
+                           factors,
33
+                           type = c('heatmap', 'network'),
34
+                           statistic = c('pearson', 'contigs'),
35
+                           ncluster = 0,
36
+                           chaintype) {
37
+  if (!inherits(ccdb,  "ContigCellDB") || !has_fineclustering(ccdb))
38
+    stop('ccdb must be  `ContigCellDB` with `fine_clustering`.')
39
+  check_plot_infra()
40
+  statistic = match.arg(statistic, c('pearson', 'contigs'))
41
+  type = match.arg(type, c('heatmap', 'network'))
42
+  if (length(factors) != 2 ||
43
+      !is.character(factors))
44
+    stop('`factors` must be character vector that specifies exactly two columns from ccdb')
16 45
 
17
-  
18
-plot_clustering = function(ccdb,cluster_factors,type = c('heatmap','network'),statistic = c('pearson','contigs'),ncluster,chaintype) {
46
+  cluster_tbl =  dplyr::filter(ccdb$cluster_tbl, n_cluster > ncluster)
19 47
 
20
-if(!inherits(ccdb,  "ContigCellDB")) stop('ccdb must have class ContigCellDb')
21 48
 
22
-if(length(cluster_factors) != 2 | !is.character(cluster_factors)) stop('cluster_factors must be character vector that specifies exactly two columns from ccdb')
23
-  
24
-if(!missing(ncluster)) {tmp = ccdb$cluster_tbl %>% filter(n_cluster > ncluster)} else {tmp = ccdb$cluster_tbl}
49
+  if (!missing(chaintype)) {
50
+    cluster_tbl = cluster_tbl %>% filter(chain %in% chaintype)
51
+  }
25 52
 
26
-if(!missing(chaintype)) {tmp = tmp %>% filter(chain == chaintype)} else {tmp = tmp}
27
-  
28
-if(statistic == 'pearson') {
53
+  if(!nrow(cluster_tbl) > 0) stop("No clusters remain after filtering.")
29 54
 
30
-  if(!missing(chaintype)) {ccdb$contig_tbl = ccdb$contig_tbl %>% filter(chain == chaintype)}
31
-  
32
-  resids = chisq.test(table(ccdb$contig_tbl %>% pull(cluster_factors[1]), ccdb$contig_tbl %>% pull(cluster_factors[2])))
33
-  
34
-  resid_tab = as.data.frame(resids$residuals)
35
-  
36
-  colnames(resid_tab) = c(cluster_factors,'residuals')
37
-  
38
-  tmp = left_join(tmp,resid_tab,by = cluster_factors)
39
-  
40
-  fillby = 'residuals'
41
-  
42
-}
43
-  
44
-else {fillby = 'n_cluster'}
45
-  
46
-tmp = tmp %>% relocate(cluster_factors)
55
+  if (statistic == 'pearson') {
56
+    if (!missing(chaintype)) {
57
+      ccdb$contig_tbl = ccdb$contig_tbl %>% filter(chain %in% chaintype)
58
+    }
59
+    if(!nrow(ccdb$contig_tbl)>0) stop("No contigs left after filtering.")
60
+    resids = stats::chisq.test(table(
61
+      ccdb$contig_tbl[[factors[1]]],
62
+      ccdb$contig_tbl[[factors[2]]]
63
+    ))
64
+    resid_tab = as.data.frame(resids$residuals)
65
+    colnames(resid_tab) = c(factors, 'residuals')
66
+    cluster_tbl = left_join_warn(cluster_tbl, resid_tab, by = factors)
67
+    fillby = 'residuals'
68
+  } else {
69
+    fillby = 'n_cluster'
70
+  }
71
+  cluster_tbl = cluster_tbl %>% dplyr::relocate(factors)
47 72
 
48
-if (type == 'network') {
49
-  
50
-  nodes = data.frame(names = c(unique(tmp %>% pull(cluster_factors[1])),unique(tmp %>% pull(cluster_factors[2]))))
51
-  
52
-  graph = graph_from_data_frame(tmp,vertices = nodes)
73
+  if (type == 'network') {
74
+    require_igraph()
75
+    nodes = data.frame(names = c(unique(cluster_tbl[[factors[1]]]),unique(cluster_tbl[[factors[2]]])))
76
+    graph = igraph::graph_from_data_frame(cluster_tbl, vertices = nodes)
53 77
 
54
-  plot = ggraph(graph,layout = 'circle') + geom_node_point() + geom_edge_fan(aes_string(colour = fillby)) + geom_node_label(aes(label = V(graph)$name), size = 2) + ggtitle(paste('Network Plot for factors', cluster_factors[1],'and',cluster_factors[2], 'using',statistic))
55
-  
56
-  if(statistic == 'pearson') {plot = plot + scale_edge_color_gradient2()}
57
-  
58
-  return(plot) }
59
-  
60
-else {
61
-  plot = ggplot(tmp) + aes_string(x = cluster_factors[1],y = cluster_factors[2],fill = fillby) + geom_tile() + theme(axis.text.x = element_text(angle = 90)) +  ggtitle(paste('Heatmap for factors', cluster_factors[1],'and',cluster_factors[2], 'using',statistic))
62
-  
63
-  if (statistic == 'pearson') {plot = plot + scale_fill_gradient2()}
78
+    plot = ggraph::ggraph(graph, layout = 'circle') + ggraph::geom_node_point() + ggraph::geom_edge_fan(ggplot2::aes_string(colour = fillby)) + ggraph::geom_node_label(ggplot2::aes(label = igraph::V(graph)$name), size = 2) + ggplot2::ggtitle(
79
+      paste(
80
+        'Network Plot for factors',
81
+        factors[1],
82
+        'and',
83
+        factors[2],
84
+        'using',
85
+        statistic
86
+      )
87
+    )
64 88
 
65
-return(plot) }
89
+    if (statistic == 'pearson') {
90
+      plot = plot + ggraph::scale_edge_color_gradient2()
91
+    }
92
+
93
+    return(plot)
94
+  }
95
+
96
+  else {
97
+    plot = ggplot2::ggplot(cluster_tbl) + ggplot2::aes_string(x = factors[1], y = factors[2], fill = fillby) + ggplot2::geom_tile() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90)) +  ggplot2::ggtitle(
98
+      paste(
99
+        'Heatmap for factors',
100
+        factors[1],
101
+        'and',
102
+        factors[2],
103
+        'using',
104
+        statistic
105
+      )
106
+    )
107
+
108
+    if (statistic == 'pearson') {
109
+      plot = plot + ggplot2::scale_fill_gradient2()
110
+    }
111
+
112
+    return(plot)
113
+  }
66 114
 }
67
-  
115
+
68 116
new file mode 100644
... ...
@@ -0,0 +1,50 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/plot_clustering.R
3
+\name{plot_cluster_factors}
4
+\alias{plot_cluster_factors}
5
+\title{Visualization of pairs of cluster factor}
6
+\usage{
7
+plot_cluster_factors(
8
+  ccdb,
9
+  factors,
10
+  type = c("heatmap", "network"),
11
+  statistic = c("pearson", "contigs"),
12
+  ncluster = 0,
13
+  chaintype
14
+)
15
+}
16
+\arguments{
17
+\item{ccdb}{A ContigCellDB object.}
18
+
19
+\item{factors}{\code{character} length 2 of fields present}
20
+
21
+\item{type}{Type of visualization, a heatmap or a node-edge network plot}
22
+
23
+\item{statistic}{Cluster characteristics visualized by pearson residuals or raw contig counts}
24
+
25
+\item{ncluster}{\code{integer}.  Omit factors that occur less than \code{nclusters}. For clarity of visualization.}
26
+
27
+\item{chaintype}{Character in ccdb$contig_tbl$chain. If passed will subset contigs belonging to specified chain (IGH,IGK,IGL,TRA,TRB)}
28
+}
29
+\value{
30
+A ggraph object if type == 'network', and a ggplot object if type == 'heatmap'
31
+}
32
+\description{
33
+With \code{factors}, a pair of variables present in the \code{contig_tbl} and the \code{cluster_tbl},
34
+generate and plot cross-tabs of the number of contigs, or its pearson residual.
35
+}
36
+\examples{
37
+library(ggraph)
38
+data(ccdb_ex)
39
+ccdb_germline_ex = cluster_germline(ccdb_ex, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx')
40
+ccdb_germline_ex = fine_clustering(ccdb_germline_ex, sequence_key = 'cdr3_nt', type = 'DNA')
41
+plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'),
42
+statistic = 'pearson', type = 'network' ,ncluster = 10, chaintype = 'TRB')
43
+plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'),
44
+statistic = 'contigs', type = 'heatmap')
45
+plot_cluster_factors(ccdb_germline_ex,factors = c('v_gene','j_gene'),
46
+statistic = 'contigs', type = 'network', ncluster = 10)
47
+}
48
+\seealso{
49
+canonicalize_cluster to "roll-up" additional contig variables into the `cluster_tbl``
50
+}
0 51
deleted file mode 100644
... ...
@@ -1,40 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/plot_clustering.R
3
-\name{plot_clustering}
4
-\alias{plot_clustering}
5
-\title{Visualization of clustering characteristics for a ContigCellDB object with inherited fine_clustering object}
6
-\usage{
7
-plot_clustering(
8
-  ccdb,
9
-  cluster_factors,
10
-  type = c("heatmap", "network"),
11
-  statistic = c("pearson", "contigs"),
12
-  ncluster,
13
-  chaintype
14
-)
15
-}
16
-\arguments{
17
-\item{ccdb}{A ContigCellDB object.}
18
-
19
-\item{cluster_factors}{A character vector of variables in ccdb$contig table to visualize clusters by}
20
-
21
-\item{type}{Type of visualization, a heatmap or a node-edge network plot}
22
-
23
-\item{statistic}{Cluster characteristics visualized by pearson residuals or raw contig counts}
24
-
25
-\item{ncluster}{Optional integer. Only plot characteristics that have at least nclusters (for clarity of visualization)}
26
-
27
-\item{chaintype}{Character in ccdb$contig_tbl$chain. If passed will subset contigs belonging to specified chain (IGH,IGK,IGL,TRA,TRB)}
28
-}
29
-\value{
30
-A ggraph object if type == 'network', and a ggplot object if type == 'heatmap'
31
-}
32
-\description{
33
-Visualization of clustering characteristics for a ContigCellDB object with inherited fine_clustering object
34
-}
35
-\examples{
36
-data(ccdb_ex)
37
-ccdb_germline_ex = cluster_germline(ccdb_ex, segment_keys = c('v_gene', 'j_gene', 'chain'), cluster_pk = 'segment_idx')
38
-ccdb_germline_ex = fine_clustering(germline_cluster, sequence_key = 'cdr3_nt', type = 'DNA')
39
-plot_clustering(ccdb_germline_ex,cluster_factors = c('v_gene','j_gene'), statistic = 'pearson', type = 'network' ,ncluster = 50, chaintype = 'IGH')
40
-}
... ...
@@ -111,16 +111,16 @@ We can cluster by any other feature of the contigs. Here we cluster each contig
111 111
 
112 112
 ```{r}
113 113
 germline_cluster = fine_clustering(germline_cluster, sequence_key = 'cdr3_nt', type = 'DNA')
114
-ggplot(germline_cluster$cluster_tbl %>% filter(chain == 'TRB'), aes(x = v_gene, y = j_gene, fill = n_cluster)) + geom_tile() + theme(axis.text.x = element_text(angle = 90))
114
+filter_cdb(germline_cluster, chain == 'TRB') %>% plot_cluster_factors(factors = c('v_gene','j_gene'), statistic = 'contigs', type = 'heatmap')
115 115
 ```
116 116
 
117
-Number of pairs
117
+Number of pairs.  The pearson residual (showing the difference from expected counts given marginals) is probably more informative, set `statistic = 'residual'` for this.
118 118
 
119 119
 ```{r}
120 120
 ggplot(germline_cluster$cluster_tbl %>% filter(chain == 'TRB'), aes(x = v_gene, y = j_gene, fill = avg_distance)) + geom_tile() + theme(axis.text.x = element_text(angle = 90))
121 121
 ```
122 122
 
123
-Average Levenshtein distance of CDR3 within each pair.  This might be turned into a z-score by fitting a weighted linear model with sum-to-zero contrasts and returning the pearson residuals.  This could determine if a pairing has an unexpected small, or large, within cluster distance.
123
+Average Levenshtein distance of CDR3 within each pair.  This might be turned into a z-score by fitting a weighted linear model with sum-to-zero contrasts and returning the studentized residuals.  This could determine if a pairing has an unexpected small, or large, within cluster distance.
124 124
 
125 125
 
126 126