... | ... |
@@ -1,25 +1,30 @@ |
1 | 1 |
Package: CellaRepertorium |
2 | 2 |
Type: Package |
3 | 3 |
Title: Methods for clustering and analyzing high-throughput single cell immune cell repertoires (RepSeq) |
4 |
-Version: 0.1.0 |
|
4 |
+Version: 0.2.0 |
|
5 | 5 |
Author: Andrew McDavid |
6 | 6 |
Maintainer: Andrew McDavid <Andrew_McDavid@urmc.rochester.edu> |
7 | 7 |
Description: Methods to cluster and analyze high-throughput single cell immune cell repertoires, |
8 | 8 |
especially from the 10X Genomics VDJ solution. |
9 |
+ Contains an R interface to CD-HIT (Li and Godzik 2006). |
|
9 | 10 |
Tests for specific expansion, as well as omnibus oligoclonality under hypergeometric models. |
10 | 11 |
License: GPL-3 |
11 | 12 |
Encoding: UTF-8 |
12 | 13 |
LazyData: true |
13 | 14 |
Imports: |
14 |
- httr, |
|
15 |
- readr, |
|
16 | 15 |
dplyr, |
17 | 16 |
tibble, |
18 | 17 |
stringr, |
19 | 18 |
Biostrings, |
20 |
- progress, |
|
21 |
- Rcpp |
|
22 |
-Suggests: testthat |
|
19 |
+ Rcpp, |
|
20 |
+ reshape2, |
|
21 |
+ methods, |
|
22 |
+ rlang, |
|
23 |
+ purrr, |
|
24 |
+ Matrix |
|
25 |
+Suggests: |
|
26 |
+ testthat, |
|
27 |
+ readr |
|
23 | 28 |
RoxygenNote: 6.1.0 |
24 | 29 |
LinkingTo: Rcpp |
25 | 30 |
NeedsCompilation: yes |
... | ... |
@@ -5,15 +5,33 @@ export(cluster_permute_test) |
5 | 5 |
export(entropy) |
6 | 6 |
export(fancy_name_contigs) |
7 | 7 |
export(fine_cluster) |
8 |
+export(get_canonical_chain) |
|
9 |
+export(get_canonical_representative) |
|
8 | 10 |
export(modal_category) |
9 | 11 |
export(np) |
10 |
-export(query_cdhit) |
|
11 |
-export(read_cdhit) |
|
12 |
+export(pairing_tables) |
|
12 | 13 |
import(Biostrings) |
13 |
-import(httr) |
|
14 |
-import(stringr) |
|
15 | 14 |
importFrom(dplyr,"%>%") |
15 |
+importFrom(dplyr,anti_join) |
|
16 |
+importFrom(dplyr,bind_rows) |
|
17 |
+importFrom(dplyr,group_by) |
|
18 |
+importFrom(dplyr,left_join) |
|
19 |
+importFrom(dplyr,mutate) |
|
20 |
+importFrom(dplyr,summarize) |
|
21 |
+importFrom(dplyr,ungroup) |
|
22 |
+importFrom(methods,as) |
|
23 |
+importFrom(rlang,":=") |
|
24 |
+importFrom(rlang,sym) |
|
25 |
+importFrom(rlang,syms) |
|
26 |
+importFrom(stats,as.dist) |
|
27 |
+importFrom(stats,dist) |
|
28 |
+importFrom(stats,hclust) |
|
29 |
+importFrom(stats,na.fail) |
|
30 |
+importFrom(stats,sd) |
|
31 |
+importFrom(stringr,str_c) |
|
32 |
+importFrom(stringr,str_length) |
|
33 |
+importFrom(stringr,str_replace_all) |
|
34 |
+importFrom(tibble,as_data_frame) |
|
16 | 35 |
importFrom(tibble,data_frame) |
17 | 36 |
importFrom(utils,data) |
18 |
-importFrom(utils,download.file) |
|
19 | 37 |
useDynLib(CellaRepertorium) |
... | ... |
@@ -1,6 +1,6 @@ |
1 | 1 |
# CellaRepertorium |
2 | 2 |
|
3 |
-These package contains methods for clustering and analyzing single cell rep seq data, especially as generated by [10X genomics VDJ solution](https://support.10xgenomics.com/single-cell-vdj). |
|
3 |
+This package contains methods for clustering and analyzing single cell RepSeq data, especially as generated by the [10X Genomics VDJ solution](https://support.10xgenomics.com/single-cell-vdj). |
|
4 | 4 |
|
5 | 5 |
## Functions |
6 | 6 |
|
... | ... |
@@ -4,21 +4,26 @@ |
4 | 4 |
\alias{cluster_permute_test} |
5 | 5 |
\title{Tests for independence between clusters and labels using permutation} |
6 | 6 |
\usage{ |
7 |
-cluster_permute_test(clusters, labels, statistic, n_perm = 1000, ...) |
|
7 |
+cluster_permute_test(clusters, labels, covariates, statistic, |
|
8 |
+ n_perm = 1000, alternative = c("two.sided", "less", "greater"), ...) |
|
8 | 9 |
} |
9 | 10 |
\arguments{ |
10 | 11 |
\item{clusters}{\code{factor} of length n} |
11 | 12 |
|
12 | 13 |
\item{labels}{\code{factor} of length n} |
13 | 14 |
|
14 |
-\item{statistic}{function of \code{clusters} and \code{labels} that returns a \code{numeric} of length 1. Currently, a two-tailed test is always run.} |
|
15 |
+\item{covariates}{optional \code{factor} of length n.} |
|
16 |
+ |
|
17 |
+\item{statistic}{function of \code{clusters}, \code{labels} and optionally \code{covariates} that returns a \code{numeric} of length 1.} |
|
15 | 18 |
|
16 | 19 |
\item{n_perm}{number of permutations.} |
17 | 20 |
|
21 |
+\item{alternative}{\code{character} naming the direction in which \code{statistic} should fall under the alternative hypothesis} |
|
22 |
+ |
|
18 | 23 |
\item{...}{passed to \code{statistic}} |
19 | 24 |
} |
20 | 25 |
\value{ |
21 |
-a list containing the observed value of the statistic, its expectation (under independence), a two-sided p-value, and the monte carlo standard error (of the expected value). |
|
26 |
+a list containing the observed value of the statistic, its expectation (under independence), a p-value, and the Monte Carlo standard error (of the expected value). |
|
22 | 27 |
} |
23 | 28 |
\description{ |
24 | 29 |
This tests a statistic for association between clusters and labels by permuting the link between the two |
... | ... |
@@ -28,12 +33,15 @@ For example, \code{clusters} could be a clonal ID, while \code{labels} is a subj |
28 | 33 |
} |
29 | 34 |
\examples{ |
30 | 35 |
library(dplyr) |
31 |
-purity = function(clusters, labels){ |
|
32 |
-n_label_cluster = data_frame(labels = labels, clusters = clusters) \%>\% group_by(clusters, si) \%>\% summarize(n = n()) \%>\% ungroup() |
|
36 |
+purity = function(clusters, labels, covariates){ |
|
37 |
+n_label_cluster = data_frame(labels = labels, clusters = clusters) \%>\% group_by(clusters, labels) \%>\% summarize(n = n()) \%>\% ungroup() |
|
33 | 38 |
singletons = mean(n_label_cluster$n == 1) |
34 | 39 |
singletons |
35 | 40 |
} |
36 | 41 |
|
37 | 42 |
clusters = c(1, 1, 1, 2, 2, 3, 3) |
43 |
+labels = c('A', 'A', 'B', 'B', 'B', 'C', 'C') |
|
44 |
+covariates = c('X', 'X', 'Y', 'Y', 'Y', 'Y', 'Y') |
|
45 |
+cluster_permute_test(clusters, labels, statistic = purity, n_perm = 50) |
|
38 | 46 |
|
39 | 47 |
} |
... | ... |
@@ -5,8 +5,8 @@ |
5 | 5 |
\title{Calculate distances and perform hierarchical clustering on a set of sequences} |
6 | 6 |
\usage{ |
7 | 7 |
fine_cluster(seqs, type = "AA", big_memory_brute = FALSE, |
8 |
- substitution_matrix = "BLOSUM100", cluster = "hclust", |
|
9 |
- cluster_method = "complete") |
|
8 |
+ method = "levenshtein", substitution_matrix = "BLOSUM100", |
|
9 |
+ cluster = "hclust", cluster_method = "complete") |
|
10 | 10 |
} |
11 | 11 |
\arguments{ |
12 | 12 |
\item{seqs}{character vector, DNAStringSet or AAStringSet} |
... | ... |
@@ -15,7 +15,9 @@ fine_cluster(seqs, type = "AA", big_memory_brute = FALSE, |
15 | 15 |
|
16 | 16 |
\item{big_memory_brute}{attempt to cluster more than 4000 sequences? Clustering is quadratic, so this will take a long time and might exhaust memory} |
17 | 17 |
|
18 |
-\item{substitution_matrix}{a character vector naming a substition matrix used to weight} |
|
18 |
+\item{method}{one of 'substitutionMatrix' or 'levenshtein'} |
|
19 |
+ |
|
20 |
+\item{substitution_matrix}{a character vector naming a substitution matrix available in Biostrings, or a substitution matrix itself} |
|
19 | 21 |
|
20 | 22 |
\item{cluster_method}{character passed to `hclust`} |
21 | 23 |
} |
... | ... |
@@ -30,7 +32,7 @@ The distances between nucleotide sequences is defined to be edit_distance/max(ed |
30 | 32 |
fasta_path = system.file('extdata', 'demo.fasta', package='CellaRepertorium') |
31 | 33 |
aaseq = Biostrings::readAAStringSet(fasta_path)[1:100] |
32 | 34 |
cls = fine_cluster(aaseq) |
33 |
-plot(cls) |
|
35 |
+plot(cls$cluster) |
|
34 | 36 |
} |
35 | 37 |
\seealso{ |
36 | 38 |
hclust, stringDist |
37 | 39 |
deleted file mode 100644 |
... | ... |
@@ -1,38 +0,0 @@ |
1 |
-% Generated by roxygen2: do not edit by hand |
|
2 |
-% Please edit documentation in R/cdhit-methods.R |
|
3 |
-\name{query_cdhit} |
|
4 |
-\alias{query_cdhit} |
|
5 |
-\title{Query the web interface to cdhit} |
|
6 |
-\usage{ |
|
7 |
-query_cdhit(sequences, results_path = NULL, identity_cutoff = 0.9, |
|
8 |
- bandwidth = 20, results_timeout = 120) |
|
9 |
-} |
|
10 |
-\arguments{ |
|
11 |
-\item{sequences}{An object of class \code{AAStringSet}} |
|
12 |
- |
|
13 |
-\item{results_path}{If non-null, a \code{character} specifying a path to a results. Directories therein must exist. If null, the R temporary directory will be used.} |
|
14 |
- |
|
15 |
-\item{identity_cutoff}{minimum identity to be clustered together} |
|
16 |
- |
|
17 |
-\item{bandwidth}{see CDhit docs} |
|
18 |
- |
|
19 |
-\item{results_timeout}{number of seconds to wait for result} |
|
20 |
-} |
|
21 |
-\value{ |
|
22 |
-path to results file |
|
23 |
-} |
|
24 |
-\description{ |
|
25 |
-This sends a query to \url{http://weizhong-lab.ucsd.edu/cdhit-web-server/cgi-bin/index.cgi?cmd=cd-hit} |
|
26 |
-and returns a link to results. |
|
27 |
-} |
|
28 |
-\examples{ |
|
29 |
-\dontrun{ |
|
30 |
-fasta_path = system.file('extdata', 'demo.fasta', package='CellaRepertorium') |
|
31 |
-aaseq = Biostrings::readAAStringSet(fasta_path) |
|
32 |
-results_path = query_cdhit(aaseq[1:100]) |
|
33 |
-read_cdhit(results_path) |
|
34 |
-} |
|
35 |
-} |
|
36 |
-\seealso{ |
|
37 |
-AAStringSet |
|
38 |
-} |
39 | 0 |
deleted file mode 100644 |
... | ... |
@@ -1,22 +0,0 @@ |
1 |
-% Generated by roxygen2: do not edit by hand |
|
2 |
-% Please edit documentation in R/cdhit-methods.R |
|
3 |
-\name{read_cdhit} |
|
4 |
-\alias{read_cdhit} |
|
5 |
-\title{Parse a CD-hit results file as a data.frame} |
|
6 |
-\usage{ |
|
7 |
-read_cdhit(file) |
|
8 |
-} |
|
9 |
-\arguments{ |
|
10 |
-\item{file}{path to a file with cdhit results} |
|
11 |
-} |
|
12 |
-\value{ |
|
13 |
-\code{data.frame} with columns |
|
14 |
-} |
|
15 |
-\description{ |
|
16 |
-Parses a cd-hit (\url{http://weizhong-cluster.ucsd.edu/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit}) cluster file and returns it as a \code{data.frame}. |
|
17 |
-The \code{data.frame} contains the columns `cluster_idx`: the cluster number, `query_idx`: the fasta identifier of the query sequence, `homology_pct`: the percent similarity to the exemplar sequence, `member_idx`, an arbitrary enumeration of each sequence in the a cluster, `len`: the length of the query sequence. |
|
18 |
-} |
|
19 |
-\examples{ |
|
20 |
-demo_path = system.file('extdata', 'demo150.clstr.sorted', package='CellaRepertorium') |
|
21 |
-out = read_cdhit(demo_path) |
|
22 |
-} |