Browse code

Update docs

Andrew McDavid authored on 14/03/2019 22:53:22
Showing 7 changed files

... ...
@@ -1,25 +1,30 @@
1 1
 Package: CellaRepertorium
2 2
 Type: Package
3 3
 Title: Methods for clustering and analyzing high-throughput single cell immune cell repertoires (RepSeq)
4
-Version: 0.1.0
4
+Version: 0.2.0
5 5
 Author: Andrew McDavid
6 6
 Maintainer: Andrew McDavid <Andrew_McDavid@urmc.rochester.edu>
7 7
 Description: Methods to cluster and analyze high-throughput single cell immune cell repertoires,
8 8
     especially from the 10X Genomics VDJ solution. 
9
+    Contains an R interface to CD-HIT (Li and Godzik 2006).
9 10
     Tests for specific expansion, as well as omnibus oligoclonality under hypergeometric models.
10 11
 License: GPL-3
11 12
 Encoding: UTF-8
12 13
 LazyData: true
13 14
 Imports:
14
-   httr,
15
-   readr,
16 15
    dplyr,
17 16
    tibble,
18 17
    stringr,
19 18
    Biostrings,
20
-   progress,
21
-   Rcpp
22
-Suggests: testthat
19
+   Rcpp,
20
+   reshape2,
21
+   methods,
22
+   rlang,
23
+   purrr,
24
+   Matrix
25
+Suggests: 
26
+   testthat,
27
+   readr
23 28
 RoxygenNote: 6.1.0
24 29
 LinkingTo: Rcpp
25 30
 NeedsCompilation: yes
... ...
@@ -5,15 +5,33 @@ export(cluster_permute_test)
5 5
 export(entropy)
6 6
 export(fancy_name_contigs)
7 7
 export(fine_cluster)
8
+export(get_canonical_chain)
9
+export(get_canonical_representative)
8 10
 export(modal_category)
9 11
 export(np)
10
-export(query_cdhit)
11
-export(read_cdhit)
12
+export(pairing_tables)
12 13
 import(Biostrings)
13
-import(httr)
14
-import(stringr)
15 14
 importFrom(dplyr,"%>%")
15
+importFrom(dplyr,anti_join)
16
+importFrom(dplyr,bind_rows)
17
+importFrom(dplyr,group_by)
18
+importFrom(dplyr,left_join)
19
+importFrom(dplyr,mutate)
20
+importFrom(dplyr,summarize)
21
+importFrom(dplyr,ungroup)
22
+importFrom(methods,as)
23
+importFrom(rlang,":=")
24
+importFrom(rlang,sym)
25
+importFrom(rlang,syms)
26
+importFrom(stats,as.dist)
27
+importFrom(stats,dist)
28
+importFrom(stats,hclust)
29
+importFrom(stats,na.fail)
30
+importFrom(stats,sd)
31
+importFrom(stringr,str_c)
32
+importFrom(stringr,str_length)
33
+importFrom(stringr,str_replace_all)
34
+importFrom(tibble,as_data_frame)
16 35
 importFrom(tibble,data_frame)
17 36
 importFrom(utils,data)
18
-importFrom(utils,download.file)
19 37
 useDynLib(CellaRepertorium)
... ...
@@ -1,6 +1,6 @@
1 1
 # CellaRepertorium
2 2
 
3
-These package contains methods for clustering and analyzing single cell rep seq data, especially as generated by [10X genomics VDJ solution](https://support.10xgenomics.com/single-cell-vdj).  
3
+This package contains methods for clustering and analyzing single cell RepSeq data, especially as generated by the [10X Genomics VDJ solution](https://support.10xgenomics.com/single-cell-vdj).  
4 4
 
5 5
 ## Functions
6 6
 
... ...
@@ -4,21 +4,26 @@
4 4
 \alias{cluster_permute_test}
5 5
 \title{Tests for independence between clusters and labels using permutation}
6 6
 \usage{
7
-cluster_permute_test(clusters, labels, statistic, n_perm = 1000, ...)
7
+cluster_permute_test(clusters, labels, covariates, statistic,
8
+  n_perm = 1000, alternative = c("two.sided", "less", "greater"), ...)
8 9
 }
9 10
 \arguments{
10 11
 \item{clusters}{\code{factor} of length n}
11 12
 
12 13
 \item{labels}{\code{factor} of length n}
13 14
 
14
-\item{statistic}{function of \code{clusters} and \code{labels} that returns a \code{numeric} of length 1.  Currently, a two-tailed test is always run.}
15
+\item{covariates}{optional \code{factor} of length n.}
16
+
17
+\item{statistic}{function of \code{clusters}, \code{labels} and optionally \code{covariates} that returns a \code{numeric} of length 1.}
15 18
 
16 19
 \item{n_perm}{number of permutations.}
17 20
 
21
+\item{alternative}{\code{character} naming the direction in which \code{statistic} should fall under the alternative hypothesis}
22
+
18 23
 \item{...}{passed to \code{statistic}}
19 24
 }
20 25
 \value{
21
-a list containing the observed value of the statistic, its expectation (under independence), a two-sided p-value, and the monte carlo standard error (of the expected value).
26
+a list containing the observed value of the statistic, its expectation (under independence), a p-value, and the Monte Carlo standard error (of the expected value).
22 27
 }
23 28
 \description{
24 29
 This tests a statistic for association between clusters and labels by permuting the link between the two
... ...
@@ -28,12 +33,15 @@ For example, \code{clusters} could be a clonal ID, while \code{labels} is a subj
28 33
 }
29 34
 \examples{
30 35
 library(dplyr)
31
-purity = function(clusters, labels){
32
-n_label_cluster = data_frame(labels = labels, clusters = clusters) \%>\% group_by(clusters, si) \%>\% summarize(n = n()) \%>\% ungroup()
36
+purity = function(clusters, labels, covariates){
37
+n_label_cluster = data_frame(labels = labels, clusters = clusters) \%>\% group_by(clusters, labels) \%>\% summarize(n = n()) \%>\% ungroup()
33 38
 singletons = mean(n_label_cluster$n == 1)
34 39
 singletons
35 40
 }
36 41
 
37 42
 clusters = c(1, 1, 1, 2, 2, 3, 3)
43
+labels = c('A', 'A', 'B', 'B', 'B', 'C', 'C')
44
+covariates = c('X', 'X', 'Y', 'Y', 'Y', 'Y', 'Y')
45
+cluster_permute_test(clusters, labels, statistic = purity, n_perm  = 50)
38 46
 
39 47
 }
... ...
@@ -5,8 +5,8 @@
5 5
 \title{Calculate distances and perform hierarchical clustering on a set of sequences}
6 6
 \usage{
7 7
 fine_cluster(seqs, type = "AA", big_memory_brute = FALSE,
8
-  substitution_matrix = "BLOSUM100", cluster = "hclust",
9
-  cluster_method = "complete")
8
+  method = "levenshtein", substitution_matrix = "BLOSUM100",
9
+  cluster = "hclust", cluster_method = "complete")
10 10
 }
11 11
 \arguments{
12 12
 \item{seqs}{character vector, DNAStringSet or AAStringSet}
... ...
@@ -15,7 +15,9 @@ fine_cluster(seqs, type = "AA", big_memory_brute = FALSE,
15 15
 
16 16
 \item{big_memory_brute}{attempt to cluster more than 4000 sequences?  Clustering is quadratic, so this will take a long time and might exhaust memory}
17 17
 
18
-\item{substitution_matrix}{a character vector naming a substition matrix used to weight}
18
+\item{method}{one of 'substitutionMatrix' or 'levenshtein'}
19
+
20
+\item{substitution_matrix}{a character vector naming a substitution matrix available in Biostrings, or a substitution matrix itself}
19 21
 
20 22
 \item{cluster_method}{character passed to `hclust`}
21 23
 }
... ...
@@ -30,7 +32,7 @@ The distances between nucleotide sequences is defined to be edit_distance/max(ed
30 32
 fasta_path = system.file('extdata', 'demo.fasta', package='CellaRepertorium')
31 33
 aaseq = Biostrings::readAAStringSet(fasta_path)[1:100]
32 34
 cls = fine_cluster(aaseq)
33
-plot(cls)
35
+plot(cls$cluster)
34 36
 }
35 37
 \seealso{
36 38
 hclust, stringDist
37 39
deleted file mode 100644
... ...
@@ -1,38 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/cdhit-methods.R
3
-\name{query_cdhit}
4
-\alias{query_cdhit}
5
-\title{Query the web interface to cdhit}
6
-\usage{
7
-query_cdhit(sequences, results_path = NULL, identity_cutoff = 0.9,
8
-  bandwidth = 20, results_timeout = 120)
9
-}
10
-\arguments{
11
-\item{sequences}{An object of class \code{AAStringSet}}
12
-
13
-\item{results_path}{If non-null, a \code{character} specifying a path to a results.  Directories therein must exist. If null, the R temporary directory will be used.}
14
-
15
-\item{identity_cutoff}{minimum identity to be clustered together}
16
-
17
-\item{bandwidth}{see CDhit docs}
18
-
19
-\item{results_timeout}{number of seconds to wait for result}
20
-}
21
-\value{
22
-path to results file
23
-}
24
-\description{
25
-This sends a query to \url{http://weizhong-lab.ucsd.edu/cdhit-web-server/cgi-bin/index.cgi?cmd=cd-hit}
26
-and returns a link to results.
27
-}
28
-\examples{
29
-\dontrun{
30
-fasta_path = system.file('extdata', 'demo.fasta', package='CellaRepertorium')
31
-aaseq = Biostrings::readAAStringSet(fasta_path)
32
-results_path = query_cdhit(aaseq[1:100])
33
-read_cdhit(results_path)
34
-}
35
-}
36
-\seealso{
37
-AAStringSet
38
-}
39 0
deleted file mode 100644
... ...
@@ -1,22 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/cdhit-methods.R
3
-\name{read_cdhit}
4
-\alias{read_cdhit}
5
-\title{Parse a CD-hit results file as a data.frame}
6
-\usage{
7
-read_cdhit(file)
8
-}
9
-\arguments{
10
-\item{file}{path to a file with cdhit results}
11
-}
12
-\value{
13
-\code{data.frame} with columns
14
-}
15
-\description{
16
-Parses a cd-hit (\url{http://weizhong-cluster.ucsd.edu/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit}) cluster file and returns it as a \code{data.frame}.
17
-The \code{data.frame} contains the columns `cluster_idx`: the cluster number, `query_idx`: the fasta identifier of the query sequence, `homology_pct`: the percent similarity to the exemplar sequence, `member_idx`, an arbitrary enumeration of each sequence in the a cluster, `len`: the length of the query sequence.
18
-}
19
-\examples{
20
-demo_path = system.file('extdata', 'demo150.clstr.sorted', package='CellaRepertorium')
21
-out = read_cdhit(demo_path)
22
-}