git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@81628 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -14,16 +14,14 @@ |
14 | 14 |
} |
15 | 15 |
|
16 | 16 |
\usage{ |
17 |
- BamTallyParam(genome, which = RangesList(), |
|
17 |
+ BamTallyParam(genome, which = GRanges(), |
|
18 | 18 |
desired_read_group = NULL, |
19 |
- read_pos_breaks = NULL, |
|
20 |
- high_base_quality = 0L, |
|
21 | 19 |
minimum_mapq = 0L, |
22 | 20 |
concordant_only = FALSE, unique_only = FALSE, |
23 | 21 |
primary_only = FALSE, ignore_duplicates = FALSE, |
24 | 22 |
min_depth = 0L, variant_strand = 0L, |
25 | 23 |
ignore_query_Ns = FALSE, |
26 |
- indels = TRUE) |
|
24 |
+ indels = FALSE) |
|
27 | 25 |
} |
28 | 26 |
\arguments{ |
29 | 27 |
\item{genome}{A \code{GmapGenome} object, or something coercible to one.} |
... | ... |
@@ -33,11 +31,6 @@ |
33 | 31 |
} |
34 | 32 |
\item{desired_read_group}{The name of the read group to which to limit |
35 | 33 |
the tallying; if not NULL, must be a single, non-NA string.} |
36 |
- \item{read_pos_breaks}{The breaks, like those passed to \code{\link{cut}} |
|
37 |
- for aggregating the per-read position counts. If \code{NULL}, no per-cycle |
|
38 |
- counts are returned.} |
|
39 |
- \item{high_base_quality}{The minimum mapping quality for a |
|
40 |
- read to be counted as high quality.} |
|
41 | 34 |
\item{minimum_mapq}{Minimum mapping quality for a read to be counted |
42 | 35 |
at all.} |
43 | 36 |
\item{concordant_only}{Consider only what gnsap |
... | ... |
@@ -12,54 +12,73 @@ |
12 | 12 |
} |
13 | 13 |
|
14 | 14 |
\usage{ |
15 |
-GsnapParam(genome, unique_only = FALSE, |
|
16 |
- max_mismatches = NULL, suboptimal_levels = 0L, |
|
17 |
- mode = "standard", snps = NULL, |
|
18 |
- npaths = if (unique_only) 1L else 100L, |
|
19 |
- quiet_if_excessive = unique_only, nofails = unique_only, |
|
20 |
- split_output = !unique_only, |
|
21 |
- novelsplicing = FALSE, splicing = NULL, |
|
22 |
- nthreads = 1L, part = NULL, batch = "2", ...) |
|
15 |
+GsnapParam(genome, unique_only = FALSE, molecule = c("RNA", "DNA"), |
|
16 |
+ max_mismatches = NULL, |
|
17 |
+ suboptimal_levels = 0L, mode = "standard", |
|
18 |
+ snps = NULL, |
|
19 |
+ npaths = if (unique_only) 1L else 100L, |
|
20 |
+ quiet_if_excessive = unique_only, nofails = unique_only, |
|
21 |
+ split_output = !unique_only, |
|
22 |
+ novelsplicing = FALSE, splicing = NULL, |
|
23 |
+ nthreads = 1L, part = NULL, batch = "2", |
|
24 |
+ terminal_threshold = if (molecule == "DNA") 1000L else 2L, |
|
25 |
+ gmap_mode = if (molecule == "DNA") "none" else |
|
26 |
+ "pairsearch,terminal,improve", ...) |
|
23 | 27 |
} |
24 | 28 |
|
25 | 29 |
\arguments{ |
26 |
-\item{genome}{A GmapGenome object to align against} |
|
27 |
-\item{unique_only}{Whether only alignments with a unique match should be |
|
28 |
- output. The default is FALSE.} |
|
29 |
-\item{max_mismatches}{The maximum number of mismatches to allow per |
|
30 |
- alignment. If NULL, then the value defaults to ((readlength + 2) / 12 - 2))} |
|
31 |
-\item{suboptimal_levels}{Report suboptimal hits beyond best hit. The |
|
32 |
- default is 0L.} |
|
33 |
-\item{mode}{The alignment mode. It can be "standard", "cmet-stranded", |
|
34 |
- "cmet-nonstranded", "atoi-stranded", or "atoi-nonstranded". The default |
|
35 |
- is "standard".} |
|
36 |
-\item{snps}{If not NULL, then a GmapSnps object. Provided SNPs will not |
|
37 |
- count as mismatches.} |
|
38 |
-\item{npaths}{The maximum number of paths to print.} |
|
39 |
-\item{quiet_if_excessive}{If more than maximum number of paths are |
|
40 |
- found, then no alignment from the read will be in the output.} |
|
41 |
-\item{nofails}{Exclude failed alignments from output} |
|
42 |
-\item{split_output}{Basename for multiple-file output, separately for nomapping, |
|
43 |
- halfmapping_uniq, halfmapping_mult, unpaired_uniq, unpaired_mult, paired_uniq, |
|
44 |
- paired_mult, concordant_uniq, and concordant_mult results (up to 9 |
|
45 |
- files, or 10 if --fails-as-input is selected, or 3 for single-end reads)} |
|
46 |
-\item{novelsplicing}{Logical indicating whether to look for novel |
|
47 |
- splicing events. FALSE is the default.} |
|
48 |
-\item{splicing}{If not NULL, a GmapSplices object. NULL is the default.} |
|
49 |
-\item{nthreads}{The number of worker threads gsnap should use to align.} |
|
50 |
-\item{part}{If not NULL, then process only the i-th out of every n |
|
51 |
- sequences e.g., 0/100 or 99/100 (useful for distributing jobs to a |
|
52 |
- computer farm). If NULL, then all sequences are processed. NULL is the default.} |
|
53 |
-\item{batch}{This argument allows control over gsnap's memory mapping |
|
54 |
- and allocation. The default is mode 2. |
|
55 |
- Mode 0: \{offsets=allocate, positions=mmap, genome=mmap\}, |
|
56 |
- Mode 1: \{offsets=allocate, positions=mmap & preload,genome=mmap\}, |
|
57 |
- Mode 2: \{offsets=allocate, positions=mmap & preload,genome=mmap & |
|
58 |
- preload\}, |
|
59 |
- Mode 3: \{offsets=allocate, positions=allocate,genome=mmap & preload\}, |
|
60 |
- Mode 4: \{offsets=allocate, positions=allocate,genome=allocate\}} |
|
30 |
+ \item{genome}{A GmapGenome object to align against} |
|
31 |
+ \item{unique_only}{Whether only alignments with a unique match should be |
|
32 |
+ output. The default is FALSE.} |
|
33 |
+ \item{molecule}{The type of molecule sequenced; used to determine |
|
34 |
+ appropriate parameter defaults.} |
|
35 |
+ \item{max_mismatches}{The maximum number of mismatches to allow per |
|
36 |
+ alignment. If NULL, then the value defaults to ((readlength + 2) / 12 - 2))} |
|
37 |
+ \item{suboptimal_levels}{Report suboptimal hits beyond best hit. The |
|
38 |
+ default is 0L.} |
|
39 |
+ \item{mode}{The alignment mode. It can be "standard", "cmet-stranded", |
|
40 |
+ "cmet-nonstranded", "atoi-stranded", or "atoi-nonstranded". The default |
|
41 |
+ is "standard".} |
|
42 |
+ \item{snps}{If not NULL, then a GmapSnps object. Provided SNPs will not |
|
43 |
+ count as mismatches.} |
|
44 |
+ \item{npaths}{The maximum number of paths to print.} |
|
45 |
+ \item{quiet_if_excessive}{If more than maximum number of paths are |
|
46 |
+ found, then no alignment from the read will be in the output.} |
|
47 |
+ \item{nofails}{Exclude failed alignments from output} |
|
48 |
+ \item{split_output}{Basename for multiple-file output, separately for |
|
49 |
+ nomapping, halfmapping_uniq, halfmapping_mult, unpaired_uniq, |
|
50 |
+ unpaired_mult, paired_uniq, paired_mult, concordant_uniq, and |
|
51 |
+ concordant_mult results (up to 9 files, or 10 if --fails-as-input is |
|
52 |
+ selected, or 3 for single-end reads)} |
|
53 |
+ \item{novelsplicing}{Logical indicating whether to look for novel |
|
54 |
+ splicing events. FALSE is the default.} |
|
55 |
+ \item{splicing}{If not NULL, a GmapSplices object. NULL is the default.} |
|
56 |
+ \item{nthreads}{The number of worker threads gsnap should use to align.} |
|
57 |
+ \item{part}{If not NULL, then process only the i-th out of every n |
|
58 |
+ sequences e.g., 0/100 or 99/100 (useful for distributing jobs to a |
|
59 |
+ computer farm). If NULL, then all sequences are processed. NULL is |
|
60 |
+ the default.} |
|
61 |
+ \item{batch}{This argument allows control over gsnap's memory mapping |
|
62 |
+ and allocation. The default is mode 2. |
|
63 |
+ Mode 0: \{offsets=allocate, positions=mmap, genome=mmap\}, |
|
64 |
+ Mode 1: \{offsets=allocate, positions=mmap & preload,genome=mmap\}, |
|
65 |
+ Mode 2: \{offsets=allocate, positions=mmap & preload,genome=mmap & |
|
66 |
+ preload\}, |
|
67 |
+ Mode 3: \{offsets=allocate, positions=allocate,genome=mmap & preload\}, |
|
68 |
+ Mode 4: \{offsets=allocate, positions=allocate,genome=allocate\}} |
|
61 | 69 |
\item{...}{Additional parameters for gsnap. See gsnap's full |
62 |
- documentation for those available.} |
|
70 |
+ documentation for those available.} |
|
71 |
+ \item{terminal_threshold}{If this number of mismatches is exceeded, |
|
72 |
+ GSNAP will attempt to align from one of the sequence, eventually |
|
73 |
+ giving up and discarding the rest of the sequence. This is called a |
|
74 |
+ \dQuote{terminal alignment}. By setting this to a high value, we |
|
75 |
+ have effectively disabled it for DNA, since terminal alignments were |
|
76 |
+ motivated by splicing alignment problems and other special cases. |
|
77 |
+ } |
|
78 |
+ \item{gmap_mode}{Specifies the GMAP pipeline executed when GSNAP |
|
79 |
+ delegates to GMAP (a Smith-Waterman aligner) in difficult cases. We |
|
80 |
+ have disabled this for DNA, since such difficult cases are only |
|
81 |
+ anticipated in the context of splicing or complex rearrangements.} |
|
63 | 82 |
} |
64 | 83 |
|
65 | 84 |
\seealso{ |
... | ... |
@@ -4,7 +4,9 @@ |
4 | 4 |
\alias{bam_tally,BamFile-method} |
5 | 5 |
\alias{bam_tally,character-method} |
6 | 6 |
\alias{bam_tally,GmapBamReader-method} |
7 |
+\alias{genome,TallyIIT-method} |
|
7 | 8 |
\alias{bam_tally} |
9 |
+\alias{variantSummary} |
|
8 | 10 |
|
9 | 11 |
\title{Per-position Alignment Summaries} |
10 | 12 |
|
... | ... |
@@ -17,41 +19,46 @@ |
17 | 19 |
\usage{ |
18 | 20 |
\S4method{bam_tally}{BamFile}(x, param, ...) |
19 | 21 |
\S4method{bam_tally}{character}(x, param, ...) |
22 |
+variantSummary(x, read_pos_breaks = NULL, high_base_quality = 0L, |
|
23 |
+ keep_ref_rows = FALSE) |
|
20 | 24 |
} |
21 | 25 |
|
22 | 26 |
\arguments{ |
23 | 27 |
\item{x}{a \code{BamFile} object or string path to a BAM file to read} |
24 | 28 |
\item{param}{The \code{\linkS4class{BamTallyParam}} object with |
25 | 29 |
parameters for the tally operation. } |
30 |
+ \item{read_pos_breaks}{The breaks, like those passed to \code{\link{cut}} |
|
31 |
+ for aggregating the per-read position counts. If \code{NULL}, no per-cycle |
|
32 |
+ counts are returned.} |
|
33 |
+ \item{high_base_quality}{The minimum mapping quality for a |
|
34 |
+ read to be counted as high quality.} |
|
35 |
+ \item{keep_ref_rows}{Whether to keep the rows describing only the |
|
36 |
+ reference calls, i.e., where ref and alt are the same. These are |
|
37 |
+ useful when one needs the reference counts even when there are no |
|
38 |
+ alts at that position.} |
|
26 | 39 |
\item{...}{Arguments that override settings in \code{param}.} |
27 | 40 |
} |
28 | 41 |
|
29 | 42 |
\value{ |
30 |
- A \code{\link[GenomicRanges]{GRanges}}, with a range for each position |
|
31 |
- that passed the filters, and with the following \code{elementMetadata} |
|
32 |
- columns: |
|
33 |
- \item{location}{A string representation of the location, of the form |
|
34 |
- \dQuote{chr:pos}. This makes it easy, e.g., to check for the |
|
35 |
- presence of a variant in another result object.} |
|
36 |
- \item{ref}{The reference base at that position.} |
|
37 |
- \item{alt}{The base for the alternate allele, \code{NA} for the |
|
38 |
- reference allele row.} |
|
43 |
+ The \code{bam_tally} function returns an opaque pointer to a C-level |
|
44 |
+ data structure with the class \dQuote{TallyIIT}. Currently, the only |
|
45 |
+ operation applicable to this object is \code{variantSummary}. |
|
46 |
+ |
|
47 |
+ The \code{variantSummary} function returns |
|
48 |
+ a \code{\link[VariantAnnotation]{VRanges}}, with a range for each position |
|
49 |
+ that passed the filters. The depth columns correspond to the counts |
|
50 |
+ after quality filtering (except for indels, for which there is no |
|
51 |
+ quality filtering). The following \code{elementMetadata} |
|
52 |
+ columns are also present: |
|
39 | 53 |
\item{n.read.pos}{The number of unique cycles at which the alternate allele was |
40 | 54 |
observed, \code{NA} for the reference allele row.} |
41 | 55 |
\item{n.read.pos.ref}{The number of unique cycles at which the reference |
42 | 56 |
allele was observed.} |
43 |
- \item{count}{The number of reads with the alternate allele, |
|
57 |
+ \item{raw.count}{The number of reads with the alternate allele, |
|
44 | 58 |
\code{NA} for the reference allele row.} |
45 |
- \item{count.ref}{The number of reads with the reference allele.} |
|
46 |
- \item{count.total}{The total number of reads at that position, |
|
59 |
+ \item{raw.count.ref}{The number of reads with the reference allele.} |
|
60 |
+ \item{raw.count.total}{The total number of reads at that position, |
|
47 | 61 |
including reference and all alternates.} |
48 |
- \item{high.quality}{The number of reads for the alternate allele that were |
|
49 |
- above \code{high_quality_cutoff}, \code{NA} for the reference allele |
|
50 |
- row.} |
|
51 |
- \item{high.quality.ref}{The number of reads for the reference allele that were |
|
52 |
- above \code{high_quality_cutoff}.} |
|
53 |
- \item{mean.quality}{The mean mapping quality for the alternate allele, |
|
54 |
- \code{NA} for the reference allele row.} |
|
55 | 62 |
\item{mean.quality.ref}{The mean mapping quality for the reference |
56 | 63 |
allele.} |
57 | 64 |
\item{count.pos}{The number of positive strand reads for the alternate |
... | ... |
@@ -62,9 +69,14 @@ |
62 | 69 |
allele, \code{NA} for the reference allele row.} |
63 | 70 |
\item{count.neg.ref}{The number of negative strand reads for the reference |
64 | 71 |
allele.} |
65 |
- |
|
72 |
+ \item{read.pos.var}{Variance in the read positions for the alt allele.} |
|
73 |
+ \item{read.pos.var.ref}{Variance in the read positions for the ref allele.} |
|
74 |
+ |
|
66 | 75 |
An additional column is present for each bin formed by |
67 | 76 |
the \code{read_pos_breaks} parameter, with the read count for that bin. |
68 | 77 |
} |
69 | 78 |
|
79 |
+\seealso{\code{tallyVariants} in the VariantTools package provides a |
|
80 |
+ high-level wrapper for this functionality.} |
|
81 |
+ |
|
70 | 82 |
\author{Michael Lawrence} |