- revise vignette to reflect working with short reads, not alignments
- update example data
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/ShortRead@82095 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,19 +1,24 @@ |
1 | 1 |
Package: ShortRead |
2 | 2 |
Type: Package |
3 |
-Title: Classes and methods for high-throughput short-read sequencing data. |
|
4 |
-Version: 1.21.1 |
|
3 |
+Title: FASTQ input and manipulation |
|
4 |
+Version: 1.21.2 |
|
5 | 5 |
Author: Martin Morgan, Michael Lawrence, Simon Anders |
6 | 6 |
Maintainer: Bioconductor Package Maintainer <maintainer@bioconductor.org> |
7 |
-Description: Base classes, functions, and methods for representation of |
|
8 |
- high-throughput, short-read sequencing data. |
|
7 |
+Description: This package implements sampling, iteration, and input of |
|
8 |
+ FASTQ files. The package includes functions for filtering and |
|
9 |
+ trimming reads, and for generating a quality assessment |
|
10 |
+ report. Data are represented as DNAStringSet-derived objects, and |
|
11 |
+ easily manipulated for a diversity of purposes. The package also |
|
12 |
+ contains legacy support for early single-end, ungapped alignment |
|
13 |
+ formats. |
|
9 | 14 |
License: Artistic-2.0 |
10 | 15 |
LazyLoad: yes |
11 |
-Depends: methods, BiocGenerics (>= 0.1.0), IRanges (>= 1.19.34), |
|
12 |
- GenomicRanges (>= 1.13.43), Biostrings (>= 2.29.18), |
|
13 |
- lattice, Rsamtools (>= 1.13.1) |
|
14 |
-Imports: Biobase, hwriter, zlibbioc, latticeExtra |
|
16 |
+Depends: BiocGenerics (>= 0.1.0), IRanges (>= 1.19.34), |
|
17 |
+ GenomicRanges (>= 1.13.43), Biostrings (>= 2.29.18), |
|
18 |
+ Rsamtools (>= 1.13.1) |
|
19 |
+Imports: Biobase, hwriter, methods, zlibbioc, lattice, latticeExtra |
|
15 | 20 |
Enhances: Rmpi, parallel |
16 |
-Suggests: biomaRt, RUnit, GenomicFeatures, yeastNagalakshmi |
|
21 |
+Suggests: BiocStyle, RUnit, biomaRt, GenomicFeatures, yeastNagalakshmi |
|
17 | 22 |
LinkingTo: IRanges, XVector, Biostrings |
18 | 23 |
biocViews: DataImport, Sequencing, HighThroughputSequencing, |
19 | 24 |
QualityControl |
... | ... |
@@ -71,14 +71,14 @@ |
71 | 71 |
setMethod(qa, "ShortReadQ", .qa_ShortReadQ) |
72 | 72 |
|
73 | 73 |
.qa_fastq_lane <- |
74 |
- function(dirPath, pattern, ..., sample=TRUE, type="fastq", |
|
74 |
+ function(dirPath, ..., sample=TRUE, type="fastq", |
|
75 | 75 |
verbose=FALSE) |
76 | 76 |
{ |
77 | 77 |
if (verbose) |
78 | 78 |
message("qa 'fastq' pattern:", pattern) |
79 | 79 |
if (sample) { |
80 |
- samp <- FastqSampler(file.path(dirPath, pattern), ...) |
|
81 |
- qa <- qa(yield(samp), pattern, ..., verbose=verbose) |
|
80 |
+ samp <- FastqSampler(dirPath, ...) |
|
81 |
+ qa <- qa(yield(samp), basename(dirPath), ..., verbose=verbose) |
|
82 | 82 |
close(samp) |
83 | 83 |
elts <- .srlist(qa) |
84 | 84 |
elts$readCounts$read <- samp$status()[["total"]] |
... | ... |
@@ -95,8 +95,7 @@ setMethod(qa, "ShortReadQ", .qa_ShortReadQ) |
95 | 95 |
{ |
96 | 96 |
fls <- .file_names(dirPath, pattern) |
97 | 97 |
lst <- |
98 |
- srapply(basename(fls), .qa_fastq_lane, |
|
99 |
- dirPath=dirPath, type=type, ..., |
|
98 |
+ srapply(fls, .qa_fastq_lane, type=type, ..., |
|
100 | 99 |
reduce=.reduce(1), verbose=verbose, |
101 | 100 |
USE.NAMES=TRUE) |
102 | 101 |
lst <- do.call(rbind, lst) |
... | ... |
@@ -108,7 +108,7 @@ uniqueFilter <- |
108 | 108 |
} |
109 | 109 |
|
110 | 110 |
occurrenceFilter <- |
111 |
- function(min=1L, max=1L, withSread=c(TRUE, FALSE, NA), |
|
111 |
+ function(min=1L, max=1L, withSread=c(NA, TRUE, FALSE), |
|
112 | 112 |
duplicates=c("head", "tail", "sample", "none"), |
113 | 113 |
.name=.occurrenceName(min, max, withSread, |
114 | 114 |
duplicates)) |
... | ... |
@@ -2,7 +2,6 @@ |
2 | 2 |
\docType{class} |
3 | 3 |
|
4 | 4 |
% Class: |
5 |
-\alias{class:GappedReads} |
|
6 | 5 |
\alias{GappedReads-class} |
7 | 6 |
\alias{GappedReads} |
8 | 7 |
|
... | ... |
@@ -22,7 +21,7 @@ |
22 | 21 |
\alias{narrow,GappedReads-method} |
23 | 22 |
|
24 | 23 |
|
25 |
-\title{GappedReads objects} |
|
24 |
+\title{(Legacy) GappedReads objects} |
|
26 | 25 |
|
27 | 26 |
\description{ |
28 | 27 |
The GappedReads class extends the \link[GenomicRanges]{GAlignments} |
... | ... |
@@ -21,7 +21,7 @@ |
21 | 21 |
\alias{show,Intensity-method} |
22 | 22 |
\alias{show,IntensityMeasure-method} |
23 | 23 |
|
24 |
-\title{"Intensity", "IntensityInfo", and "IntensityMeasure" base |
|
24 |
+\title{(Legacy) "Intensity", "IntensityInfo", and "IntensityMeasure" base |
|
25 | 25 |
classes for short read image intensities} |
26 | 26 |
|
27 | 27 |
\description{ |
... | ... |
@@ -10,7 +10,8 @@ |
10 | 10 |
\alias{show,SRSet-method} |
11 | 11 |
\alias{detail,SRSet-method} |
12 | 12 |
|
13 |
-\title{A base class for Roche experiment-wide data} |
|
13 |
+\title{(Legacy) A base class for Roche experiment-wide data} |
|
14 |
+ |
|
14 | 15 |
\description{ |
15 | 16 |
This class coordinates phenotype (sample) and sequence data, primarily |
16 | 17 |
as used on the Roche platform. |
... | ... |
@@ -8,13 +8,17 @@ |
8 | 8 |
\docType{package} |
9 | 9 |
\title{ |
10 | 10 |
|
11 |
- Base classes and methods for high-throughput short-read sequencing data. |
|
11 |
+ FASTQ input and manipulation. |
|
12 | 12 |
|
13 | 13 |
} |
14 | 14 |
\description{ |
15 | 15 |
|
16 |
- Base classes, functions, and methods for representation of |
|
17 |
- high-throughput, short-read sequencing data. |
|
16 |
+ This package implements sampling, iteration, and input of FASTQ |
|
17 |
+ files. The package includes functions for filtering and trimming |
|
18 |
+ reads, and for generating a quality assessment report. Data are |
|
19 |
+ represented as DNAStringSet-derived objects, and easily manipulated |
|
20 |
+ for a diversity of purposes. The package also contains legacy support |
|
21 |
+ for early single-end, ungapped alignment formats. |
|
18 | 22 |
|
19 | 23 |
} |
20 | 24 |
\details{ |
... | ... |
@@ -8,7 +8,8 @@ |
8 | 8 |
\alias{report_html,SolexaRealignQA-method} |
9 | 9 |
\alias{show,SolexaExportQA-method} |
10 | 10 |
|
11 |
-\title{Quality assessment summaries from Solexa export and realign files} |
|
11 |
+\title{(Legacy) Quality assessment summaries from Solexa export and |
|
12 |
+ realign files} |
|
12 | 13 |
|
13 | 14 |
\description{ |
14 | 15 |
|
... | ... |
@@ -18,7 +18,7 @@ |
18 | 18 |
\alias{readBaseQuality,SolexaPath-method} |
19 | 19 |
\alias{readAligned,SolexaPath-method} |
20 | 20 |
|
21 |
-\title{"SolexaPath" class representing a standard output file hierarchy} |
|
21 |
+\title{(Legacy) "SolexaPath" class representing a standard output file hierarchy} |
|
22 | 22 |
\description{ |
23 | 23 |
|
24 | 24 |
Solexa produces a hierarchy of output files. The content of the |
... | ... |
@@ -12,7 +12,7 @@ |
12 | 12 |
% transforming methods |
13 | 13 |
\alias{readAligned,SolexaSet-method} |
14 | 14 |
|
15 |
-\title{"SolexaSet" coordinating Solexa output locations with sample annotations} |
|
15 |
+\title{(Legacy) "SolexaSet" coordinating Solexa output locations with sample annotations} |
|
16 | 16 |
\description{ |
17 | 17 |
|
18 | 18 |
This class coordinates the file hierarchy produced by the Solexa |
... | ... |
@@ -72,16 +72,16 @@ x = numeric(1000) |
72 | 72 |
x[sample(1000, 100)] <- abs(rnorm(100)) |
73 | 73 |
df <- data.frame(x = c(x, -x), pos = seq(1, 1e5, length.out=1000), |
74 | 74 |
group = rep(c("positive", "negative"), each=1000)) |
75 |
-cv <- xyplot(x ~ pos, df, group=group, type="s", |
|
76 |
- col=col, main="yeast chrI:1 - 2e5", |
|
77 |
- ylab="Coverage", xlab="Coordinate", |
|
78 |
- scales=list(y=list(tck=c(1,0)), |
|
79 |
- x=list(rot=45, tck=c(1,0), tick.number=20)), |
|
80 |
- panel=function(...) { |
|
81 |
- panel.xyplot(...) |
|
82 |
- panel.grid(h=-1, v=20) |
|
83 |
- panel.abline(a=0, b=0, col="grey") |
|
84 |
- }) |
|
75 |
+cv <- lattice::xyplot(x ~ pos, df, group=group, type="s", |
|
76 |
+ col=col, main="yeast chrI:1 - 2e5", |
|
77 |
+ ylab="Coverage", xlab="Coordinate", |
|
78 |
+ scales=list(y=list(tck=c(1,0)), |
|
79 |
+ x=list(rot=45, tck=c(1,0), tick.number=20)), |
|
80 |
+ panel=function(...) { |
|
81 |
+ lattice::panel.xyplot(...) |
|
82 |
+ lattice::panel.grid(h=-1, v=20) |
|
83 |
+ lattice::panel.abline(a=0, b=0, col="grey") |
|
84 |
+ }) |
|
85 | 85 |
s <- SpTrellis(cv) |
86 | 86 |
s |
87 | 87 |
zi(s) |
... | ... |
@@ -1,7 +1,7 @@ |
1 | 1 |
\name{readBfaToc} |
2 | 2 |
\alias{readBfaToc} |
3 | 3 |
|
4 |
-\title{Get a list of the sequences in a Maq .bfa file} |
|
4 |
+\title{(Legacy) Get a list of the sequences in a Maq .bfa file} |
|
5 | 5 |
|
6 | 6 |
\description{ |
7 | 7 |
As \code{\link{coverage}} needs to know the lengths of the reference sequences, |
... | ... |
@@ -35,12 +35,8 @@ compose(filt, ..., .name) |
35 | 35 |
|
36 | 36 |
idFilter(regex=character(0), fixed=FALSE, exclude=FALSE, |
37 | 37 |
.name="idFilter") |
38 |
-chromosomeFilter(regex=character(0), fixed=FALSE, exclude=FALSE, |
|
39 |
- .name="ChromosomeFilter") |
|
40 |
-positionFilter(min=-Inf, max=Inf, .name="PositionFilter") |
|
41 |
-strandFilter(strandLevels=character(0), .name="StrandFilter") |
|
42 | 38 |
occurrenceFilter(min=1L, max=1L, |
43 |
- withSread=c(TRUE, FALSE, NA), |
|
39 |
+ withSread=c(NA, TRUE, FALSE), |
|
44 | 40 |
duplicates=c("head", "tail", "sample", "none"), |
45 | 41 |
.name=.occurrenceName(min, max, withSread, |
46 | 42 |
duplicates)) |
... | ... |
@@ -50,6 +46,15 @@ polynFilter(threshold=0L, nuc=c("A", "C", "T", "G", "other"), |
50 | 46 |
dustyFilter(threshold=Inf, batchSize=NA, .name="DustyFilter") |
51 | 47 |
srdistanceFilter(subject=character(0), threshold=0L, |
52 | 48 |
.name="SRDistanceFilter") |
49 |
+ |
|
50 |
+## |
|
51 |
+## legacy filters for ungapped alignments |
|
52 |
+## |
|
53 |
+ |
|
54 |
+chromosomeFilter(regex=character(0), fixed=FALSE, exclude=FALSE, |
|
55 |
+ .name="ChromosomeFilter") |
|
56 |
+positionFilter(min=-Inf, max=Inf, .name="PositionFilter") |
|
57 |
+strandFilter(strandLevels=character(0), .name="StrandFilter") |
|
53 | 58 |
alignQualityFilter(threshold=0L, .name="AlignQualityFilter") |
54 | 59 |
alignDataFilter(expr=expression(), .name="AlignDataFilter") |
55 | 60 |
} |
... | ... |
@@ -174,7 +179,7 @@ alignDataFilter(expr=expression(), .name="AlignDataFilter") |
174 | 179 |
treated: \code{TRUE} to include the sread, chromosome, strand, and |
175 | 180 |
position when determining occurrence, \code{FALSE} to include |
176 | 181 |
chromosome, strand, and position, and \code{NA} to include only |
177 |
- sread. The default is \code{withSread=TRUE}. \code{duplicates} |
|
182 |
+ sread. The default is \code{withSread=NA}. \code{duplicates} |
|
178 | 183 |
determines how reads with more than \code{max} reads are |
179 | 184 |
treated. \code{head} selects the first \code{max} reads of each set of |
180 | 185 |
duplicates, \code{tail} the last \code{max} reads, and \code{sample} a |
... | ... |
@@ -63,7 +63,7 @@ showMethods("tables") |
63 | 63 |
sp <- SolexaPath(system.file("extdata", package="ShortRead")) |
64 | 64 |
aln <- readAligned(sp) |
65 | 65 |
tables(sread(aln), n=6) |
66 |
-xyplot(log10(nReads)~log10(nOccurrences), |
|
66 |
+lattice::xyplot(log10(nReads)~log10(nOccurrences), |
|
67 | 67 |
tables(sread(aln))$distribution) |
68 | 68 |
} |
69 | 69 |
\keyword{manip} |
... | ... |
@@ -1,394 +1,170 @@ |
1 | 1 |
%\VignetteIndexEntry{An introduction to ShortRead} |
2 |
-%\VignetteDepends{} |
|
2 |
+%\VignetteDepends{BiocStyle} |
|
3 | 3 |
%\VignetteKeywords{Short read, I/0, quality assessment} |
4 | 4 |
%\VignettePackage{ShortRead} |
5 | 5 |
\documentclass[]{article} |
6 | 6 |
|
7 |
-\usepackage{times} |
|
8 |
-\usepackage{hyperref} |
|
9 |
- |
|
10 |
-\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
11 |
-\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
12 |
-\newcommand{\Rpackage}[1]{{\textit{#1}}} |
|
13 |
-\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
14 |
-\newcommand{\Rfunarg}[1]{{\texttt{#1}}} |
|
15 |
-\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
16 |
-\newcommand{\Rcode}[1]{{\texttt{#1}}} |
|
17 |
- |
|
18 |
-\newcommand{\software}[1]{\textsf{#1}} |
|
19 |
-\newcommand{\R}{\software{R}} |
|
20 |
-\newcommand{\ShortRead}{\Rpackage{ShortRead}} |
|
7 |
+<<style, eval=TRUE, echo=FALSE, results=tex>>= |
|
8 |
+BiocStyle::latex() |
|
9 |
+@ |
|
21 | 10 |
|
22 |
-\newcommand{\ELAND}{\software{ELAND}} |
|
23 |
-\newcommand{\MAQ}{\software{MAQ}} |
|
24 |
-\newcommand{\Bowtie}{\software{Bowtie}} |
|
11 |
+\newcommand{\ShortRead}{\Biocpkg{ShortRead}} |
|
25 | 12 |
|
26 | 13 |
\title{An Introduction to \Rpackage{ShortRead}} |
27 | 14 |
\author{Martin Morgan} |
28 |
-\date{Modified: 28 September 2010. Compiled: \today} |
|
15 |
+\date{Modified: 21 October, 2013. Compiled: \today} |
|
29 | 16 |
|
30 | 17 |
\begin{document} |
31 | 18 |
|
32 | 19 |
\maketitle |
33 | 20 |
|
34 |
-<<options,echo=FALSE>>= |
|
35 |
-options(width=60) |
|
36 |
-@ |
|
37 |
- |
|
38 | 21 |
<<preliminaries>>= |
39 | 22 |
library("ShortRead") |
40 | 23 |
@ |
41 | 24 |
|
42 |
-The \Rpackage{ShortRead} package aims to provide key functionality for |
|
43 |
-input, quality assurance, and basic manipulation of `short read' DNA |
|
44 |
-sequences such as those produced by Solexa, 454, and related |
|
45 |
-technologies, including flexible import of common short read data |
|
46 |
-formats. This vignette introduces key functionality. |
|
47 |
- |
|
48 |
-Support is most fully developed for Solexa; contributions from the |
|
49 |
-community are welcome. |
|
50 |
- |
|
51 |
-\section{A first workflow} |
|
52 |
- |
|
53 |
-This section walks through a simple work flow. It outlines the |
|
54 |
-hierarchy of files produced by Solexa. It then illustrates a common |
|
55 |
-way for reading short read data into \R{}. |
|
56 |
- |
|
57 |
-\subsection{\Rclass{SolexaPath}: navigating Solexa output} |
|
58 |
- |
|
59 |
-\Rclass{SolexaPath} provides functionality to navigate files produced |
|
60 |
-by Solexa Genome Analyzer pipeline software. A typical way to start a |
|
61 |
-\ShortRead{} session is to point to the root of the output file |
|
62 |
-hierarchy. The \ShortRead{} package includes a very small subset of |
|
63 |
-files emulating this hierarchy. The root is found at |
|
64 |
-<<SolexaPath-root>>= |
|
65 |
-exptPath <- system.file("extdata", package="ShortRead") |
|
66 |
-@ |
|
67 |
-%% |
|
68 |
-Usually \Rcode{exptPath} would be a location on the users' file system. Key |
|
69 |
-components of the hierarchy are parsed into \R{} with |
|
70 |
-<<SolexaPat>>= |
|
71 |
-sp <- SolexaPath(exptPath) |
|
72 |
-sp |
|
73 |
-@ |
|
74 |
-%% |
|
75 |
-\Rfunction{SolexaPath} scans the directory hierarchy to identifying |
|
76 |
-useful directories. For instance, image intensity files are in the |
|
77 |
-`Firecrest' directory, while summary and alignment files are in the |
|
78 |
-analysis directory |
|
79 |
-<<firecrest>>= |
|
80 |
-imageAnalysisPath(sp) |
|
81 |
-analysisPath(sp) |
|
82 |
-@ |
|
83 |
-%% |
|
84 |
-Most functionality in \ShortRead{} uses \Rcode{baseCallPath} or |
|
85 |
-\Rcode{analysisPath}. Solexa documentation provides details of file |
|
86 |
-content. \Rfunction{SolexaPath} accepts additional arguments that |
|
87 |
-allow individual file paths to be specified. |
|
88 |
- |
|
89 |
-Many functions for Solexa data input `know' where appropriate files |
|
90 |
-are located. Specifying \Rcode{sp} is often sufficient for identifying |
|
91 |
-the desired directory path. Examples of this are illustrated below, |
|
92 |
-with for instance \Rfunction{readAligned} and \Rfunction{readFastq}. |
|
93 |
- |
|
94 |
-Displaying an object, e.g., \Robject{sp}, provides hints at how to |
|
95 |
-access information in the object, e.g., \Rfunction{analysisPath}. This |
|
96 |
-is a convention in \ShortRead{}. |
|
97 |
- |
|
98 |
-\subsection{\Rfunction{readAligned}: reading aligned data into \R{}} |
|
99 |
- |
|
100 |
-Solexa \texttt{s\_N\_export.txt} files (\texttt{\_N\_} is a |
|
101 |
-placeholder for the lane identifier) represent one place to start |
|
102 |
-working the short read data in \R{}. These files result from running |
|
103 |
-ANALYSIS eland\_extended in the Solexa Genome Analyzer. The files |
|
104 |
-contain information on all reads, including alignment information for |
|
105 |
-those reads successfully aligned to the genome. \ShortRead{} parses |
|
106 |
-additional alignment files, including \MAQ{} binary and text |
|
107 |
-(\texttt{mapview}) files and \Bowtie{} text files; consult the help page |
|
108 |
-for \Rfunction{readAligned} for details. \ShortRead{} flexibly parses |
|
109 |
-many other Solexa files; aligned reads represent just one entry point. |
|
110 |
- |
|
111 |
-To read a single \texttt{s\_N\_export.txt} file into \R{}, for instance |
|
112 |
-from lane 2, use the command |
|
113 |
-<<readAligned-simple>>= |
|
114 |
-aln <- readAligned(sp, "s_2_export.txt") |
|
115 |
-aln |
|
116 |
-@ |
|
117 |
-%% |
|
118 |
-This illustrates the convention used for identifying files for input |
|
119 |
-into \R{} and used by \ShortRead{}. The function takes a directory |
|
120 |
-path and a pattern (as a regular expression, similar to the \R{} |
|
121 |
-function \Rfunction{list.files}) of file names to match in the |
|
122 |
-directory. Usually, all files matching the pattern are read into a |
|
123 |
-\emph{single} \R{} object; this behavior is desirable for several of |
|
124 |
-the input functions in \ShortRead{}. In the present case the usual |
|
125 |
-expectation is that a single \texttt{s\_N\_export.txt} file will be |
|
126 |
-read into a single \R{} object, so the \Rfunarg{pattern} argument will |
|
127 |
-identify a single file. |
|
128 |
- |
|
129 |
-\subsubsection{Input of other aligned read files} |
|
130 |
- |
|
131 |
-\ELAND{} software provides access to much interesting data, in |
|
132 |
-addition to alignments, but if the interest is in aligned reads then |
|
133 |
-input may come from any of a number of different software |
|
134 |
-packages. Many of these alignments can be input with |
|
135 |
-\Rpackage{ShortRead}. |
|
136 |
- |
|
137 |
-\Bowtie{} is a very fast aligner, taking a few tens of minutes to |
|
138 |
-align entire lanes of reads to reference genomes. Use |
|
139 |
-\Rfunction{readAligned} with the \Rfunarg{type="Bowtie"} argument to |
|
140 |
-input alignments. Reading \Bowtie{} output using |
|
141 |
-\Rfunction{readAligned} produces the same class of object as reading |
|
142 |
-\ELAND{} output. Like \ELAND{}, \Bowtie{} provides information on |
|
143 |
-short read, quality, chromosome, position, and strand; there is no |
|
144 |
-information on alignment quality avaiable from \Bowtie{}. \ELAND{} and |
|
145 |
-\Bowtie{} provide very different auxiliary information. Consult the |
|
146 |
-\Rfunction{readAligned} and help page \Bowtie{} manual for additional |
|
147 |
-detail. |
|
148 |
- |
|
149 |
-\MAQ{} is another poplar aligner. \Rpackage{ShortRead} can input |
|
150 |
-\MAQ{} binary or text formats (see the arguments |
|
151 |
-\Rfunarg{type="MAQMapShort"}, \Rfunarg{"MAQMap"}, and |
|
152 |
-\Rfunarg{"MAQMapview"}). As with \Bowtie{}, \MAQ{} provides essential |
|
153 |
-information about reads and their aligments, plus additional |
|
154 |
-information that differs somewhat from the additional information |
|
155 |
-provided by \ELAND{}. |
|
156 |
- |
|
157 |
-Alignment information may come in a variety of different text-based |
|
158 |
-formats. Not all of these will be supported by |
|
159 |
-\Rpackage{ShortRead}. There are a number of tools available to input |
|
160 |
-this into \R{}. |
|
161 |
- |
|
162 |
-A basic strategy is involves two passes over the data, followed by |
|
163 |
-synthesis of results into an \Rclass{AlignedRead} object. First, |
|
164 |
-input alignment data using functions such as |
|
165 |
-\Rfunction{read.table}. Use the \Rfunarg{colClasses} argument to |
|
166 |
-`mask-out' (i.e., avoid importing) DNA and quality sequences. Next, |
|
167 |
-use \Rfunction{readXStringColumns} or \Rfunction{readFastq} to import |
|
168 |
-the short read and quality information. Finally, use the alignment |
|
169 |
-data and reads as arugments to the \Rfunction{AlignedRead} function to |
|
170 |
-synthesize the input. The following illustrates use of |
|
171 |
-\Rfunction{readXStringColumns} and \Rfunction{readFastq}. These |
|
172 |
-functions receive further attention below. |
|
173 |
- |
|
174 |
-\subsubsection{Cautions} |
|
175 |
- |
|
176 |
-There are several confusing areas of input. (1) Some alignment |
|
177 |
-programs and genome resources start numbering nucleotides of the |
|
178 |
-subject sequence at 0, whereas others start at 1. (2) Some alignment |
|
179 |
-programs report matches on the minus strand in terms of the |
|
180 |
-`left-most' position of the read (i.e., the location of the 3' end of |
|
181 |
-the aligned read), whereas other report `five-prime''matches (i.e., in |
|
182 |
-terms of the 5' end of the read), regardless of whether the alignment |
|
183 |
-is on the plus or minus strand. (3) Some alignment programs reverse |
|
184 |
-complement the sequence of reads aligned to the minus strand. (4) Base |
|
185 |
-qualities are sometimes encoded as character strings, but the encoding |
|
186 |
-differs between `fastq' and `solexa fastq'. It seems that all |
|
187 |
-combinations of these choices are common `in the wild'. |
|
188 |
- |
|
189 |
-The help page for \Rfunction{readAligned} attempts to be explicit |
|
190 |
-about how reads are formatted. Briefly: |
|
191 |
-\begin{itemize} |
|
192 |
-\item Subject sequence nucleotides are numbered starting at 1, rather |
|
193 |
- than zero. \Rfunction{readAligned} adjusts the coordinate system of |
|
194 |
- input reads if necessary (e.g., reading \MAQ{} alignments). |
|
195 |
-\item Alignments on the minus strand are reported in `left-most' |
|
196 |
- coordinates systems. |
|
197 |
-\item \ELAND{} and \Bowtie{} alignments on the minus strand are not reverse |
|
198 |
- complemented. |
|
199 |
-\item Character-encoded base quality scores are intrepreted as the |
|
200 |
- default for the software package being parsed, e.g., as `Solexa |
|
201 |
- fastq' for \ELAND{}. The object returned by \Rfunction{quality} |
|
202 |
- applied to an \Rclass{AlignedRead} object is either |
|
203 |
- \Rclass{FastqQuality} or \Rclass{SFastqQuality}. |
|
204 |
-\end{itemize} |
|
205 |
-Alignment programs sometimes offer the opportunity to custommize |
|
206 |
-output; such customization needs to be accomodated when reads are |
|
207 |
-input using \Rpackage{ShortRead}. |
|
208 |
- |
|
209 |
-\subsubsection{Filtering input} |
|
210 |
- |
|
211 |
-Downstream analysis may often want to use a well-defined subset of |
|
212 |
-reads. These can be selected with the \Rfunarg{filter} argument of |
|
213 |
-\Rfunction{readAligned}. There are built-in filters, for instance to |
|
214 |
-remove all reads containing an \texttt{N} nucleotide, to select just |
|
215 |
-those reads that map to the genome file \texttt{chr5.fa}, to select |
|
216 |
-reads on the \texttt{+} strand, or to `level the playing field' by |
|
217 |
-selecting only a single read for any chromosome, position and strand: |
|
218 |
-<<filter-egs>>= |
|
219 |
-nfilt <- nFilter() |
|
220 |
-cfilt <- chromosomeFilter('chr5.fa') |
|
221 |
-sfilt <- strandFilter("+") |
|
222 |
-ofilt <- occurrenceFilter(withSread=FALSE) |
|
25 |
+The \Rpackage{ShortRead} package provides functionality for working |
|
26 |
+with FASTQ files from high throughput sequence analysis. The package |
|
27 |
+also contains functions for legacy (single-end, ungapped) aligned |
|
28 |
+reads; for working with BAM files, please see the \Biocpkg{Rsamtools}, |
|
29 |
+\Biocpkg{GenomicRanges}, and related packages. |
|
30 |
+ |
|
31 |
+\section{Sample data} |
|
32 |
+ |
|
33 |
+Sample FASTQ data are derived from ArrayExpress record |
|
34 |
+\href{http://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-1147/}{E-MTAB-1147}. |
|
35 |
+Paired-end FASTQ files were retrieved and then sampled to 20,000 |
|
36 |
+records with |
|
37 |
+<<sample, eval=FALSE>>= |
|
38 |
+sampler <- FastqSampler('E-MTAB-1147/fastq/ERR127302_1.fastq.gz', 20000) |
|
39 |
+set.seed(123); ERR127302_1 <- yield(sampler) |
|
40 |
+sampler <- FastqSampler('E-MTAB-1147/fastq/ERR127302_2.fastq.gz', 20000) |
|
41 |
+set.seed(123); ERR127302_2 <- yield(sampler) |
|
42 |
+@ |
|
43 |
+ |
|
44 |
+\section{Functionality} |
|
45 |
+ |
|
46 |
+Functionality is summarized in Table~\ref{tab:fastq}. |
|
47 |
+ |
|
48 |
+\begin{table} |
|
49 |
+ \centering |
|
50 |
+ \begin{tabular}{lll} |
|
51 |
+ \hline |
|
52 |
+ \multicolumn{3}{l}{Input} \\ |
|
53 |
+ & \Rfunction{FastqStreamer} & Iterate through FASTQ files in chunks \\ |
|
54 |
+ & \Rfunction{FastqSampler} & Draw random samples from FASTQ files \\ |
|
55 |
+ & \Rfunction{readFastq} & Read an entire FASTQ file into memory \\ |
|
56 |
+ & \Rfunction{writeFastq} & Write FASTQ objects to a connection (file) \\ |
|
57 |
+ \multicolumn{3}{l}{Sequence and quality summary} \\ |
|
58 |
+ & \Rfunction{alphabetFrequency} & Nucleotide or quality score use per read\\ |
|
59 |
+ & \Rfunction{alphabetByCycle} & Nucleotide or quality score use by cycle\\ |
|
60 |
+ & \Rfunction{alphabetScore} & Whole-read quality summary\\ |
|
61 |
+ & \Rfunction{encoding} & Character / `phred' score mapping \\ |
|
62 |
+ \multicolumn{3}{l}{Quality assessment} \\ |
|
63 |
+ & \Rfunction{qa} & Visit FASTQ files to collect QA statistics \\ |
|
64 |
+ & \Rfunction{report} & Generate a quality assessment report \\ |
|
65 |
+ \multicolumn{3}{l}{Filtering and trimming} \\ |
|
66 |
+ & \Rfunction{srFilter} & Pre-defined and bespoke filters \\ |
|
67 |
+ & \Rfunction{trimTails}, etc. & Trim low-quality nucleotides \\ |
|
68 |
+ & \Rfunction{narrow} & Remove leading / trailing nucleotides \\ |
|
69 |
+ & \Rfunction{tables} & Summarize read occurrence \\ |
|
70 |
+ & \Rfunction{srduplicated}, etc. & Identify duplicate reads \\ |
|
71 |
+ & \Rfunction{filterFastq} & Filter reads from one file to another\\ |
|
72 |
+ \multicolumn{3}{l}{Parallel evaluation} \\ |
|
73 |
+ & \Rfunction{srapply} & \Rfunction{lapply}-like parallel evaluation \\ |
|
74 |
+ \hline |
|
75 |
+ \end{tabular} |
|
76 |
+ \caption{Key functions for working with FASTQ files} |
|
77 |
+ \label{tab:fastq} |
|
78 |
+\end{table} |
|
79 |
+ |
|
80 |
+\paragraph{Input} FASTQ files are large so processing involves |
|
81 |
+iteration in `chunks' using \Rfunction{FastqStreamer} |
|
82 |
+<<stream, eval=FALSE>>= |
|
83 |
+strm <- FastqStreamer("a.fastq.gz") |
|
84 |
+repeat { |
|
85 |
+ fq <- yield(strm) |
|
86 |
+ if (length(fq) == 0) |
|
87 |
+ break |
|
88 |
+ ## process chunk |
|
89 |
+} |
|
223 | 90 |
@ |
224 |
-%% |
|
225 |
-Here we select only those reads that map to \texttt{chr5.fa}: |
|
226 |
-<<readAligned-filter>>= |
|
227 |
-chr5 <- readAligned(sp, "s_2_export.txt", filter=cfilt) |
|
228 |
-@ |
|
229 |
-%% |
|
230 |
-Filters can be `composed' to act in unison, e.g., selecting only reads |
|
231 |
-mapping to \texttt{chr5.fa} and on the \texttt{+} strand: |
|
232 |
-<<readAligned-compose-filter>>= |
|
233 |
-filt <- compose(cfilt, sfilt) |
|
234 |
-chr5plus <- readAligned(sp, "s_2_export.txt", filter=filt) |
|
235 |
-@ |
|
236 |
-%% |
|
237 |
-Filters can subset aligned reads at other stages in the work flow, |
|
238 |
-using a paradigm like the following: |
|
239 |
-<<AlignedRead-filter>>= |
|
240 |
-chr5 <- aln[cfilt(aln)] |
|
241 |
-@ |
|
242 |
-%% |
|
243 |
-Users can easily create their own filter by writing a function that |
|
244 |
-accepts an object of class \Rcode{AlignedRead}, and returns a logical |
|
245 |
-vector indicating which reads in the object pass the filter. See the |
|
246 |
-example on the \Rcode{srFilter} help page for details, and for |
|
247 |
-information about additional built-in filters. |
|
248 |
- |
|
249 |
-\subsection{Exploring \ShortRead{} objects} |
|
250 |
- |
|
251 |
-\Robject{aln} is an object of \Rclass{\Sexpr{class(aln)}} class. It |
|
252 |
-contains short reads and their (calibrated) qualities: |
|
253 |
-<<aln-sread-quality>>= |
|
254 |
-sread(aln) |
|
255 |
-quality(aln) |
|
256 |
-@ |
|
257 |
- |
|
258 |
-The short reads are stored as a \Rclass{DNAStringSet} class. This |
|
259 |
-class is defined in \Rpackage{Biostrings}. It represents DNA sequence |
|
260 |
-data relatively efficiently. There are a number of very useful |
|
261 |
-methods defined for \Rclass{DNAStringSet}. Some of these methods are |
|
262 |
-illustrated in this vignette. Other methods are described in the help |
|
263 |
-pages and vignettes of the \Rpackage{Biostrings} and |
|
264 |
-\Rpackage{IRanges} packages. |
|
265 |
- |
|
266 |
-Qualities are represented as \Sexpr{class(quality(aln))}-class |
|
267 |
-objects. The qualities in the \Robject{aln} object returned by |
|
268 |
-\Rfunction{readAligned} are of class \Rclass{BStringSet}. The |
|
269 |
-\Rclass{BStringSet} class is also defined in \Rpackage{Biostrings}, |
|
270 |
-and shares many methods with those of \Rclass{DNAStringSet}. |
|
271 |
- |
|
272 |
-The \Robject{aln} object contains additional information about |
|
273 |
-alignments. Some of this additional information is expected from any |
|
274 |
-alignment, whether generated by Solexa or other software. For |
|
275 |
-example, \Robject{aln} contains the particular sequence within a |
|
276 |
-target (e.g., chromosomes in a genome assembly), the position (e.g., |
|
277 |
-base pair coordinate), and strand to which the alignment was made, and |
|
278 |
-the quality of the alignment. The display of \Robject{aln} suggests |
|
279 |
-how to access this information. For instance, the strand to which |
|
280 |
-alignments are made can be extracted (as a factor with three levels |
|
281 |
-and possibly \Rcode{NA}; the level \Rcode{"*"} corresponds to reads |
|
282 |
-for which strand alignment is intrinsically not meaningful, whereas |
|
283 |
-\Rcode{NA} represents the traditional concept of information not |
|
284 |
-available, e.g., because the read did not align at all) and tabulated |
|
285 |
-using familiar \R{} functions. |
|
286 |
-<<chromosomes>>= |
|
287 |
-whichStrand <- strand(aln) |
|
288 |
-class(whichStrand) |
|
289 |
-levels(whichStrand) |
|
290 |
-table(whichStrand, useNA="ifany") |
|
291 |
-@ |
|
292 |
-%% |
|
293 |
-This shows that about |
|
294 |
-%% |
|
295 |
-\Sexpr{format(100*sum(is.na(whichStrand))/ length(whichStrand))}\%{} |
|
296 |
-%% |
|
297 |
-of reads were not aligned (level \Rcode{NA}). |
|
298 |
- |
|
299 |
-The \Robject{aln} object contains information in addition to that |
|
300 |
-expected of all alignments. This information is accessible using |
|
301 |
-\Rfunction{alignData}: |
|
302 |
-<<alignData>>= |
|
303 |
-alignData(aln) |
|
304 |
-@ |
|
305 |
-%% |
|
306 |
-Users familiar with the \Rclass{ExpressionSet} class in |
|
307 |
-\Rpackage{Biobase} will recognize this as an |
|
308 |
-\Rclass{AnnotatedDataFrame}-like object, containing a data frame with |
|
309 |
-rows for each short read. The \Rclass{AlignedDataFrame} contains |
|
310 |
-additional meta data about the meaning of each column. For instance, |
|
311 |
-data extracted from the Solexa export file includes: |
|
312 |
-<<varMetadata>>= |
|
313 |
-varMetadata(alignData(aln)) |
|
91 |
+or drawing a random sample from the file |
|
92 |
+<<sampler, eval=FALSE>>= |
|
93 |
+sampler <- FastqSampler("a.fastq.gz") |
|
94 |
+fq <- yield(sampler) |
|
95 |
+@ |
|
96 |
+\noindent The default size for both streams and samples is 1M records; |
|
97 |
+this volume of data fits easily into memory. Small FASTQ files can be |
|
98 |
+read in to memory in their entirety using \Rfunction{readFastq}; we do |
|
99 |
+this for our sample data |
|
100 |
+<<readFastq>>= |
|
101 |
+fl <- system.file(package="ShortRead", "extdata", "E-MTAB-1147", |
|
102 |
+ "ERR127302_1_subset.fastq.gz") |
|
103 |
+fq <- readFastq(fl) |
|
104 |
+@ |
|
105 |
+ |
|
106 |
+The result of data input is an instance of class \Rclass{ShortReadQ} |
|
107 |
+(Table~\ref{tab:classes}). |
|
108 |
+\begin{table} |
|
109 |
+ \centering |
|
110 |
+ \begin{tabular}{ll} |
|
111 |
+ \hline |
|
112 |
+ \Rclass{DNAStringSet} & (\Biocpkg{Biostrings}) Short read sequences \\ |
|
113 |
+ \Rclass{FastqQuality}, etc. & Quality encodings \\ |
|
114 |
+ \Rclass{ShortReadQ} & Reads, quality scores, and ids \\ |
|
115 |
+ \hline |
|
116 |
+ \end{tabular} |
|
117 |
+ \caption{Primary data types in the \Biocpkg{ShortRead} package} |
|
118 |
+ \label{tab:classes} |
|
119 |
+\end{table} |
|
120 |
+This class contains reads, their quality scores, and optionally the id |
|
121 |
+of the read. |
|
122 |
+<<ShortReadQ>>= |
|
123 |
+fq |
|
124 |
+fq[1:5] |
|
125 |
+head(sread(fq), 3) |
|
126 |
+head(quality(fq), 3) |
|
314 | 127 |
@ |
315 |
-%% |
|
316 |
-Guides to the precise meaning of this data are on the help page for |
|
317 |
-the \Rclass{AlignedRead} class, and in the manufacturer manuals. |
|
318 |
- |
|
319 |
-Simple information about the alignments can be found by querying this |
|
320 |
-object. For instance, unaligned reads have \Rcode{NA} as their |
|
321 |
-position, and reads passing Solexa `filtering' (their base purity and |
|
322 |
-chastity criteria) have a component of their auxiliary |
|
323 |
-\Robject{alignData} set to \Rcode{"Y"}. Thus the fraction of unaligned |
|
324 |
-reads and reads passing filtering are |
|
325 |
-<<aln-okreads>>= |
|
326 |
-mapped <- !is.na(position(aln)) |
|
327 |
-filtered <- alignData(aln)[["filtering"]] =="Y" |
|
328 |
-sum(!mapped) / length(aln) |
|
329 |
-sum(filtered) / length(aln) |
|
128 |
+\noindent The reads are represented as \Rclass{DNAStringSet} |
|
129 |
+instances, and can be manipulated with the rich tools defined in the |
|
130 |
+\Biocpkg{Biostrings} package. The quality scores are represented by a |
|
131 |
+class that represents the quality encoding inferred from the file; the |
|
132 |
+encoding in use can be discovered with |
|
133 |
+<<encoding>>= |
|
134 |
+encoding(quality(fq)) |
|
330 | 135 |
@ |
136 |
+\noindent The primary source of documentation for these classes is |
|
137 |
+\Rcode{?ShortReadQ} and \Rcode{?QualityScore}. |
|
331 | 138 |
|
332 |
-Extracting the reads that passed filtering but were unmapped is |
|
333 |
-accomplished with |
|
334 |
-<<aln-failed>>= |
|
335 |
-failedAlign <- aln[filtered & !mapped] |
|
336 |
-failedAlign |
|
337 |
-@ |
|
338 |
-%% |
|
339 |
-Alternatively, we can extract just the short reads, and select the |
|
340 |
-subset of those that failed filtering. |
|
341 |
-<<sread-filter-fail-subset>>= |
|
342 |
-failedReads <- sread(aln)[filtered & !mapped] |
|
343 |
-@ |
|
139 |
+\section{Common work flows} |
|
344 | 140 |
|
345 | 141 |
\subsection{Quality assessment} |
346 | 142 |
|
347 |
-The \Rfunction{qa} function provides a convenient way to summarize |
|
348 |
-read and alignment quality. One way of obtaining quality assessment |
|
349 |
-results is |
|
350 |
-<<qa>>= |
|
351 |
-qaSummary <- qa(sp) |
|
143 |
+FASTQ files are often used for basic quality assessment, often to |
|
144 |
+augment the purely technical QA that might be provided by the |
|
145 |
+sequencing center with QA relevant to overall experimental design. A QA report is generated by creating a vector of paths to FASTQ files |
|
146 |
+<<qa-files, eval=FALSE>>= |
|
147 |
+fls <- dir("/path/to", "*fastq$", full=TRUE) |
|
352 | 148 |
@ |
353 |
-%% |
|
354 |
-The \Robject{qa} object is a list-like structure. As invoked above and |
|
355 |
-currently implemented, \Rfunction{qa} visits all |
|
356 |
-\texttt{s\_N\_export.txt} files in the appropriate directory. It |
|
357 |
-extracts useful information from the files, and summarizes the results |
|
358 |
-into a nested list-like structure. |
|
359 |
- |
|
360 |
-Evaluating \Rfunction{qa} for a single lane can take several |
|
361 |
-minutes. For this reason a common use case is to evaluate |
|
362 |
-\Rfunction{qa} and save the result to disk for later use, e.g., |
|
363 |
-<<eval=FALSE>>= |
|
364 |
-save(qaSummary, file="/path/to/file.rda") |
|
149 |
+\noindent collecting statistics over the files |
|
150 |
+<<qa-qa, eval=FALSE>>= |
|
151 |
+qaSummary <- qa(fls, type="fastq") |
|
365 | 152 |
@ |
366 |
-%% |
|
367 |
-A feature of \ShortRead{} is the use of \Rpackage{Rmpi} or |
|
368 |
-\Rpackage{multicore} and coarse-grained parallel processing when |
|
369 |
-available. Thus commands such as |
|
370 |
-<<eval=FALSE>>= |
|
371 |
-library("Rmpi") |
|
372 |
-mpi.spawn.Rslaves(nsl=8) |
|
373 |
-qaSummary <- qa(sp) |
|
374 |
-mpi.close.Rslaves() |
|
153 |
+\noindent and creating and viewing a report |
|
154 |
+<<qa-view, eval=FALSE>>= |
|
155 |
+browseURL(report(qaSummary)) |
|
375 | 156 |
@ |
376 |
-%% |
|
377 |
-or |
|
378 |
-<<eval=FALSE>>= |
|
379 |
-library(multicore) |
|
380 |
-qaSummary <- qa(sp) |
|
157 |
+\noindent By default, the report is based on a sample of 1M |
|
158 |
+reads. Parallel processing (across files) can be enabled by specifying |
|
159 |
+\Rcode{options(srapply\_fapply)}, as described in \Rcode{?srapply}. |
|
160 |
+ |
|
161 |
+These QA facilities are easily augmented by writing custom functions |
|
162 |
+for reads sampled from files, or by explorting the elements of the |
|
163 |
+object returned from \Rcode{qa()}, e.g., for an analysis of |
|
164 |
+ArrayExpress experiment E-MTAB-1147: |
|
165 |
+<<qa-files, echo=FALSE>>= |
|
166 |
+load("qa_E-MTAB-1147.Rda") |
|
381 | 167 |
@ |
382 |
-%% |
|
383 |
-will distribute the task of processing each lane to each of the |
|
384 |
-\Rpackage{Rmpi} workers or \Rpackage{multicore} cores. In the |
|
385 |
-\Rpackage{Rmpi} example, all 8 lanes of a Solexa experiment are |
|
386 |
-processed in the time take to process a single lane. |
|
387 |
-\Rpackage{multicore} may impose significant memory demands, as each |
|
388 |
-core will attempt to load a full lane of data. |
|
389 |
- |
|
390 |
-The elements of the quality assessment list are suggested by the |
|
391 |
-output: |
|
392 | 168 |
<<qa-elements>>= |
393 | 169 |
qaSummary |
394 | 170 |
@ |
... | ... |
@@ -396,8 +172,8 @@ qaSummary |
396 | 172 |
For instance, the count of reads in each lane is summarized in the |
397 | 173 |
\Robject{readCounts} element, and can be displayed as |
398 | 174 |
<<qa-readCounts>>= |
399 |
-qaSummary[["readCounts"]] |
|
400 |
-qaSummary[["baseCalls"]] |
|
175 |
+head(qaSummary[["readCounts"]]) |
|
176 |
+head(qaSummary[["baseCalls"]]) |
|
401 | 177 |
@ |
402 | 178 |
%% |
403 | 179 |
The \Robject{readCounts} element contains a data frame with 1 row and |
... | ... |
@@ -409,17 +185,6 @@ Solexa filtering criteria, and the number of reads aligned to the |
409 | 185 |
reference genome for the lane. The \Robject{baseCalls} element |
410 | 186 |
summarizes base calls in the unfiltered reads. |
411 | 187 |
|
412 |
-Other elements of \Robject{qaSummary} are more complicated, and their |
|
413 |
-interpretation is not directly obvious. Instead, a common use is to |
|
414 |
-forward the results of \Rfunction{qa} to a report generator. |
|
415 |
-<<report, eval=FALSE>>= |
|
416 |
-report(qaSummary, dest="/path/to/report_directory") |
|
417 |
-@ |
|
418 |
-%% |
|
419 |
-The report includes \R{} code that can be used to understand how |
|
420 |
-\Sexpr{class(qaSummary)}-class objects can be processed; reports are |
|
421 |
-generated as HTML suitable for browser viewing. |
|
422 |
- |
|
423 | 188 |
The functions that produce the report tables and graphics are |
424 | 189 |
internal to the package. They can be accessed through calling |
425 | 190 |
ShortRead:::functionName where functionName is one of the functions |
... | ... |
@@ -436,6 +201,47 @@ listed below, organized by report section. |
436 | 201 |
\item [] Adapter Contamination : .ppnCount |
437 | 202 |
\end{description} |
438 | 203 |
|
204 |
+\subsection{Filtering and trimming} |
|
205 |
+ |
|
206 |
+It is straight-forward to create filters to eliminate reads or to trim |
|
207 |
+reads based on diverse characteristics. The basic structure is to open |
|
208 |
+a FASTQ file, iterate through chunks of the file performing filtering |
|
209 |
+or trimming steps, and appending the filtered data to a new file. |
|
210 |
+<<filter-scheme>>= |
|
211 |
+myFilterAndTrim <- |
|
212 |
+ function(fl, destination=sprintf("%s_subset", fl)) |
|
213 |
+{ |
|
214 |
+ ## open input stream |
|
215 |
+ stream <- open(FastqStreamer(fl)) |
|
216 |
+ on.exit(close(stream)) |
|
217 |
+ |
|
218 |
+ repeat { |
|
219 |
+ ## input chunk |
|
220 |
+ fq <- yield(stream) |
|
221 |
+ if (length(fq) == 0) |
|
222 |
+ break |
|
223 |
+ |
|
224 |
+ ## trim and filter, e.g., reads cannot contain 'N'... |
|
225 |
+ fq <- fq[nFilter()(fq)] # see ?srFilter for pre-defined filters |
|
226 |
+ ## trim as soon as 2 of 5 nucleotides has quality encoding less |
|
227 |
+ ## than "4" (phred score 20) |
|
228 |
+ fq <- trimTailw(fq, 2, "4", 2) |
|
229 |
+ ## drop reads that are less than 36nt |
|
230 |
+ fq <- fq[width(fq) < 36] |
|
231 |
+ |
|
232 |
+ ## append to destination |
|
233 |
+ writeFastq(fq, destination, "a") |
|
234 |
+ } |
|
235 |
+} |
|
236 |
+@ |
|
237 |
+\noindent This is memory efficient and flexible. Care must be taken to |
|
238 |
+coordinate pairs of FASTQ files representing paired-end reads, to |
|
239 |
+preserve order. Processing several files in parallel might use |
|
240 |
+<<filter-several, eval=FALSE>>= |
|
241 |
+fls <- dir("/path/to", "fastq$", full=TRUE) |
|
242 |
+srapply(fls, myFilterAndTrim) |
|
243 |
+@ |
|
244 |
+ |
|
439 | 245 |
\section{Using \Rpackage{ShortRead} for data exploration} |
440 | 246 |
|
441 | 247 |
\subsection{Data I/O} |
... | ... |
@@ -450,6 +256,9 @@ sequence-like data. For instance, the Solexa files |
450 | 256 |
\texttt{s\_N\_export.txt} contain lines with the following |
451 | 257 |
information: |
452 | 258 |
<<export>>= |
259 |
+## location of file |
|
260 |
+exptPath <- system.file("extdata", package="ShortRead") |
|
261 |
+sp <- SolexaPath(exptPath) |
|
453 | 262 |
pattern <- "s_2_export.txt" |
454 | 263 |
fl <- file.path(analysisPath(sp), pattern) |
455 | 264 |
strsplit(readLines(fl, n=1), "\t") |
... | ... |
@@ -470,10 +279,10 @@ package. \Rfunction{readXStringColumns} allows us to read in columns |
470 | 279 |
of text as these classes. |
471 | 280 |
|
472 | 281 |
Important arguments for \Rfunction{readXStringColumns} are the |
473 |
-\Rfunarg{dirPath} in which to look for files, the \Rfunarg{pattern} of |
|
474 |
-files to parse, and the \Rfunarg{colClasses} of the columns to be |
|
475 |
-parsed. The \Rfunarg{dirPath} and \Rfunarg{pattern} arguments are like |
|
476 |
-\Rfunarg{list.files}. \Rfunarg{colClasses} is like the corresponding |
|
282 |
+\Rcode{dirPath} in which to look for files, the \Rcode{pattern} of |
|
283 |
+files to parse, and the \Rcode{colClasses} of the columns to be |
|
284 |
+parsed. The \Rcode{dirPath} and \Rcode{pattern} arguments are like |
|
285 |
+\Rcode{list.files}. \Rcode{colClasses} is like the corresponding |
|
477 | 286 |
argument to \Rfunction{read.table}: it is a \Rclass{list} specifying |
478 | 287 |
the class of each column to be read, or \Robject{NULL} if the column |
479 | 288 |
is to be ignored. In our case there are 21 columns, and we would like |
... | ... |
@@ -498,7 +307,7 @@ The file has been parsed, and appropriate data objects were created. |
498 | 307 |
|
499 | 308 |
A feature of \Rfunction{readXStringColumns} and other input functions |
500 | 309 |
in the \Rpackage{ShortRead} package is that all files matching |
501 |
-\Rfunarg{pattern} in the specified \Rfunarg{dirPath} will be read into |
|
310 |
+\Rcode{pattern} in the specified \Rcode{dirPath} will be read into |
|
502 | 311 |
a single object. This provides a convenient way to, for instance, |
503 | 312 |
parse all tiles in a Solexa lane into a single \Rclass{DNAStringSet} |
504 | 313 |
object. |
... | ... |
@@ -516,75 +325,6 @@ related classes are used extensively in \Rpackage{ShortRead}, |
516 | 325 |
\Rpackage{Biostrings}, \Rpackage{BSgenome} and other packages relevant |
517 | 326 |
to short read technology. |
518 | 327 |
|
519 |
-\subsubsection{\Rfunction{readFastq}} |
|
520 |
- |
|
521 |
-\Rfunction{readXStringColumns} should be considered a `low-level' |
|
522 |
-function providing easy access to columns of data. Another flexible |
|
523 |
-input function is \Rfunction{readFastq}. Fastq files combine reads and |
|
524 |
-their base qualities in four-line records such as the following: |
|
525 |
-<<fastq-format>>= |
|
526 |
-fqpattern <- "s_1_sequence.txt" |
|
527 |
-fl <- file.path(analysisPath(sp), fqpattern) |
|
528 |
-readLines(fl, 4) |
|
529 |
-@ |
|
530 |
-% |
|
531 |
-The first and third lines are an identifier (encoding the machine, run, |
|
532 |
-lane, tile, x and y coordinates of the cluster that gave rise to the |
|
533 |
-read, in this case). The second line is the read, and the fourth line |
|
534 |
-the per-base quality. Files of this sort can be read in as |
|
535 |
-<<readFastq>>= |
|
536 |
-fq <- readFastq(sp, fqpattern) |
|
537 |
-fq |
|
538 |
-@ |
|
539 |
-% |
|
540 |
-This resulting object (of class \Sexpr{class(fq)}) contains the short |
|
541 |
-reads, their qualities, and the identifiers: |
|
542 |
-<<ShortReadQ>>= |
|
543 |
-reads <- sread(fq) |
|
544 |
-qualities <- quality(fq) |
|
545 |
-class(qualities) |
|
546 |
-id(fq) |
|
547 |
-@ |
|
548 |
-% |
|
549 |
-Notice that the class of the qualities is \Rclass{SFastqQuality}, to |
|
550 |
-indicate that these are quality scores derived using the Solexa |
|
551 |
-convention, rather than ordinary \Rclass{BStringSet} objects. |
|
552 |
- |
|
553 |
-The object has essential operations for convenient manipulation, for |
|
554 |
-instance simultaneously forming the subset of all three components: |
|
555 |
-<<ShortReadQ-subset>>= |
|
556 |
-fq[1:5] |
|
557 |
-@ |
|
558 |
- |
|
559 |
-\subsubsection{Additional input functions} |
|
560 |
- |
|
561 |
-\ShortRead{} includes additional functions to facilitate input. For |
|
562 |
-instance, \Rfunction{readPrb} reads Solexa \texttt{\_prb.txt} |
|
563 |
-files. These files contain base-specific quality information, and |
|
564 |
-\Rfunction{readPrb} returns an \Rclass{SFastqQuality}-class object |
|
565 |
-representing the fastq-encoded base-specific quality scores of all |
|
566 |
-reads. |
|
567 |
- |
|
568 |
-As a second example, the \texttt{s\_N\_LLLL\_int.txt} files in the |
|
569 |
-\Rfunction{imageAnalysisPath} directory contain lines, one line per |
|
570 |
-read, of nucleotide intensities. Each line contain lane, tile, X and Y |
|
571 |
-coordinate information, followed by quadruplets of intensity values. |
|
572 |
-There are as many quadruplets as there are cycles. Each quadruplet |
|
573 |
-represents the intensity of the \texttt{A}, \texttt{C}, \texttt{G}, |
|
574 |
-and \texttt{T} nucleotide at the corresponding cycle. These (and their |
|
575 |
-error estimates, if available), are input with |
|
576 |
-<<intensity-files>>= |
|
577 |
-int <- readIntensities(sp, withVariability=FALSE) |
|
578 |
-int |
|
579 |
-@ |
|
580 |
-%% |
|
581 |
-An interesting exercise is to display the intensities at cycle 2 |
|
582 |
-(below) and to compare these to cycle, e.g., 30. |
|
583 |
-<<intensities-cycle-2, fig=TRUE>>= |
|
584 |
-print(splom(intensity(int)[[,,2]], pch=".", cex=3)) |
|
585 |
-@ |
|
586 |
- |
|
587 |
-Additional files can be parsed using standard \R{} input methods. |
|
588 | 328 |
|
589 | 329 |
\subsection{Sorting} |
590 | 330 |
|
... | ... |
@@ -609,7 +349,7 @@ prior to PCR. |
609 | 349 |
|
610 | 350 |
The \Rfunction{tables} function summarizes read occurrences, for instance, |
611 | 351 |
<<tables>>= |
612 |
-tbls <- tables(aln) |
|
352 |
+tbls <- tables(fq) |
|
613 | 353 |
names(tbls) |
614 | 354 |
tbls$top[1:5] |
615 | 355 |
head(tbls$distribution) |
... | ... |
@@ -621,8 +361,9 @@ reads. Knowledgeable readers will recognize the top-occurring read as a |
621 | 361 |
close match to one of the manufacturer adapters. |
622 | 362 |
|
623 | 363 |
The \Robject{distribution} component returned by \Robject{tables} is a |
624 |
-data frame that summarizes how many reads (e.g., \Sexpr{tbls[["distribution"]][1,"nReads"]}) |
|
625 |
-are represented exactly \Sexpr{tbls[["distribution"]][1,"nOccurrences"]} times. |
|
364 |
+data frame that summarizes how many reads (e.g., |
|
365 |
+\Sexpr{tbls[["distribution"]][1,"nReads"]}) are represented exactly |
|
366 |
+\Sexpr{tbls[["distribution"]][1,"nOccurrences"]} times. |
|
626 | 367 |
|
627 | 368 |
\subsection{Finding near matches to short sequences} |
628 | 369 |
|
... | ... |
@@ -635,13 +376,13 @@ is of the same order as the reads themselves (10's to 100's of bases). |
635 | 376 |
To find reads close to the most common read in the example above, one |
636 | 377 |
might say |
637 | 378 |
<<srdistance>>= |
638 |
-dist <- srdistance(sread(aln), names(tbls$top)[1])[[1]] |
|
379 |
+dist <- srdistance(sread(fq), names(tbls$top)[1])[[1]] |
|
639 | 380 |
table(dist)[1:10] |
640 | 381 |
@ |
641 | 382 |
%% |
642 |
-`Near' matches can be filtered from the alignment, e.g., |
|
383 |
+`Near' matches can be filtered, e.g., |
|
643 | 384 |
<<aln-not-near>>= |
644 |
-alnSubset <- aln[dist>4] |
|
385 |
+fqSubset <- fq[dist>4] |
|
645 | 386 |
@ |
646 | 387 |
|
647 | 388 |
A different strategy can be used to tally or eliminate reads that consist |
... | ... |
@@ -650,8 +391,8 @@ calculates the frequency of each nucleotide (in DNA strings) or letter |
650 | 391 |
(for other string sets) in each read. Thus one could identify and |
651 | 392 |
eliminate reads with more than 30 adenine nucleotides with |
652 | 393 |
<<polya>>= |
653 |
-countA <- alphabetFrequency(sread(aln))[,"A"] |
|
654 |
-alnNoPolyA <- aln[countA < 30] |
|
394 |
+countA <- alphabetFrequency(sread(fq))[,"A"] |
|
395 |
+fqNoPolyA <- fq[countA < 30] |
|
655 | 396 |
@ |
656 | 397 |
%% |
657 | 398 |
\Rfunction{alphabetFrequency}, which simply counts nucleotides, is |
... | ... |
@@ -663,174 +404,20 @@ pairwise aligment are encouraged to investigate the |
663 | 404 |
\Rpackage{Biostrings} package, especially the \Rclass{PDict} class and |
664 | 405 |
\Rfunction{matchPDict} and \Rfunction{pairwiseAlignment} functions. |
665 | 406 |
|
666 |
-\subsection{The \Rfunction{coverage} function} |
|
667 |
- |
|
668 |
-The \Rfunction{coverage} function provides a way to summarize where |
|
669 |
-reads align on a reference sequence. The idea is that the aligned |
|
670 |
-reads, or under some analyses the extension of those aligned reads by |
|
671 |
-an amount meant to estimate the actual fragment size, `pile up' on top |
|
672 |
-of nucleotide positions in the reference sequence. A convenient |
|
673 |
-summary of the alignment of many reads is thus a vector describing the |
|
674 |
-depth of the pile at each position in the reference sequence. A |
|
675 |
-typical work flow invokes \Rfunction{coverage} on an instance of the |
|
676 |
-\Rclass{AlignedRead} class obtained from \Rfunction{readAligned}; |
|
677 |
-additional methods offering greater control operate on |
|
678 |
-\Rclass{IRanges} directly. The \Rfunction{coverage} methods returns a |
|
679 |
-run-length encoding of the pile-up (or a list of such run length |
|
680 |
-encodings). The run-length encoding returned by \Rfunction{coverage} |
|
681 |
-is a space-efficient representation; the long integer vector can be |
|
682 |
-recovered with \Rcode{as.integer}. |
|
683 |
- |
|
684 |
-There are complicated issues associated with use of |
|
685 |
-\Rfunction{coverage}, relating to how software reports the `position' |
|
686 |
-of an alignment, especially on the minus strand. These issues are |
|
687 |
-illustrated in figure~\ref{fig:coverage}. |
|
688 |
-\begin{figure} |
|
689 |
-\begin{verbatim} |
|
690 |
-'leftmost': |
|
691 |
- P |
|
692 |
- +++++---------- |
|
693 |
-'+' strand: 5' ....|....|....|....|....|....|.. 3' |
|
694 |
-'-' strand: 3' ....|....|....|....|....|....|.. 5' |
|
695 |
- ----------+++++ |
|
696 |
- P |
|
697 |
- |
|
698 |
-'fiveprime': |
|
699 |
- P |
|
700 |
- +++++---------- |
|
701 |
-'+' strand: 5' ....|....|....|....|....|....|.. 3' |
|
702 |
-'-' strand: 3' ....|....|....|....|....|....|.. 5' |
|
703 |
- ----------+++++ |
|
704 |
- P |
|
705 |
-\end{verbatim} |
|
706 |
- \caption{Alignment schemes used by |
|
707 |
- \Rfunction{coverage}. \texttt{+++} represents the read and |
|
708 |
- \texttt{---} the extension. \texttt{P} is the alignment position |
|
709 |
- as recorded under the corresponding leftmost or fiveprime |
|
710 |
- schemes.} |
|
711 |
- \label{fig:coverage} |
|
712 |
-\end{figure} |
|
713 |
-In the figure, the two strands are represented by \verb"....|", |
|
714 |
-aligned reads by \verb"+++", and extensions by \verb"---". The idea |
|
715 |
-is that 5-nucleotide reads have been aligned to a reference sequence, |
|
716 |
-and the alignment extended by 10 nucleotides. In the `leftmost' |
|
717 |
-notation (used by \ELAND{}) and assuming that the reference sequence |
|
718 |
-is always numbered in relation to the plus strand and indexed starting |
|
719 |
-at 1 (\Rfunction{readAligned} translates reported alignment positions |
|
720 |
-so they are indexed from 1), the reported position is 15 for the |
|
721 |
-alignments on either the plus or the minus strand. In contrast the |
|
722 |
-`fiveprime' scheme the alignment to the plus strand is 15, and to the |
|
723 |
-minus strand 19. This is the scheme used by \MAQ{}, for instance. |
|
724 |
- |
|
725 |
-The default behavior of \Rfunction{coverage} is to use the `leftmost' |
|
726 |
-coordinate system. This is appropriate for data derived from \ELAND{}. |
|
727 |
- |
|
728 |
-\section{Advanced features} |
|
729 |
- |
|
730 |
-\subsection{The \Rfunarg{pattern} argument to input functions} |
|
731 |
- |
|
732 |
-Most \ShortRead{} input functions are designed to accept a directory |
|
733 |
-path argument, and a \Rfunarg{pattern} argument. The latter is a |
|
734 |
-grep-like pattern (as used by, e.g., \Rfunction{list.files}). Many |
|
735 |
-input functions are implemented so that all files matching the pattern |
|
736 |
-are read into a single large input object. Thus the |
|
737 |
-\texttt{s\_N\_LLLL\_seq.txt} files consist of four numeric columns and a |
|
738 |
-fifth column corresponding to the short read. The following code |
|
739 |
-illustrates the file structure and inputs the final column into a |
|
740 |
-\Rclass{DNAStringSet}: |
|
741 |
-<<readSeq>>= |
|
742 |
-seqFls <- list.files(baseCallPath(sp), "_seq.txt", full=TRUE) |
|
743 |
-strsplit(readLines(seqFls[[1]], 1), "\t") |
|
744 |
-colClasses <- c(rep(list(NULL), 4), "DNAString") |
|
745 |
-reads <- readXStringColumns(baseCallPath(sp), "s_1_0001_seq.txt", |
|
746 |
- colClasses=colClasses) |
|
747 |
-@ |
|
748 |
-%% |
|
749 |
-The more general pattern |
|
750 |
-<<readSeq-all>>= |
|
751 |
-reads <- readXStringColumns(baseCallPath(sp), "s_1_.*_seq.txt", |
|
752 |
- colClasses=colClasses) |
|
753 |
-@ |
|
754 |
-%% |
|
755 |
-inputs all lane 1 tile files into a single \Rclass{DNAStringSet} object. |
|
756 |
- |
|
757 |
-\subsection{\Rfunction{srapply}} |
|
758 |
- |
|
759 |
-Solexa and other short read technologies often include many files, |
|
760 |
-e.g., one \texttt{s\_L\_NNNN\_int.txt} file per tile, 300 tiles per |
|
761 |
-lane, 8 lanes per flow cell for 2400 \texttt{s\_L\_NNNN\_int.txt} files |
|
762 |
-per flow cell. A natural way to extract information from these is to |
|
763 |
-write short functions, e.g., to find the average intensity per base at |
|
764 |
-cycle 12. |
|
765 |
-<<calcInt-demo>>= |
|
766 |
-calcInt <- function(file, cycle, verbose=FALSE) |
|
767 |
-{ |
|
768 |
- if (verbose) |
|
769 |
- cat("calcInt", file, cycle, "\n") |
|
770 |
- int <- readIntensities(dirname(file), basename(file), |
|
771 |
- intExtension="", withVariability=FALSE) |
|
772 |
- apply(intensity(int)[,,12], 2, mean) |
|
773 |
-} |
|
774 |
-@ |
|
775 |
-One way to apply this function to all intensity files in a Solexa run |
|
776 |
-is |
|
777 |
-<<calcInt-sapply>>= |
|
778 |
-intFls <- list.files(imageAnalysisPath(sp), ".*_int.txt$", full=TRUE) |
|
779 |
-lres <- lapply(intFls, calcInt, cycle=12) |
|
780 |
-@ |
|
781 |
-%% |
|
782 |
-The files are generally large and numerous, so even simple |
|
783 |
-calculations consume significant computational resources. The |
|
784 |
-\Rfunction{srapply} function is meant to provide a transparent way to |
|
785 |
-perform calculations like this distributed over multiple nodes of an |
|
786 |
-MPI cluster, or across multiple cores of a single machine. Thus |
|
787 |
-<<srapply-simple>>= |
|
788 |
-srres <- srapply(intFls, calcInt, cycle=12) |
|
789 |
-identical(lres, srres) |
|
790 |
-@ |
|
791 |
-%% |
|
792 |
-evaluates the function as \Rfunction{lapply}, whereas |
|
793 |
-<<srapply-mpi, eval=FALSE>>= |
|
794 |
-library("Rmpi") |
|
795 |
-mpi.spawn.Rslaves(nsl=16) |
|
796 |
-srres <- srapply(intFls, calcInt, cycle=12) |
|
797 |
-mpi.close.Rslaves() |
|
798 |
-@ |
|
799 |
-%% |
|
800 |
-distributes the calculation over available workers, while |
|
801 |
-<<srapply-multicore, eval=FALSE>>= |
|
802 |
-library(multicore) |
|
803 |
-srres <- srapply(intFls, calcInt, cycle=12) |
|
804 |
-@ |
|
805 |
-%% |
|
806 |
-distributes tasks across cores of a single machine. The result is a |
|
807 |
-speedup approximately inversely proportional to the number of |
|
808 |
-available compute nodes or cores; memory requirements for |
|
809 |
-the \Rpackage{multicore} approach may be substantial. |
|
810 |
- |
|
811 |
-\section{Conclusions and directions for development} |
|
812 |
- |
|
813 |
-\ShortRead{} provides tools for reading, manipulation, and quality |
|
814 |
-assessment of short read data. Current facilities in \ShortRead{} |
|
815 |
-emphasize processing of single-end Solexa data. |
|
407 |
+\section{Legacy support for early file formats} |
|
816 | 408 |
|
817 |
-Development priorities in the near term include expanded facilities |
|
818 |
-for importing key file types from additional manufactures, more |
|
819 |
-extensive quality assessment methodologies, and development of |
|
820 |
-infrastructure for paired-end reads. |
|
409 |
+The \Biocpkg{ShortRead} package contains functions and classes to |
|
410 |
+support early file formats and ungapped alignments. Help pages are |
|
411 |
+flagged as `legacy'; versions of \Biocpkg{ShortRead} prior to 1.21 |
|
412 |
+(\Bioconductor{} version 2.13) contain a vignette illustrating common |
|
413 |
+work flows with these file formats. |
|
821 | 414 |
|
822 | 415 |
%--------------------------------------------------------- |
823 | 416 |
% SessionInfo |
824 | 417 |
%--------------------------------------------------------- |
825 |
-\begin{table*}[tbp] |
|
826 |
-\begin{minipage}{\textwidth} |
|
418 |
+\section{sessionInfo} |
|
827 | 419 |
<<sessionInfo, results=tex, print=TRUE>>= |
828 | 420 |
toLatex(sessionInfo()) |
829 | 421 |
@ |
830 |
-\end{minipage} |
|
831 |
-\caption{\label{tab:sessioninfo}% |
|
832 |
-The output of \Rfunction{sessionInfo} on the build system |
|
833 |
-after running this vignette.} |
|
834 |
-\end{table*} |
|
835 | 422 |
|
836 | 423 |
\end{document} |