Browse code

mark readAligned and other early functions 'legacy'

- revise vignette to reflect working with short reads, not alignments
- update example data


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/ShortRead@82095 bc3139a8-67e5-0310-9ffc-ced21a209358

Martin Morgan authored on 24/10/2013 04:05:22
Showing 37 changed files

... ...
@@ -1,19 +1,24 @@
1 1
 Package: ShortRead
2 2
 Type: Package
3
-Title: Classes and methods for high-throughput short-read sequencing data.
4
-Version: 1.21.1
3
+Title: FASTQ input and manipulation
4
+Version: 1.21.2
5 5
 Author: Martin Morgan, Michael Lawrence, Simon Anders
6 6
 Maintainer: Bioconductor Package Maintainer <maintainer@bioconductor.org>
7
-Description: Base classes, functions, and methods for representation of
8
-             high-throughput, short-read sequencing data.
7
+Description: This package implements sampling, iteration, and input of
8
+    FASTQ files. The package includes functions for filtering and
9
+    trimming reads, and for generating a quality assessment
10
+    report. Data are represented as DNAStringSet-derived objects, and
11
+    easily manipulated for a diversity of purposes.  The package also
12
+    contains legacy support for early single-end, ungapped alignment
13
+    formats.
9 14
 License: Artistic-2.0
10 15
 LazyLoad: yes
11
-Depends: methods, BiocGenerics (>= 0.1.0), IRanges (>= 1.19.34),
12
-         GenomicRanges (>= 1.13.43), Biostrings (>= 2.29.18),
13
-         lattice, Rsamtools (>= 1.13.1)
14
-Imports: Biobase, hwriter, zlibbioc, latticeExtra
16
+Depends: BiocGenerics (>= 0.1.0), IRanges (>= 1.19.34),
17
+    GenomicRanges (>= 1.13.43), Biostrings (>= 2.29.18),
18
+    Rsamtools (>= 1.13.1)
19
+Imports: Biobase, hwriter, methods, zlibbioc, lattice, latticeExtra
15 20
 Enhances: Rmpi, parallel
16
-Suggests: biomaRt, RUnit, GenomicFeatures, yeastNagalakshmi
21
+Suggests: BiocStyle, RUnit, biomaRt, GenomicFeatures, yeastNagalakshmi
17 22
 LinkingTo: IRanges, XVector, Biostrings
18 23
 biocViews: DataImport, Sequencing, HighThroughputSequencing, 
19 24
            QualityControl
... ...
@@ -71,14 +71,14 @@
71 71
 setMethod(qa, "ShortReadQ", .qa_ShortReadQ)
72 72
 
73 73
 .qa_fastq_lane <-
74
-    function(dirPath, pattern, ..., sample=TRUE, type="fastq", 
74
+    function(dirPath, ..., sample=TRUE, type="fastq", 
75 75
         verbose=FALSE)
76 76
 {
77 77
     if (verbose)
78 78
         message("qa 'fastq' pattern:", pattern)
79 79
     if (sample) {
80
-        samp <- FastqSampler(file.path(dirPath, pattern), ...)
81
-        qa <- qa(yield(samp), pattern, ..., verbose=verbose)
80
+        samp <- FastqSampler(dirPath, ...)
81
+        qa <- qa(yield(samp), basename(dirPath), ..., verbose=verbose)
82 82
         close(samp)
83 83
         elts <- .srlist(qa)
84 84
         elts$readCounts$read <- samp$status()[["total"]]
... ...
@@ -95,8 +95,7 @@ setMethod(qa, "ShortReadQ", .qa_ShortReadQ)
95 95
 {
96 96
     fls <- .file_names(dirPath, pattern)
97 97
     lst <-
98
-        srapply(basename(fls), .qa_fastq_lane,
99
-                dirPath=dirPath, type=type, ...,
98
+        srapply(fls, .qa_fastq_lane, type=type, ...,
100 99
                 reduce=.reduce(1), verbose=verbose, 
101 100
                 USE.NAMES=TRUE)
102 101
     lst <- do.call(rbind, lst)
... ...
@@ -108,7 +108,7 @@ uniqueFilter <-
108 108
 }
109 109
 
110 110
 occurrenceFilter <-
111
-    function(min=1L, max=1L, withSread=c(TRUE, FALSE, NA),
111
+    function(min=1L, max=1L, withSread=c(NA, TRUE, FALSE),
112 112
              duplicates=c("head", "tail", "sample", "none"),
113 113
              .name=.occurrenceName(min, max, withSread,
114 114
                  duplicates))
115 115
old mode 100644
116 116
new mode 100755
117 117
new file mode 100644
118 118
Binary files /dev/null and b/inst/extdata/E-MTAB-1147/ERR127302_1_subset.fastq.gz differ
119 119
new file mode 100644
120 120
Binary files /dev/null and b/inst/extdata/E-MTAB-1147/ERR127302_2_subset.fastq.gz differ
... ...
@@ -4,7 +4,7 @@
4 4
 \alias{append,AlignedDataFrame,AlignedDataFrame-method}
5 5
 
6 6
 \title{
7
-  "AlignedDataFrame" representing alignment annotations as a data frame
7
+  (Legacy) "AlignedDataFrame" representing alignment annotations as a data frame
8 8
 }
9 9
 \description{
10 10
 
... ...
@@ -1,7 +1,7 @@
1 1
 \name{AlignedDataFrame}
2 2
 \alias{AlignedDataFrame}
3 3
 
4
-\title{AlignedDataFrame constructor}
4
+\title{(Legacy) AlignedDataFrame constructor}
5 5
 \description{
6 6
 
7 7
   Construct an \code{AlignedDataFrame} from a data frame and its
... ...
@@ -27,7 +27,7 @@
27 27
 \alias{detail,AlignedRead-method}
28 28
 \alias{show,AlignedRead-method}
29 29
 
30
-\title{"AlignedRead" class for aligned short reads}
30
+\title{(Legacy) "AlignedRead" class for aligned short reads}
31 31
 
32 32
 \description{
33 33
 
... ...
@@ -1,7 +1,7 @@
1 1
 \name{AlignedRead}
2 2
 \alias{AlignedRead}
3 3
 
4
-\title{Construct objects of class "AlignedRead"}
4
+\title{(Legacy) Construct objects of class "AlignedRead"}
5 5
 \description{
6 6
 
7 7
   This function constructs objects of
... ...
@@ -4,7 +4,7 @@
4 4
 \alias{report,BowtieQA-method}
5 5
 \alias{report_html,BowtieQA-method}
6 6
 
7
-\title{Quality assessment summaries from Bowtie files}
7
+\title{(Legacy) Quality assessment summaries from Bowtie files}
8 8
 
9 9
 \description{
10 10
 
... ...
@@ -7,7 +7,7 @@
7 7
 \alias{show,ExperimentPath-method}
8 8
 \alias{detail,ExperimentPath-method}
9 9
 
10
-\title{"ExperimentPath" class representing a file hierarchy of data
10
+\title{(Legacy) "ExperimentPath" class representing a file hierarchy of data
11 11
   files}
12 12
 
13 13
 \description{
... ...
@@ -2,7 +2,6 @@
2 2
 \docType{class}
3 3
 
4 4
 % Class:
5
-\alias{class:GappedReads}
6 5
 \alias{GappedReads-class}
7 6
 \alias{GappedReads}
8 7
 
... ...
@@ -22,7 +21,7 @@
22 21
 \alias{narrow,GappedReads-method}
23 22
 
24 23
 
25
-\title{GappedReads objects}
24
+\title{(Legacy) GappedReads objects}
26 25
 
27 26
 \description{
28 27
   The GappedReads class extends the \link[GenomicRanges]{GAlignments}
... ...
@@ -21,7 +21,7 @@
21 21
 \alias{show,Intensity-method}
22 22
 \alias{show,IntensityMeasure-method}
23 23
 
24
-\title{"Intensity", "IntensityInfo", and "IntensityMeasure" base
24
+\title{(Legacy) "Intensity", "IntensityInfo", and "IntensityMeasure" base
25 25
   classes for short read image intensities}
26 26
 
27 27
 \description{
... ...
@@ -5,7 +5,7 @@
5 5
 \alias{report,MAQMapQA-method}
6 6
 \alias{report_html,MAQMapQA-method}
7 7
 
8
-\title{Quality assessment summaries from MAQ map files}
8
+\title{(Legacy) Quality assessment summaries from MAQ map files}
9 9
 
10 10
 \description{
11 11
 
... ...
@@ -23,7 +23,7 @@
23 23
 %
24 24
 \alias{RochePath}
25 25
 
26
-\title{"RochePath" class representing a Roche (454) experiment location}
26
+\title{(Legacy) "RochePath" class representing a Roche (454) experiment location}
27 27
 
28 28
 \description{
29 29
 
... ...
@@ -3,7 +3,7 @@
3 3
 \alias{RocheSet-class}
4 4
 \alias{RocheSet}
5 5
 
6
-\title{Roche (454) experiment-wide data container}
6
+\title{(Legacy) Roche (454) experiment-wide data container}
7 7
 \description{
8 8
 
9 9
   This class is meant to coordinate all data in a Roche (454)
... ...
@@ -2,7 +2,7 @@
2 2
 \docType{class}
3 3
 \alias{RtaIntensity-class}
4 4
 
5
-\title{Class "RtaIntensity"}
5
+\title{(Legacy) Class "RtaIntensity"}
6 6
 \description{
7 7
 
8 8
   Subclass of \code{\linkS4class{Intensity}} for representing image
... ...
@@ -1,7 +1,7 @@
1 1
 \name{RtaIntensity}
2 2
 \alias{RtaIntensity}
3 3
 
4
-\title{Construct objects of class "RtaIntensity"}
4
+\title{(Legacy) Construct objects of class "RtaIntensity"}
5 5
 
6 6
 \description{
7 7
 
... ...
@@ -10,7 +10,8 @@
10 10
 \alias{show,SRSet-method}
11 11
 \alias{detail,SRSet-method}
12 12
 
13
-\title{A base class for Roche experiment-wide data}
13
+\title{(Legacy) A base class for Roche experiment-wide data}
14
+
14 15
 \description{
15 16
   This class coordinates phenotype (sample) and sequence data, primarily
16 17
   as used on the Roche platform.
... ...
@@ -8,13 +8,17 @@
8 8
 \docType{package}
9 9
 \title{
10 10
 
11
-  Base classes and methods for high-throughput short-read sequencing data.
11
+  FASTQ input and manipulation.
12 12
 
13 13
 }
14 14
 \description{
15 15
 
16
-  Base classes, functions, and methods for representation of
17
-  high-throughput, short-read sequencing data.
16
+  This package implements sampling, iteration, and input of FASTQ
17
+  files. The package includes functions for filtering and trimming
18
+  reads, and for generating a quality assessment report. Data are
19
+  represented as DNAStringSet-derived objects, and easily manipulated
20
+  for a diversity of purposes.  The package also contains legacy support
21
+  for early single-end, ungapped alignment formats.
18 22
 
19 23
 }
20 24
 \details{
... ...
@@ -8,7 +8,8 @@
8 8
 \alias{report_html,SolexaRealignQA-method}
9 9
 \alias{show,SolexaExportQA-method}
10 10
 
11
-\title{Quality assessment summaries from Solexa export and realign files}
11
+\title{(Legacy) Quality assessment summaries from Solexa export and
12
+  realign files}
12 13
 
13 14
 \description{
14 15
 
... ...
@@ -2,7 +2,8 @@
2 2
 \alias{SolexaIntensity}
3 3
 \alias{SolexaIntensityInfo}
4 4
 
5
-\title{Construct objects of class "SolexaIntensity" and "SolexaIntensityInfo"}
5
+\title{(Legacy) Construct objects of class "SolexaIntensity" and
6
+"SolexaIntensityInfo"}
6 7
 
7 8
 \description{
8 9
 
... ...
@@ -18,7 +18,7 @@
18 18
 \alias{readBaseQuality,SolexaPath-method}
19 19
 \alias{readAligned,SolexaPath-method}
20 20
 
21
-\title{"SolexaPath" class representing a standard output file hierarchy}
21
+\title{(Legacy) "SolexaPath" class representing a standard output file hierarchy}
22 22
 \description{
23 23
 
24 24
   Solexa produces a hierarchy of output files. The content of the
... ...
@@ -12,7 +12,7 @@
12 12
 % transforming methods
13 13
 \alias{readAligned,SolexaSet-method}
14 14
 
15
-\title{"SolexaSet" coordinating Solexa output locations with sample annotations}
15
+\title{(Legacy) "SolexaSet" coordinating Solexa output locations with sample annotations}
16 16
 \description{
17 17
 
18 18
   This class coordinates the file hierarchy produced by the Solexa
... ...
@@ -72,16 +72,16 @@ x = numeric(1000)
72 72
 x[sample(1000, 100)] <- abs(rnorm(100))
73 73
 df <- data.frame(x = c(x, -x), pos = seq(1, 1e5, length.out=1000),
74 74
                  group = rep(c("positive", "negative"), each=1000))
75
-cv <- xyplot(x ~ pos, df, group=group, type="s",
76
-             col=col, main="yeast chrI:1 - 2e5",
77
-             ylab="Coverage", xlab="Coordinate",
78
-             scales=list(y=list(tck=c(1,0)),
79
-                         x=list(rot=45, tck=c(1,0), tick.number=20)),
80
-             panel=function(...) {
81
-                     panel.xyplot(...)
82
-                     panel.grid(h=-1, v=20)
83
-                     panel.abline(a=0, b=0, col="grey")
84
-             })
75
+cv <- lattice::xyplot(x ~ pos, df, group=group, type="s",
76
+    col=col, main="yeast chrI:1 - 2e5",
77
+    ylab="Coverage", xlab="Coordinate",
78
+    scales=list(y=list(tck=c(1,0)),
79
+                x=list(rot=45, tck=c(1,0), tick.number=20)),
80
+    panel=function(...) {
81
+            lattice::panel.xyplot(...)
82
+            lattice::panel.grid(h=-1, v=20)
83
+            lattice::panel.abline(a=0, b=0, col="grey")
84
+    })
85 85
 s <- SpTrellis(cv)
86 86
 s
87 87
 zi(s)
... ...
@@ -24,7 +24,7 @@
24 24
 \alias{laneDescription}
25 25
 \alias{laneNames}
26 26
 
27
-\title{Accessors for ShortRead classes}
27
+\title{(Legacy) Accessors for ShortRead classes}
28 28
 
29 29
 \description{
30 30
 
... ...
@@ -3,7 +3,7 @@
3 3
 \alias{readAligned}
4 4
 \alias{readAligned,character-method}
5 5
 
6
-\title{Read aligned reads and their quality scores into R representations}
6
+\title{(Legacy) Read aligned reads and their quality scores into R representations}
7 7
 
8 8
 \description{
9 9
 
... ...
@@ -3,7 +3,7 @@
3 3
 \alias{readBaseQuality}
4 4
 \alias{readBaseQuality,character-method}
5 5
 
6
-\title{Read short reads and their quality scores into R representations}
6
+\title{(Legacy) Read short reads and their quality scores into R representations}
7 7
 
8 8
 \description{
9 9
 
... ...
@@ -1,7 +1,7 @@
1 1
 \name{readBfaToc}
2 2
 \alias{readBfaToc}
3 3
 
4
-\title{Get a list of the sequences in a Maq .bfa file}
4
+\title{(Legacy) Get a list of the sequences in a Maq .bfa file}
5 5
 
6 6
 \description{
7 7
 As \code{\link{coverage}} needs to know the lengths of the reference sequences,
... ...
@@ -3,7 +3,7 @@
3 3
 \alias{readIntensities}
4 4
 \alias{readIntensities,character-method}
5 5
 
6
-\title{Read Illumina image intensity files}
6
+\title{(Legacy) Read Illumina image intensity files}
7 7
 
8 8
 \description{
9 9
 
... ...
@@ -3,7 +3,7 @@
3 3
 \alias{readPrb}
4 4
 \alias{readPrb,character-method}
5 5
 
6
-\title{Read Solexa prb files as fastq-style quality scores}
6
+\title{(Legacy) Read Solexa prb files as fastq-style quality scores}
7 7
 
8 8
 \description{
9 9
 
... ...
@@ -3,7 +3,7 @@
3 3
 \alias{readQseq}
4 4
 \alias{readQseq,character-method}
5 5
 
6
-\title{Read Solexa qseq files as fastq-style quality scores}
6
+\title{(Legacy) Read Solexa qseq files as fastq-style quality scores}
7 7
 
8 8
 \description{
9 9
 
... ...
@@ -35,12 +35,8 @@ compose(filt, ..., .name)
35 35
 
36 36
 idFilter(regex=character(0), fixed=FALSE, exclude=FALSE,
37 37
          .name="idFilter")
38
-chromosomeFilter(regex=character(0), fixed=FALSE, exclude=FALSE,
39
-                 .name="ChromosomeFilter")
40
-positionFilter(min=-Inf, max=Inf, .name="PositionFilter")
41
-strandFilter(strandLevels=character(0), .name="StrandFilter")
42 38
 occurrenceFilter(min=1L, max=1L,
43
-                 withSread=c(TRUE, FALSE, NA),
39
+                 withSread=c(NA, TRUE, FALSE),
44 40
                  duplicates=c("head", "tail", "sample", "none"),
45 41
                  .name=.occurrenceName(min, max, withSread,
46 42
                                        duplicates))
... ...
@@ -50,6 +46,15 @@ polynFilter(threshold=0L, nuc=c("A", "C", "T", "G", "other"),
50 46
 dustyFilter(threshold=Inf, batchSize=NA, .name="DustyFilter")
51 47
 srdistanceFilter(subject=character(0), threshold=0L,
52 48
                  .name="SRDistanceFilter")
49
+
50
+##
51
+## legacy filters for ungapped alignments
52
+##
53
+
54
+chromosomeFilter(regex=character(0), fixed=FALSE, exclude=FALSE,
55
+                 .name="ChromosomeFilter")
56
+positionFilter(min=-Inf, max=Inf, .name="PositionFilter")
57
+strandFilter(strandLevels=character(0), .name="StrandFilter")
53 58
 alignQualityFilter(threshold=0L, .name="AlignQualityFilter")
54 59
 alignDataFilter(expr=expression(), .name="AlignDataFilter")
55 60
 }
... ...
@@ -174,7 +179,7 @@ alignDataFilter(expr=expression(), .name="AlignDataFilter")
174 179
   treated: \code{TRUE} to include the sread, chromosome, strand, and
175 180
   position when determining occurrence, \code{FALSE} to include
176 181
   chromosome, strand, and position, and \code{NA} to include only
177
-  sread. The default is \code{withSread=TRUE}. \code{duplicates}
182
+  sread. The default is \code{withSread=NA}. \code{duplicates}
178 183
   determines how reads with more than \code{max} reads are
179 184
   treated. \code{head} selects the first \code{max} reads of each set of
180 185
   duplicates, \code{tail} the last \code{max} reads, and \code{sample} a
... ...
@@ -63,7 +63,7 @@ showMethods("tables")
63 63
 sp <- SolexaPath(system.file("extdata", package="ShortRead"))
64 64
 aln <- readAligned(sp)
65 65
 tables(sread(aln), n=6)
66
-xyplot(log10(nReads)~log10(nOccurrences),
66
+lattice::xyplot(log10(nReads)~log10(nOccurrences),
67 67
        tables(sread(aln))$distribution)
68 68
 }
69 69
 \keyword{manip}
... ...
@@ -1,394 +1,170 @@
1 1
 %\VignetteIndexEntry{An introduction to ShortRead}
2
-%\VignetteDepends{}
2
+%\VignetteDepends{BiocStyle}
3 3
 %\VignetteKeywords{Short read, I/0, quality assessment}
4 4
 %\VignettePackage{ShortRead}
5 5
 \documentclass[]{article}
6 6
 
7
-\usepackage{times}
8
-\usepackage{hyperref}
9
-
10
-\newcommand{\Rfunction}[1]{{\texttt{#1}}}
11
-\newcommand{\Robject}[1]{{\texttt{#1}}}
12
-\newcommand{\Rpackage}[1]{{\textit{#1}}}
13
-\newcommand{\Rmethod}[1]{{\texttt{#1}}}
14
-\newcommand{\Rfunarg}[1]{{\texttt{#1}}}
15
-\newcommand{\Rclass}[1]{{\textit{#1}}}
16
-\newcommand{\Rcode}[1]{{\texttt{#1}}}
17
-
18
-\newcommand{\software}[1]{\textsf{#1}}
19
-\newcommand{\R}{\software{R}}
20
-\newcommand{\ShortRead}{\Rpackage{ShortRead}}
7
+<<style, eval=TRUE, echo=FALSE, results=tex>>=
8
+BiocStyle::latex()
9
+@ 
21 10
 
22
-\newcommand{\ELAND}{\software{ELAND}}
23
-\newcommand{\MAQ}{\software{MAQ}}
24
-\newcommand{\Bowtie}{\software{Bowtie}}
11
+\newcommand{\ShortRead}{\Biocpkg{ShortRead}}
25 12
 
26 13
 \title{An Introduction to \Rpackage{ShortRead}}
27 14
 \author{Martin Morgan}
28
-\date{Modified: 28 September 2010. Compiled: \today}
15
+\date{Modified: 21 October, 2013. Compiled: \today}
29 16
 
30 17
 \begin{document}
31 18
 
32 19
 \maketitle
33 20
 
34
-<<options,echo=FALSE>>=
35
-options(width=60)
36
-@ 
37
-
38 21
 <<preliminaries>>=
39 22
 library("ShortRead")
40 23
 @ 
41 24
 
42
-The \Rpackage{ShortRead} package aims to provide key functionality for
43
-input, quality assurance, and basic manipulation of `short read' DNA
44
-sequences such as those produced by Solexa, 454, and related
45
-technologies, including flexible import of common short read data
46
-formats. This vignette introduces key functionality.
47
-
48
-Support is most fully developed for Solexa; contributions from the
49
-community are welcome.
50
-
51
-\section{A first workflow}
52
-
53
-This section walks through a simple work flow. It outlines the
54
-hierarchy of files produced by Solexa. It then illustrates a common
55
-way for reading short read data into \R{}.
56
-
57
-\subsection{\Rclass{SolexaPath}: navigating Solexa output}
58
-
59
-\Rclass{SolexaPath} provides functionality to navigate files produced
60
-by Solexa Genome Analyzer pipeline software. A typical way to start a
61
-\ShortRead{} session is to point to the root of the output file
62
-hierarchy. The \ShortRead{} package includes a very small subset of
63
-files emulating this hierarchy. The root is found at
64
-<<SolexaPath-root>>=
65
-exptPath <- system.file("extdata", package="ShortRead")
66
-@ 
67
-%% 
68
-Usually \Rcode{exptPath} would be a location on the users' file system. Key
69
-components of the hierarchy are parsed into \R{} with
70
-<<SolexaPat>>=
71
-sp <- SolexaPath(exptPath)
72
-sp
73
-@ 
74
-%% 
75
-\Rfunction{SolexaPath} scans the directory hierarchy to identifying
76
-useful directories. For instance, image intensity files are in the
77
-`Firecrest' directory, while summary and alignment files are in the
78
-analysis directory
79
-<<firecrest>>=
80
-imageAnalysisPath(sp)
81
-analysisPath(sp)
82
-@ 
83
-%% 
84
-Most functionality in \ShortRead{} uses \Rcode{baseCallPath} or
85
-\Rcode{analysisPath}. Solexa documentation provides details of file
86
-content. \Rfunction{SolexaPath} accepts additional arguments that
87
-allow individual file paths to be specified.
88
- 
89
-Many functions for Solexa data input `know' where appropriate files
90
-are located. Specifying \Rcode{sp} is often sufficient for identifying
91
-the desired directory path. Examples of this are illustrated below,
92
-with for instance \Rfunction{readAligned} and \Rfunction{readFastq}.
93
-
94
-Displaying an object, e.g., \Robject{sp}, provides hints at how to
95
-access information in the object, e.g., \Rfunction{analysisPath}. This
96
-is a convention in \ShortRead{}.
97
-
98
-\subsection{\Rfunction{readAligned}: reading aligned data into \R{}}
99
-
100
-Solexa \texttt{s\_N\_export.txt} files (\texttt{\_N\_} is a
101
-placeholder for the lane identifier) represent one place to start
102
-working the short read data in \R{}. These files result from running
103
-ANALYSIS eland\_extended in the Solexa Genome Analyzer. The files
104
-contain information on all reads, including alignment information for
105
-those reads successfully aligned to the genome.  \ShortRead{} parses
106
-additional alignment files, including \MAQ{} binary and text
107
-(\texttt{mapview}) files and \Bowtie{} text files; consult the help page
108
-for \Rfunction{readAligned} for details. \ShortRead{} flexibly parses
109
-many other Solexa files; aligned reads represent just one entry point.
110
-
111
-To read a single \texttt{s\_N\_export.txt} file into \R{}, for instance
112
-from lane 2, use the command
113
-<<readAligned-simple>>=
114
-aln <- readAligned(sp, "s_2_export.txt")
115
-aln
116
-@ 
117
-%% 
118
-This illustrates the convention used for identifying files for input
119
-into \R{} and used by \ShortRead{}. The function takes a directory
120
-path and a pattern (as a regular expression, similar to the \R{}
121
-function \Rfunction{list.files}) of file names to match in the
122
-directory. Usually, all files matching the pattern are read into a
123
-\emph{single} \R{} object; this behavior is desirable for several of
124
-the input functions in \ShortRead{}. In the present case the usual
125
-expectation is that a single \texttt{s\_N\_export.txt} file will be
126
-read into a single \R{} object, so the \Rfunarg{pattern} argument will
127
-identify a single file.
128
-
129
-\subsubsection{Input of other aligned read files}
130
-
131
-\ELAND{} software provides access to much interesting data, in
132
-addition to alignments, but if the interest is in aligned reads then
133
-input may come from any of a number of different software
134
-packages. Many of these alignments can be input with
135
-\Rpackage{ShortRead}.
136
-
137
-\Bowtie{} is a very fast aligner, taking a few tens of minutes to
138
-align entire lanes of reads to reference genomes. Use
139
-\Rfunction{readAligned} with the \Rfunarg{type="Bowtie"} argument to
140
-input alignments.  Reading \Bowtie{} output using
141
-\Rfunction{readAligned} produces the same class of object as reading
142
-\ELAND{} output.  Like \ELAND{}, \Bowtie{} provides information on
143
-short read, quality, chromosome, position, and strand; there is no
144
-information on alignment quality avaiable from \Bowtie{}. \ELAND{} and
145
-\Bowtie{} provide very different auxiliary information.  Consult the
146
-\Rfunction{readAligned} and help page \Bowtie{} manual for additional
147
-detail.
148
-
149
-\MAQ{} is another poplar aligner. \Rpackage{ShortRead} can input
150
-\MAQ{} binary or text formats (see the arguments
151
-\Rfunarg{type="MAQMapShort"}, \Rfunarg{"MAQMap"}, and
152
-\Rfunarg{"MAQMapview"}). As with \Bowtie{}, \MAQ{} provides essential
153
-information about reads and their aligments, plus additional
154
-information that differs somewhat from the additional information
155
-provided by \ELAND{}.
156
-
157
-Alignment information may come in a variety of different text-based
158
-formats. Not all of these will be supported by
159
-\Rpackage{ShortRead}. There are a number of tools available to input
160
-this into \R{}. 
161
-
162
-A basic strategy is involves two passes over the data, followed by
163
-synthesis of results into an \Rclass{AlignedRead} object.  First,
164
-input alignment data using functions such as
165
-\Rfunction{read.table}. Use the \Rfunarg{colClasses} argument to
166
-`mask-out' (i.e., avoid importing) DNA and quality sequences. Next,
167
-use \Rfunction{readXStringColumns} or \Rfunction{readFastq} to import
168
-the short read and quality information. Finally, use the alignment
169
-data and reads as arugments to the \Rfunction{AlignedRead} function to
170
-synthesize the input. The following illustrates use of
171
-\Rfunction{readXStringColumns} and \Rfunction{readFastq}. These
172
-functions receive further attention below.
173
-
174
-\subsubsection{Cautions}
175
-
176
-There are several confusing areas of input. (1) Some alignment
177
-programs and genome resources start numbering nucleotides of the
178
-subject sequence at 0, whereas others start at 1. (2) Some alignment
179
-programs report matches on the minus strand in terms of the
180
-`left-most' position of the read (i.e., the location of the 3' end of
181
-the aligned read), whereas other report `five-prime''matches (i.e., in
182
-terms of the 5' end of the read), regardless of whether the alignment
183
-is on the plus or minus strand. (3) Some alignment programs reverse
184
-complement the sequence of reads aligned to the minus strand. (4) Base
185
-qualities are sometimes encoded as character strings, but the encoding
186
-differs between `fastq' and `solexa fastq'. It seems that all
187
-combinations of these choices are common `in the wild'.
188
-
189
-The help page for \Rfunction{readAligned} attempts to be explicit
190
-about how reads are formatted. Briefly:
191
-\begin{itemize}
192
-\item Subject sequence nucleotides are numbered starting at 1, rather
193
-  than zero. \Rfunction{readAligned} adjusts the coordinate system of
194
-  input reads if necessary (e.g., reading \MAQ{} alignments).
195
-\item Alignments on the minus strand are reported in `left-most'
196
-  coordinates systems.
197
-\item \ELAND{} and \Bowtie{} alignments on the minus strand are not reverse
198
-  complemented.
199
-\item Character-encoded base quality scores are intrepreted as the
200
-  default for the software package being parsed, e.g., as `Solexa
201
-  fastq' for \ELAND{}. The object returned by \Rfunction{quality}
202
-  applied to an \Rclass{AlignedRead} object is either
203
-  \Rclass{FastqQuality} or \Rclass{SFastqQuality}.
204
-\end{itemize}
205
-Alignment programs sometimes offer the opportunity to custommize
206
-output; such customization needs to be accomodated when reads are
207
-input using \Rpackage{ShortRead}.
208
-
209
-\subsubsection{Filtering input}
210
-
211
-Downstream analysis may often want to use a well-defined subset of
212
-reads. These can be selected with the \Rfunarg{filter} argument of
213
-\Rfunction{readAligned}. There are built-in filters, for instance to
214
-remove all reads containing an \texttt{N} nucleotide, to select just
215
-those reads that map to the genome file \texttt{chr5.fa}, to select
216
-reads on the \texttt{+} strand, or to `level the playing field' by
217
-selecting only a single read for any chromosome, position and strand:
218
-<<filter-egs>>=
219
-nfilt <- nFilter()
220
-cfilt <- chromosomeFilter('chr5.fa')
221
-sfilt <- strandFilter("+")
222
-ofilt <- occurrenceFilter(withSread=FALSE)
25
+The \Rpackage{ShortRead} package provides functionality for working
26
+with FASTQ files from high throughput sequence analysis. The package
27
+also contains functions for legacy (single-end, ungapped) aligned
28
+reads; for working with BAM files, please see the \Biocpkg{Rsamtools},
29
+\Biocpkg{GenomicRanges}, and related packages.
30
+
31
+\section{Sample data}
32
+
33
+Sample FASTQ data are derived from ArrayExpress record
34
+\href{http://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-1147/}{E-MTAB-1147}.
35
+Paired-end FASTQ files were retrieved and then sampled to 20,000
36
+records with
37
+<<sample, eval=FALSE>>=
38
+sampler <- FastqSampler('E-MTAB-1147/fastq/ERR127302_1.fastq.gz', 20000)
39
+set.seed(123); ERR127302_1 <- yield(sampler)
40
+sampler <- FastqSampler('E-MTAB-1147/fastq/ERR127302_2.fastq.gz', 20000)
41
+set.seed(123); ERR127302_2 <- yield(sampler)
42
+@ 
43
+
44
+\section{Functionality}
45
+
46
+Functionality is summarized in Table~\ref{tab:fastq}. 
47
+
48
+\begin{table}
49
+  \centering
50
+  \begin{tabular}{lll}
51
+    \hline
52
+    \multicolumn{3}{l}{Input} \\
53
+    & \Rfunction{FastqStreamer} & Iterate through FASTQ files in chunks \\
54
+    & \Rfunction{FastqSampler} & Draw random samples from FASTQ files \\
55
+    & \Rfunction{readFastq} & Read an entire FASTQ file into memory \\
56
+    & \Rfunction{writeFastq} & Write FASTQ objects to a connection (file) \\
57
+    \multicolumn{3}{l}{Sequence and quality summary} \\
58
+    & \Rfunction{alphabetFrequency} & Nucleotide or quality score use per read\\
59
+    & \Rfunction{alphabetByCycle} & Nucleotide or quality score use by cycle\\
60
+    & \Rfunction{alphabetScore} & Whole-read quality summary\\
61
+    & \Rfunction{encoding} & Character / `phred' score mapping \\
62
+    \multicolumn{3}{l}{Quality assessment} \\
63
+    & \Rfunction{qa} & Visit FASTQ files to collect QA statistics \\
64
+    & \Rfunction{report} & Generate a quality assessment report \\
65
+    \multicolumn{3}{l}{Filtering and trimming} \\
66
+    & \Rfunction{srFilter} & Pre-defined and bespoke filters \\
67
+    & \Rfunction{trimTails}, etc. & Trim low-quality nucleotides \\
68
+    & \Rfunction{narrow} & Remove leading / trailing nucleotides \\
69
+    & \Rfunction{tables} & Summarize read occurrence \\
70
+    & \Rfunction{srduplicated}, etc. & Identify duplicate reads \\
71
+    & \Rfunction{filterFastq} & Filter reads from one file to another\\
72
+    \multicolumn{3}{l}{Parallel evaluation} \\
73
+    & \Rfunction{srapply} & \Rfunction{lapply}-like parallel evaluation \\
74
+    \hline
75
+  \end{tabular}
76
+  \caption{Key functions for working with FASTQ files}
77
+  \label{tab:fastq}
78
+\end{table}
79
+
80
+\paragraph{Input} FASTQ files are large so processing involves
81
+iteration in `chunks' using \Rfunction{FastqStreamer}
82
+<<stream, eval=FALSE>>=
83
+strm <- FastqStreamer("a.fastq.gz")
84
+repeat {
85
+    fq <- yield(strm)
86
+    if (length(fq) == 0)
87
+        break
88
+    ## process chunk
89
+}
223 90
 @
224
-%% 
225
-Here we select only those reads that map to \texttt{chr5.fa}:
226
-<<readAligned-filter>>=
227
-chr5 <- readAligned(sp, "s_2_export.txt", filter=cfilt)
228
-@ 
229
-%% 
230
-Filters can be `composed' to act in unison, e.g., selecting only reads
231
-mapping to \texttt{chr5.fa} and on the \texttt{+} strand:
232
-<<readAligned-compose-filter>>=
233
-filt <- compose(cfilt, sfilt)
234
-chr5plus <- readAligned(sp, "s_2_export.txt", filter=filt)
235
-@ 
236
-%% 
237
-Filters can subset aligned reads at other stages in the work flow,
238
-using a paradigm like the following:
239
-<<AlignedRead-filter>>=
240
-chr5 <- aln[cfilt(aln)]
241
-@ 
242
-%% 
243
-Users can easily create their own filter by writing a function that
244
-accepts an object of class \Rcode{AlignedRead}, and returns a logical
245
-vector indicating which reads in the object pass the filter. See the
246
-example on the \Rcode{srFilter} help page for details, and for
247
-information about additional built-in filters.
248
-
249
-\subsection{Exploring \ShortRead{} objects}
250
-
251
-\Robject{aln} is an object of \Rclass{\Sexpr{class(aln)}} class. It
252
-contains short reads and their (calibrated) qualities:
253
-<<aln-sread-quality>>=
254
-sread(aln)
255
-quality(aln)
256
-@ 
257
-
258
-The short reads are stored as a \Rclass{DNAStringSet} class.  This
259
-class is defined in \Rpackage{Biostrings}. It represents DNA sequence
260
-data relatively efficiently.  There are a number of very useful
261
-methods defined for \Rclass{DNAStringSet}. Some of these methods are
262
-illustrated in this vignette. Other methods are described in the help
263
-pages and vignettes of the \Rpackage{Biostrings} and
264
-\Rpackage{IRanges} packages.
265
-
266
-Qualities are represented as \Sexpr{class(quality(aln))}-class
267
-objects. The qualities in the \Robject{aln} object returned by
268
-\Rfunction{readAligned} are of class \Rclass{BStringSet}. The
269
-\Rclass{BStringSet} class is also defined in \Rpackage{Biostrings},
270
-and shares many methods with those of \Rclass{DNAStringSet}.
271
-
272
-The \Robject{aln} object contains additional information about
273
-alignments. Some of this additional information is expected from any
274
-alignment, whether generated by Solexa or other software.  For
275
-example, \Robject{aln} contains the particular sequence within a
276
-target (e.g., chromosomes in a genome assembly), the position (e.g.,
277
-base pair coordinate), and strand to which the alignment was made, and
278
-the quality of the alignment. The display of \Robject{aln} suggests
279
-how to access this information. For instance, the strand to which
280
-alignments are made can be extracted (as a factor with three levels
281
-and possibly \Rcode{NA}; the level \Rcode{"*"} corresponds to reads
282
-for which strand alignment is intrinsically not meaningful, whereas
283
-\Rcode{NA} represents the traditional concept of information not
284
-available, e.g., because the read did not align at all) and tabulated
285
-using familiar \R{} functions.
286
-<<chromosomes>>=
287
-whichStrand <- strand(aln)
288
-class(whichStrand)
289
-levels(whichStrand)
290
-table(whichStrand, useNA="ifany")
291
-@ 
292
-%% 
293
-This shows that about 
294
-%% 
295
-\Sexpr{format(100*sum(is.na(whichStrand))/ length(whichStrand))}\%{}
296
-%% 
297
-of reads were not aligned (level \Rcode{NA}).
298
-
299
-The \Robject{aln} object contains information in addition to that
300
-expected of all alignments. This information is accessible using
301
-\Rfunction{alignData}:
302
-<<alignData>>=
303
-alignData(aln)
304
-@ 
305
-%% 
306
-Users familiar with the \Rclass{ExpressionSet} class in
307
-\Rpackage{Biobase} will recognize this as an
308
-\Rclass{AnnotatedDataFrame}-like object, containing a data frame with
309
-rows for each short read. The \Rclass{AlignedDataFrame} contains
310
-additional meta data about the meaning of each column. For instance,
311
-data extracted from the Solexa export file includes:
312
-<<varMetadata>>=
313
-varMetadata(alignData(aln))
91
+or drawing a random sample from the file
92
+<<sampler, eval=FALSE>>=
93
+sampler <- FastqSampler("a.fastq.gz")
94
+fq <- yield(sampler)
95
+@ 
96
+\noindent The default size for both streams and samples is 1M records;
97
+this volume of data fits easily into memory. Small FASTQ files can be
98
+read in to memory in their entirety using \Rfunction{readFastq}; we do
99
+this for our sample data
100
+<<readFastq>>=
101
+fl <- system.file(package="ShortRead", "extdata", "E-MTAB-1147", 
102
+                  "ERR127302_1_subset.fastq.gz")
103
+fq <- readFastq(fl)
104
+@ 
105
+
106
+The result of data input is an instance of class \Rclass{ShortReadQ}
107
+(Table~\ref{tab:classes}). 
108
+\begin{table}
109
+  \centering
110
+  \begin{tabular}{ll}
111
+    \hline
112
+    \Rclass{DNAStringSet} & (\Biocpkg{Biostrings}) Short read sequences \\
113
+    \Rclass{FastqQuality}, etc. & Quality encodings \\
114
+    \Rclass{ShortReadQ} & Reads, quality scores, and ids \\
115
+    \hline
116
+  \end{tabular}
117
+  \caption{Primary data types in the \Biocpkg{ShortRead} package}
118
+  \label{tab:classes}
119
+\end{table}
120
+This class contains reads, their quality scores, and optionally the id
121
+of the read.
122
+<<ShortReadQ>>=
123
+fq
124
+fq[1:5]
125
+head(sread(fq), 3)
126
+head(quality(fq), 3)
314 127
 @ 
315
-%% 
316
-Guides to the precise meaning of this data are on the help page for
317
-the \Rclass{AlignedRead} class, and in the manufacturer manuals. 
318
-
319
-Simple information about the alignments can be found by querying this
320
-object. For instance, unaligned reads have \Rcode{NA} as their
321
-position, and reads passing Solexa `filtering' (their base purity and
322
-chastity criteria) have a component of their auxiliary
323
-\Robject{alignData} set to \Rcode{"Y"}. Thus the fraction of unaligned
324
-reads and reads passing filtering are
325
-<<aln-okreads>>=
326
-mapped <- !is.na(position(aln))
327
-filtered <- alignData(aln)[["filtering"]] =="Y"
328
-sum(!mapped) / length(aln)
329
-sum(filtered) / length(aln)
128
+\noindent The reads are represented as \Rclass{DNAStringSet}
129
+instances, and can be manipulated with the rich tools defined in the
130
+\Biocpkg{Biostrings} package. The quality scores are represented by a
131
+class that represents the quality encoding inferred from the file; the
132
+encoding in use can be discovered with
133
+<<encoding>>=
134
+encoding(quality(fq))
330 135
 @ 
136
+\noindent The primary source of documentation for these classes is
137
+\Rcode{?ShortReadQ} and \Rcode{?QualityScore}.
331 138
 
332
-Extracting the reads that passed filtering but were unmapped is
333
-accomplished with
334
-<<aln-failed>>=
335
-failedAlign <- aln[filtered & !mapped]
336
-failedAlign
337
-@ 
338
-%% 
339
-Alternatively, we can extract just the short reads, and select the
340
-subset of those that failed filtering.
341
-<<sread-filter-fail-subset>>=
342
-failedReads <- sread(aln)[filtered & !mapped]
343
-@ 
139
+\section{Common work flows}
344 140
 
345 141
 \subsection{Quality assessment}
346 142
 
347
-The \Rfunction{qa} function provides a convenient way to summarize
348
-read and alignment quality. One way of obtaining quality assessment
349
-results is
350
-<<qa>>=
351
-qaSummary <- qa(sp)
143
+FASTQ files are often used for basic quality assessment, often to
144
+augment the purely technical QA that might be provided by the
145
+sequencing center with QA relevant to overall experimental design. A QA report is generated by creating a vector of paths to FASTQ files
146
+<<qa-files, eval=FALSE>>=
147
+fls <- dir("/path/to", "*fastq$", full=TRUE)
352 148
 @ 
353
-%% 
354
-The \Robject{qa} object is a list-like structure. As invoked above and
355
-currently implemented, \Rfunction{qa} visits all
356
-\texttt{s\_N\_export.txt} files in the appropriate directory. It
357
-extracts useful information from the files, and summarizes the results
358
-into a nested list-like structure. 
359
-
360
-Evaluating \Rfunction{qa} for a single lane can take several
361
-minutes. For this reason a common use case is to evaluate
362
-\Rfunction{qa} and save the result to disk for later use, e.g.,
363
-<<eval=FALSE>>=
364
-save(qaSummary, file="/path/to/file.rda")
149
+\noindent collecting statistics over the files
150
+<<qa-qa, eval=FALSE>>=
151
+qaSummary <- qa(fls, type="fastq")
365 152
 @ 
366
-%% 
367
-A feature of \ShortRead{} is the use of \Rpackage{Rmpi} or
368
-\Rpackage{multicore} and coarse-grained parallel processing when
369
-available. Thus commands such as
370
-<<eval=FALSE>>=
371
-library("Rmpi")
372
-mpi.spawn.Rslaves(nsl=8)
373
-qaSummary <- qa(sp)
374
-mpi.close.Rslaves()
153
+\noindent and creating and viewing a report
154
+<<qa-view, eval=FALSE>>=
155
+browseURL(report(qaSummary))
375 156
 @ 
376
-%% 
377
-or
378
-<<eval=FALSE>>=
379
-library(multicore)
380
-qaSummary <- qa(sp)
157
+\noindent By default, the report is based on a sample of 1M
158
+reads. Parallel processing (across files) can be enabled by specifying
159
+\Rcode{options(srapply\_fapply)}, as described in \Rcode{?srapply}.
160
+
161
+These QA facilities are easily augmented by writing custom functions
162
+for reads sampled from files, or by explorting the elements of the
163
+object returned from \Rcode{qa()}, e.g., for an analysis of
164
+ArrayExpress experiment E-MTAB-1147:
165
+<<qa-files, echo=FALSE>>=
166
+load("qa_E-MTAB-1147.Rda")
381 167
 @ 
382
-%% 
383
-will distribute the task of processing each lane to each of the
384
-\Rpackage{Rmpi} workers or \Rpackage{multicore} cores. In the
385
-\Rpackage{Rmpi} example, all 8 lanes of a Solexa experiment are
386
-processed in the time take to process a single lane.
387
-\Rpackage{multicore} may impose significant memory demands, as each
388
-core will attempt to load a full lane of data.
389
-
390
-The elements of the quality assessment list are suggested by the
391
-output:
392 168
 <<qa-elements>>=
393 169
 qaSummary
394 170
 @ 
... ...
@@ -396,8 +172,8 @@ qaSummary
396 172
 For instance, the count of reads in each lane is summarized in the
397 173
 \Robject{readCounts} element, and can be displayed as
398 174
 <<qa-readCounts>>=
399
-qaSummary[["readCounts"]]
400
-qaSummary[["baseCalls"]]
175
+head(qaSummary[["readCounts"]])
176
+head(qaSummary[["baseCalls"]])
401 177
 @ 
402 178
 %% 
403 179
 The \Robject{readCounts} element contains a data frame with 1 row and
... ...
@@ -409,17 +185,6 @@ Solexa filtering criteria, and the number of reads aligned to the
409 185
 reference genome for the lane. The \Robject{baseCalls} element
410 186
 summarizes base calls in the unfiltered reads.
411 187
 
412
-Other elements of \Robject{qaSummary} are more complicated, and their
413
-interpretation is not directly obvious. Instead, a common use is to
414
-forward the results of \Rfunction{qa} to a report generator. 
415
-<<report, eval=FALSE>>=
416
-report(qaSummary, dest="/path/to/report_directory")
417
-@ 
418
-%%
419
-The report includes \R{} code that can be used to understand how
420
-\Sexpr{class(qaSummary)}-class objects can be processed; reports are
421
-generated as HTML suitable for browser viewing.
422
-
423 188
 The functions that produce the report tables and graphics are 
424 189
 internal to the package. They can be accessed through calling 
425 190
 ShortRead:::functionName where functionName is one of the functions
... ...
@@ -436,6 +201,47 @@ listed below, organized by report section.
436 201
 \item [] Adapter Contamination : .ppnCount
437 202
 \end{description}
438 203
 
204
+\subsection{Filtering and trimming}
205
+
206
+It is straight-forward to create filters to eliminate reads or to trim
207
+reads based on diverse characteristics. The basic structure is to open
208
+a FASTQ file, iterate through chunks of the file performing filtering
209
+or trimming steps, and appending the filtered data to a new file.
210
+<<filter-scheme>>=
211
+myFilterAndTrim <- 
212
+    function(fl, destination=sprintf("%s_subset", fl))
213
+{
214
+    ## open input stream
215
+    stream <- open(FastqStreamer(fl))
216
+    on.exit(close(stream))
217
+    
218
+    repeat {
219
+        ## input chunk
220
+        fq <- yield(stream)
221
+        if (length(fq) == 0)
222
+            break
223
+        
224
+        ## trim and filter, e.g., reads cannot contain 'N'...
225
+        fq <- fq[nFilter()(fq)]  # see ?srFilter for pre-defined filters
226
+        ## trim as soon as 2 of 5 nucleotides has quality encoding less 
227
+        ## than "4" (phred score 20)
228
+        fq <- trimTailw(fq, 2, "4", 2)
229
+        ## drop reads that are less than 36nt
230
+        fq <- fq[width(fq) < 36]
231
+        
232
+        ## append to destination
233
+        writeFastq(fq, destination, "a")
234
+    }
235
+}
236
+@ 
237
+\noindent This is memory efficient and flexible. Care must be taken to
238
+coordinate pairs of FASTQ files representing paired-end reads, to
239
+preserve order. Processing several files in parallel might use
240
+<<filter-several, eval=FALSE>>=
241
+fls <- dir("/path/to", "fastq$", full=TRUE)
242
+srapply(fls, myFilterAndTrim)
243
+@ 
244
+
439 245
 \section{Using \Rpackage{ShortRead} for data exploration}
440 246
 
441 247
 \subsection{Data I/O}
... ...
@@ -450,6 +256,9 @@ sequence-like data. For instance, the Solexa files
450 256
 \texttt{s\_N\_export.txt} contain lines with the following
451 257
 information:
452 258
 <<export>>=
259
+## location of file
260
+exptPath <- system.file("extdata", package="ShortRead")
261
+sp <- SolexaPath(exptPath)
453 262
 pattern <- "s_2_export.txt"
454 263
 fl <- file.path(analysisPath(sp), pattern)
455 264
 strsplit(readLines(fl, n=1), "\t")
... ...
@@ -470,10 +279,10 @@ package. \Rfunction{readXStringColumns} allows us to read in columns
470 279
 of text as these classes.
471 280
 
472 281
 Important arguments for \Rfunction{readXStringColumns} are the
473
-\Rfunarg{dirPath} in which to look for files, the \Rfunarg{pattern} of
474
-files to parse, and the \Rfunarg{colClasses} of the columns to be
475
-parsed. The \Rfunarg{dirPath} and \Rfunarg{pattern} arguments are like
476
-\Rfunarg{list.files}. \Rfunarg{colClasses} is like the corresponding
282
+\Rcode{dirPath} in which to look for files, the \Rcode{pattern} of
283
+files to parse, and the \Rcode{colClasses} of the columns to be
284
+parsed. The \Rcode{dirPath} and \Rcode{pattern} arguments are like
285
+\Rcode{list.files}. \Rcode{colClasses} is like the corresponding
477 286
 argument to \Rfunction{read.table}: it is a \Rclass{list} specifying
478 287
 the class of each column to be read, or \Robject{NULL} if the column
479 288
 is to be ignored. In our case there are 21 columns, and we would like
... ...
@@ -498,7 +307,7 @@ The file has been parsed, and appropriate data objects were created.
498 307
 
499 308
 A feature of \Rfunction{readXStringColumns} and other input functions
500 309
 in the \Rpackage{ShortRead} package is that all files matching
501
-\Rfunarg{pattern} in the specified \Rfunarg{dirPath} will be read into
310
+\Rcode{pattern} in the specified \Rcode{dirPath} will be read into
502 311
 a single object. This provides a convenient way to, for instance,
503 312
 parse all tiles in a Solexa lane into a single \Rclass{DNAStringSet}
504 313
 object.
... ...
@@ -516,75 +325,6 @@ related classes are used extensively in \Rpackage{ShortRead},
516 325
 \Rpackage{Biostrings}, \Rpackage{BSgenome} and other packages relevant
517 326
 to short read technology.
518 327
 
519
-\subsubsection{\Rfunction{readFastq}}
520
-
521
-\Rfunction{readXStringColumns} should be considered a `low-level'
522
-function providing easy access to columns of data. Another flexible
523
-input function is \Rfunction{readFastq}. Fastq files combine reads and
524
-their base qualities in four-line records such as the following:
525
-<<fastq-format>>=
526
-fqpattern <- "s_1_sequence.txt"
527
-fl <- file.path(analysisPath(sp), fqpattern)
528
-readLines(fl, 4)
529
-@ 
530
-% 
531
-The first and third lines are an identifier (encoding the machine, run,
532
-lane, tile, x and y coordinates of the cluster that gave rise to the
533
-read, in this case). The second line is the read, and the fourth line
534
-the per-base quality. Files of this sort can be read in as
535
-<<readFastq>>=
536
-fq <- readFastq(sp, fqpattern)
537
-fq
538
-@ 
539
-% 
540
-This resulting object (of class \Sexpr{class(fq)}) contains the short
541
-reads, their qualities, and the identifiers:
542
-<<ShortReadQ>>=
543
-reads <- sread(fq)
544
-qualities <- quality(fq)
545
-class(qualities)
546
-id(fq)
547
-@ 
548
-% 
549
-Notice that the class of the qualities is \Rclass{SFastqQuality}, to
550
-indicate that these are quality scores derived using the Solexa
551
-convention, rather than ordinary \Rclass{BStringSet} objects.
552
-
553
-The object has essential operations for convenient manipulation, for
554
-instance simultaneously forming the subset of all three components:
555
-<<ShortReadQ-subset>>=
556
-fq[1:5]
557
-@ 
558
-
559
-\subsubsection{Additional input functions}
560
-
561
-\ShortRead{} includes additional functions to facilitate input. For
562
-instance, \Rfunction{readPrb} reads Solexa \texttt{\_prb.txt}
563
-files. These files contain base-specific quality information, and
564
-\Rfunction{readPrb} returns an \Rclass{SFastqQuality}-class object
565
-representing the fastq-encoded base-specific quality scores of all
566
-reads.
567
-
568
-As a second example, the \texttt{s\_N\_LLLL\_int.txt} files in the
569
-\Rfunction{imageAnalysisPath} directory contain lines, one line per
570
-read, of nucleotide intensities. Each line contain lane, tile, X and Y
571
-coordinate information, followed by quadruplets of intensity values.
572
-There are as many quadruplets as there are cycles. Each quadruplet
573
-represents the intensity of the \texttt{A}, \texttt{C}, \texttt{G},
574
-and \texttt{T} nucleotide at the corresponding cycle. These (and their
575
-error estimates, if available), are input with
576
-<<intensity-files>>=
577
-int <- readIntensities(sp, withVariability=FALSE)
578
-int
579
-@ 
580
-%% 
581
-An interesting exercise is to display the intensities at cycle 2
582
-(below) and to compare these to cycle, e.g., 30.
583
-<<intensities-cycle-2, fig=TRUE>>=
584
-print(splom(intensity(int)[[,,2]], pch=".", cex=3))
585
-@ 
586
- 
587
-Additional files can be parsed using standard \R{} input methods. 
588 328
 
589 329
 \subsection{Sorting}
590 330
 
... ...
@@ -609,7 +349,7 @@ prior to PCR.
609 349
 
610 350
 The \Rfunction{tables} function summarizes read occurrences, for instance,
611 351
 <<tables>>=
612
-tbls <- tables(aln)
352
+tbls <- tables(fq)
613 353
 names(tbls)
614 354
 tbls$top[1:5]
615 355
 head(tbls$distribution)
... ...
@@ -621,8 +361,9 @@ reads. Knowledgeable readers will recognize the top-occurring read as a
621 361
 close match to one of the manufacturer adapters.
622 362
 
623 363
 The \Robject{distribution} component returned by \Robject{tables} is a
624
-data frame that summarizes how many reads (e.g., \Sexpr{tbls[["distribution"]][1,"nReads"]}) 
625
-are represented exactly \Sexpr{tbls[["distribution"]][1,"nOccurrences"]} times.
364
+data frame that summarizes how many reads (e.g.,
365
+\Sexpr{tbls[["distribution"]][1,"nReads"]}) are represented exactly
366
+\Sexpr{tbls[["distribution"]][1,"nOccurrences"]} times.
626 367
 
627 368
 \subsection{Finding near matches to short sequences}
628 369
 
... ...
@@ -635,13 +376,13 @@ is of the same order as the reads themselves (10's to 100's of bases).
635 376
 To find reads close to the most common read in the example above, one
636 377
 might say
637 378
 <<srdistance>>=
638
-dist <- srdistance(sread(aln), names(tbls$top)[1])[[1]]
379
+dist <- srdistance(sread(fq), names(tbls$top)[1])[[1]]
639 380
 table(dist)[1:10]
640 381
 @ 
641 382
 %% 
642
-`Near' matches can be filtered from the alignment, e.g.,
383
+`Near' matches can be filtered, e.g.,
643 384
 <<aln-not-near>>=
644
-alnSubset <- aln[dist>4]
385
+fqSubset <- fq[dist>4]
645 386
 @
646 387
 
647 388
 A different strategy can be used to tally or eliminate reads that consist
... ...
@@ -650,8 +391,8 @@ calculates the frequency of each nucleotide (in DNA strings) or letter
650 391
 (for other string sets) in each read. Thus one could identify and
651 392
 eliminate reads with more than 30 adenine nucleotides with
652 393
 <<polya>>=
653
-countA <- alphabetFrequency(sread(aln))[,"A"] 
654
-alnNoPolyA <- aln[countA < 30]
394
+countA <- alphabetFrequency(sread(fq))[,"A"] 
395
+fqNoPolyA <- fq[countA < 30]
655 396
 @ 
656 397
 %% 
657 398
 \Rfunction{alphabetFrequency}, which simply counts nucleotides, is
... ...
@@ -663,174 +404,20 @@ pairwise aligment are encouraged to investigate the
663 404
 \Rpackage{Biostrings} package, especially the \Rclass{PDict} class and
664 405
 \Rfunction{matchPDict} and \Rfunction{pairwiseAlignment} functions.
665 406
 
666
-\subsection{The \Rfunction{coverage} function}
667
-
668
-The \Rfunction{coverage} function provides a way to summarize where
669
-reads align on a reference sequence. The idea is that the aligned
670
-reads, or under some analyses the extension of those aligned reads by
671
-an amount meant to estimate the actual fragment size, `pile up' on top
672
-of nucleotide positions in the reference sequence. A convenient
673
-summary of the alignment of many reads is thus a vector describing the
674
-depth of the pile at each position in the reference sequence. A
675
-typical work flow invokes \Rfunction{coverage} on an instance of the
676
-\Rclass{AlignedRead} class obtained from \Rfunction{readAligned};
677
-additional methods offering greater control operate on
678
-\Rclass{IRanges} directly.  The \Rfunction{coverage} methods returns a
679
-run-length encoding of the pile-up (or a list of such run length
680
-encodings). The run-length encoding returned by \Rfunction{coverage}
681
-is a space-efficient representation; the long integer vector can be
682
-recovered with \Rcode{as.integer}.
683
-
684
-There are complicated issues associated with use of
685
-\Rfunction{coverage}, relating to how software reports the `position'
686
-of an alignment, especially on the minus strand. These issues are
687
-illustrated in figure~\ref{fig:coverage}.
688
-\begin{figure}
689
-\begin{verbatim}
690
-'leftmost':
691
-                             P
692
-                             +++++----------
693
-'+' strand: 5' ....|....|....|....|....|....|.. 3'
694
-'-' strand: 3' ....|....|....|....|....|....|.. 5'
695
-                   ----------+++++
696
-                             P
697
-
698
-'fiveprime':
699
-                             P
700
-                             +++++----------
701
-'+' strand: 5' ....|....|....|....|....|....|.. 3'
702
-'-' strand: 3' ....|....|....|....|....|....|.. 5'
703
-                   ----------+++++
704
-                                 P
705
-\end{verbatim}
706
-  \caption{Alignment schemes used by
707
-    \Rfunction{coverage}. \texttt{+++} represents the read and
708
-    \texttt{---} the extension. \texttt{P} is the alignment position
709
-    as recorded under the corresponding leftmost or fiveprime
710
-    schemes.}
711
-  \label{fig:coverage}
712
-\end{figure}
713
-In the figure, the two strands are represented by \verb"....|",
714
-aligned reads by \verb"+++", and extensions by \verb"---".  The idea
715
-is that 5-nucleotide reads have been aligned to a reference sequence,
716
-and the alignment extended by 10 nucleotides. In the `leftmost'
717
-notation (used by \ELAND{}) and assuming that the reference sequence
718
-is always numbered in relation to the plus strand and indexed starting
719
-at 1 (\Rfunction{readAligned} translates reported alignment positions
720
-so they are indexed from 1), the reported position is 15 for the
721
-alignments on either the plus or the minus strand. In contrast the
722
-`fiveprime' scheme the alignment to the plus strand is 15, and to the
723
-minus strand 19. This is the scheme used by \MAQ{}, for instance.
724
-
725
-The default behavior of \Rfunction{coverage} is to use the `leftmost'
726
-coordinate system. This is appropriate for data derived from \ELAND{}.
727
-
728
-\section{Advanced features}
729
-
730
-\subsection{The \Rfunarg{pattern} argument to input functions}
731
-
732
-Most \ShortRead{} input functions are designed to accept a directory
733
-path argument, and a \Rfunarg{pattern} argument. The latter is a
734
-grep-like pattern (as used by, e.g., \Rfunction{list.files}). Many
735
-input functions are implemented so that all files matching the pattern
736
-are read into a single large input object. Thus the
737
-\texttt{s\_N\_LLLL\_seq.txt} files consist of four numeric columns and a
738
-fifth column corresponding to the short read. The following code
739
-illustrates the file structure and inputs the final column into a
740
-\Rclass{DNAStringSet}:
741
-<<readSeq>>=
742
-seqFls <- list.files(baseCallPath(sp), "_seq.txt", full=TRUE)
743
-strsplit(readLines(seqFls[[1]], 1), "\t")
744
-colClasses <- c(rep(list(NULL), 4), "DNAString")
745
-reads <- readXStringColumns(baseCallPath(sp), "s_1_0001_seq.txt",
746
-                            colClasses=colClasses)
747
-@ 
748
-%% 
749
-The more general pattern
750
-<<readSeq-all>>=
751
-reads <- readXStringColumns(baseCallPath(sp), "s_1_.*_seq.txt",
752
-                            colClasses=colClasses)
753
-@ 
754
-%% 
755
-inputs all lane 1 tile files into a single \Rclass{DNAStringSet} object.
756
-
757
-\subsection{\Rfunction{srapply}}
758
-
759
-Solexa and other short read technologies often include many files,
760
-e.g., one \texttt{s\_L\_NNNN\_int.txt} file per tile, 300 tiles per
761
-lane, 8 lanes per flow cell for 2400 \texttt{s\_L\_NNNN\_int.txt} files
762
-per flow cell. A natural way to extract information from these is to
763
-write short functions, e.g., to find the average intensity per base at
764
-cycle 12.
765
-<<calcInt-demo>>=
766
-calcInt <- function(file, cycle, verbose=FALSE)
767
-{
768
-    if (verbose)
769
-        cat("calcInt", file, cycle, "\n")
770
-    int <- readIntensities(dirname(file), basename(file),
771
-                           intExtension="", withVariability=FALSE)
772
-    apply(intensity(int)[,,12], 2, mean)
773
-}
774
-@ 
775
-One way to apply this function to all intensity files in a Solexa run
776
-is
777
-<<calcInt-sapply>>=
778
-intFls <- list.files(imageAnalysisPath(sp), ".*_int.txt$", full=TRUE)
779
-lres <- lapply(intFls, calcInt, cycle=12)
780
-@ 
781
-%% 
782
-The files are generally large and numerous, so even simple
783
-calculations consume significant computational resources. The
784
-\Rfunction{srapply} function is meant to provide a transparent way to
785
-perform calculations like this distributed over multiple nodes of an
786
-MPI cluster, or across multiple cores of a single machine. Thus
787
-<<srapply-simple>>=
788
-srres <- srapply(intFls, calcInt, cycle=12)
789
-identical(lres, srres)
790
-@ 
791
-%% 
792
-evaluates the function as \Rfunction{lapply}, whereas
793
-<<srapply-mpi, eval=FALSE>>=
794
-library("Rmpi")
795
-mpi.spawn.Rslaves(nsl=16)
796
-srres <- srapply(intFls, calcInt, cycle=12)
797
-mpi.close.Rslaves()
798
-@ 
799
-%% 
800
-distributes the calculation over available workers, while 
801
-<<srapply-multicore, eval=FALSE>>=
802
-library(multicore)
803
-srres <- srapply(intFls, calcInt, cycle=12)
804
-@ 
805
-%% 
806
-distributes tasks across cores of a single machine. The result is a
807
-speedup approximately inversely proportional to the number of
808
-available compute nodes or cores; memory requirements for
809
-the \Rpackage{multicore} approach may be substantial.
810
-
811
-\section{Conclusions and directions for development}
812
-
813
-\ShortRead{} provides tools for reading, manipulation, and quality
814
-assessment of short read data. Current facilities in \ShortRead{}
815
-emphasize processing of single-end Solexa data. 
407
+\section{Legacy support for early file formats}
816 408
 
817
-Development priorities in the near term include expanded facilities
818
-for importing key file types from additional manufactures, more
819
-extensive quality assessment methodologies, and development of
820
-infrastructure for paired-end reads.
409
+The \Biocpkg{ShortRead} package contains functions and classes to
410
+support early file formats and ungapped alignments. Help pages are
411
+flagged as `legacy'; versions of \Biocpkg{ShortRead} prior to 1.21
412
+(\Bioconductor{} version 2.13) contain a vignette illustrating common
413
+work flows with these file formats.
821 414
 
822 415
 %---------------------------------------------------------
823 416
 % SessionInfo
824 417
 %---------------------------------------------------------
825
-\begin{table*}[tbp]
826
-\begin{minipage}{\textwidth}
418
+\section{sessionInfo}
827 419
 <<sessionInfo, results=tex, print=TRUE>>=
828 420
 toLatex(sessionInfo())
829 421
 @ 
830
-\end{minipage}
831
-\caption{\label{tab:sessioninfo}%
832
-The output of \Rfunction{sessionInfo} on the build system 
833
-after running this vignette.}
834
-\end{table*}
835 422
 
836 423
 \end{document}
837 424
new file mode 100644
838 425
Binary files /dev/null and b/vignettes/qa_E-MTAB-1147.Rda differ