Browse code

removed Hsapiens GmapGenome from gmapR vignette, dropped pasillaBamSubset from 'Suggests'

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@69287 bc3139a8-67e5-0310-9ffc-ced21a209358

Cory Barr authored on 10/09/2012 20:58:54
Showing 2 changed files

... ...
@@ -14,7 +14,7 @@ Version: 0.99.14
14 14
 Depends: R (>= 2.15.0), methods, GenomicRanges
15 15
 Imports: IRanges, Rsamtools (>= 1.7.4), rtracklayer (>= 1.17.15), GenomicRanges,
16 16
          GenomicFeatures, Biostrings, VariantAnnotation, tools, Biobase
17
-Suggests: RUnit, BSgenome.Dmelanogaster.UCSC.dm3, pasillaBamSubset, 
17
+Suggests: RUnit, BSgenome.Dmelanogaster.UCSC.dm3,
18 18
 	  BSgenome.Scerevisiae.UCSC.sacCer3, VariantAnnotation,
19 19
           org.Hs.eg.db, TxDb.Hsapiens.UCSC.hg19.knownGene,
20 20
           BSgenome.Hsapiens.UCSC.hg19
... ...
@@ -130,7 +130,7 @@ if (!suppressWarnings(require(BSgenome.Dmelanogaster.UCSC.dm3))) {
130 130
   library(BSgenome.Dmelanogaster.UCSC.dm3)
131 131
 }
132 132
 
133
-gmapGenomePath <- file.path(getwd(), "testGenome")
133
+gmapGenomePath <- file.path(getwd(), "flyGenome")
134 134
 gmapGenomeDirectory <- GmapGenomeDirectory(gmapGenomePath, create = TRUE)
135 135
 ##> gmapGenomeDirectory
136 136
 ##GmapGenomeDirectory object
... ...
@@ -151,41 +151,51 @@ gmapGenome <- GmapGenome(genome=Dmelanogaster,
151 151
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
152 152
 
153 153
 The \software{GSNAP} algorithm incorporates biological knowledge to
154
-provide accurate alignments, particularly for RNA-seq data. The
155
-following example synthesizes some input reads and aligns them using
156
-\software{GSNAP}.
157
-
158
-If you have created the D. melanogaster \Rclass{GmapGenome} package as
159
-detailed above and installed it, you can begin this section as
160
-follows:
154
+provide accurate alignments, particularly for RNA-seq data. In this
155
+section, we will align reads from an RNA-seq experiment provided in a
156
+fastq file to a selected region of the human genome. In this example,
157
+we will align to the region of the human genome containing TP53 plus
158
+an additional one megabase on each side of this gene.
159
+
160
+First we need to obtain the desired region of interest from the genome:
161
+
162
+<<get_TP53_coordinates>>=
163
+library("org.Hs.eg.db")
164
+library("TxDb.Hsapiens.UCSC.hg19.knownGene")
165
+eg <- org.Hs.eg.db::org.Hs.egSYMBOL2EG[["TP53"]]
166
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
167
+tx <- transcripts(txdb, vals = list(gene_id = eg))
168
+roi <- range(tx) + 1e6
169
+@ 
161 170
 
162
-<<<load_GmapGenome, eval=FALSE>>=
163
-library("GmapGenome.Dmelanogaster.UCSC.dm3")
164
-gmapGenome <- GmapGenome.Dmelanogaster.UCSC.dm3
171
+Next we get the genetic sequence and use it to create a GmapGenome
172
+object. (Please note that the TP53_demo GmapGenome object is used by
173
+many examples and tests in the gmapR and VariationTools packages. If
174
+the object has been created before, its subsequent creation will be
175
+instantaneous.)
176
+
177
+<<get_TP53_seq>>=
178
+library("BSgenome.Hsapiens.UCSC.hg19")
179
+p53Seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, roi,
180
+                 as.character = FALSE) 
181
+names(p53Seq) <- "TP53"
182
+gmapGenome <- GmapGenome(genome = p53Seq, name = "TP53_demo", create = TRUE)
165 183
 @ 
166 184
 
167
-Otherwise, you can create the \Rcode{gmapGenome} object as detailed in the
168
-initial section of this vignette.
185
+The data package \software{LungCancerLines} contains fastqs of reads
186
+obtained from sequencing H1993 and H2073 cell lines. We will use these
187
+fastqs to demonstrate \software{GSNAP} alignment with \software{gmapR}.
169 188
 
170
-<<align_with_gsnap, eval=FALSE>>=
171
-##synthesize paired-end FASTA input to align against the genome
172
-library("BSgenome.Dmelanogaster.UCSC.dm3")
173
-dnaStringSet <- Dmelanogaster$chr4
174
-makeTestFasta <- function(seqStarts, pairNum) {
175
-  seqEnds <- seqStarts + 74
176
-  nucleotides <- sapply(seq_along(seqStarts),
177
-                        function(i) as.character(subseq(dnaStringSet, seqStarts[i], seqEnds[i]))
178
-                        )
179
-  fastaHeaders <- paste0(">", "fakeRead", seq_along(nucleotides), "#0/", pairNum)
180
-  fastaLines <- paste(fastaHeaders, nucleotides, sep="\n")
181
-  tmpFasta <- paste0(tempfile(), ".", pairNum, ".fa")
182
-  writeLines(fastaLines, con=tmpFasta)
183
-  tmpFasta
184
-}
185
-seqStarts <- seq(1, 100000, by = 10000) + length(dnaStringSet) * 2 / 3
186
-tmpFasta1 <- makeTestFasta(seqStarts, '1')
187
-tmpFasta2 <- makeTestFasta(seqStarts + 125, '2')
189
+<<get_lung_cancer_fastqs>>=
190
+library("LungCancerLines")
191
+fastqs <- LungCancerFastqFiles()
192
+@ 
188 193
 
194
+\software{GSNAP} is highly configurable. Users create
195
+\Rclass{GsnapParam} objects to specify desired \software{GSNAP}
196
+behavior. 
197
+
198
+<<create_gsnapParam>>=
189 199
 ##specify how GSNAP should behave using a GsnapParam object
190 200
 gsnapParam <- GsnapParam(genome = gmapGenome,
191 201
                          unique_only = FALSE,
... ...
@@ -194,10 +204,15 @@ gsnapParam <- GsnapParam(genome = gmapGenome,
194 204
                          npaths = 10,
195 205
                          novelsplicing = FALSE, splicing = NULL, 
196 206
                          nthreads = 1,
197
-                         batch = 2L)
207
+                         batch = 2L,
208
+                         gunzip=TRUE)
209
+@ 
198 210
 
199
-gsnapOutput <- gsnap(input_a=tmpFasta1,
200
-                     input_b=tmpFasta2,
211
+Now we are ready to align.
212
+
213
+<<align_with_gsnap, eval=FALSE>>=
214
+gsnapOutput <- gsnap(input_a=fastqs["H1993.first"],
215
+                     input_b=fastqs["H1993.last"],
201 216
                      params=gsnapParam)
202 217
 ##gsnapOutput
203 218
 ##An object of class "GsnapOutput"
... ...
@@ -244,29 +259,41 @@ alignments that \software{GSNAP} outputs along with other utilities.
244 259
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
245 260
 
246 261
 Running the \Rmethod{bam\_tally} method will return a \Rclass{GRanges}
247
-of information per nucleotide. Below is a basic example. See the
248
-documentation for the \Rmethod{bam\_tally} method for details.
262
+of information per nucleotide. Below is an example demonstrating how
263
+to find variants in the TP53 gene of the human genome. See the
264
+documentation for the \Rmethod{bam\_tally} method for more details.
265
+
266
+\software{gmapR} provides access to a demo genome for examples. This
267
+genome encompasses the TP53 gene along with a 1-megabase flanking
268
+region on each side.
269
+<<TP53Genome_accessor>>=
270
+genome <- TP53Genome()
271
+@ 
272
+
273
+The \software{LungCancerLines} R package contains a BAM file of reads
274
+aligned to the TP53 region of the human genome. We'll use this file to
275
+demonstrate the use of \software{bam_tally} through the
276
+\software{gmapR} package. The resulting data structure will contain
277
+the needed information such as number of alternative alleles, quality
278
+scores, and nucleotide cycle for each allele.
249 279
 
250 280
 <<run_bamtally, eval=FALSE>>=
251 281
 library(gmapR)
252
-library(GmapGenome.Hsapiens.UCSC.hg19)
253
-genome <- GmapGenome.Hsapiens.UCSC.hg19
254 282
 
255
-bam_file <- file.path(system.file(package = "gmapR", mustWork=TRUE),
256
-                      "extdata/bam_tally_test.bam")
283
+bam_file <- system.file("extdata/H1993.analyzed.bam", 
284
+                        package="LungCancerLines", mustWork=TRUE)
257 285
 
258 286
 breaks <- c(0L, 15L, 60L, 75L)
259 287
 bqual <- 56L
260 288
 mapq <- 13L
261 289
 param <- BamTallyParam(genome, cycle_breaks = breaks,
262
-                       high_quality_cutoff = bqual,
290
+                       high_base_quality = bqual,
263 291
                        minimum_mapq = mapq,
264 292
                        concordant_only = FALSE, unique_only = FALSE,
265 293
                        primary_only = FALSE,
266 294
                        min_depth = 0L, variant_strand = 1L,
267 295
                        ignore_query_Ns = TRUE,
268 296
                        indels = FALSE)
269
-
270 297
 tallies <-bam_tally(bam_file,
271 298
                     param)
272 299
 @ 
... ...
@@ -330,26 +357,28 @@ repository, etc. After installation,
330 357
 \Rcode{library("GmapGenome.Hsapiens.UCSC.hg19")} will load a
331 358
 \Rclass{GmapGenome} object that has the same name as the package.
332 359
 
333
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
334
-\section{Aligning with SNP Tolerance}
335
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
336 360
 
337
-Both \software{GSNAP} and \software{GMAP} have the ability to perform
338
-alignment without penalizing against a set of known SNPs. In this
339
-section, we work through an example of aligning with this SNP
340
-tolerance. By using SNP-tolerance, we'll perform alignments without
341
-penalizing reads that match SNPs contained in the provided SNPs. In
342
-this case, we'll use dbSNP to provide the SNPs.
343 361
 
344
-This example will use hg19. If you have not installed
345
-\Rcode{GmapGenome.Hsapiens.UCSC.hg19}, please refer to the section
346
-\ref{create_hg19_GmapGenome}.
347
-
348
-First we load the Hsapiens \Rclass{GmapGenome} object:
362
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
363
+%\section{Aligning with SNP Tolerance}
364
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
349 365
 
350
-<<<load_GmapGenomeHsapiens>>=
351
-library("GmapGenome.Hsapiens.UCSC.hg19")
352
-@ 
366
+%Both \software{GSNAP} and \software{GMAP} have the ability to perform
367
+%alignment without penalizing against a set of known SNPs. In this
368
+%section, we work through an example of aligning with this SNP
369
+%tolerance. By using SNP-tolerance, we'll perform alignments without
370
+%penalizing reads that match SNPs contained in the provided SNPs. In
371
+%this case, we'll use dbSNP to provide the SNPs.
372
+%
373
+%This example will use hg19. If you have not installed
374
+%\Rcode{GmapGenome.Hsapiens.UCSC.hg19}, please refer to the section
375
+%\ref{create_hg19_GmapGenome}.
376
+%
377
+%First we load the Hsapiens \Rclass{GmapGenome} object:
378
+%
379
+%<<<load_GmapGenomeHsapiens>>=
380
+%library("GmapGenome.Hsapiens.UCSC.hg19")
381
+%@ 
353 382
 
354 383
 <<SessionInfo>>=
355 384
 sessionInfo()