...
|
...
|
@@ -130,7 +130,7 @@ if (!suppressWarnings(require(BSgenome.Dmelanogaster.UCSC.dm3))) {
|
130
|
130
|
library(BSgenome.Dmelanogaster.UCSC.dm3)
|
131
|
131
|
}
|
132
|
132
|
|
133
|
|
-gmapGenomePath <- file.path(getwd(), "testGenome")
|
|
133
|
+gmapGenomePath <- file.path(getwd(), "flyGenome")
|
134
|
134
|
gmapGenomeDirectory <- GmapGenomeDirectory(gmapGenomePath, create = TRUE)
|
135
|
135
|
##> gmapGenomeDirectory
|
136
|
136
|
##GmapGenomeDirectory object
|
...
|
...
|
@@ -151,41 +151,51 @@ gmapGenome <- GmapGenome(genome=Dmelanogaster,
|
151
|
151
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
152
|
152
|
|
153
|
153
|
The \software{GSNAP} algorithm incorporates biological knowledge to
|
154
|
|
-provide accurate alignments, particularly for RNA-seq data. The
|
155
|
|
-following example synthesizes some input reads and aligns them using
|
156
|
|
-\software{GSNAP}.
|
157
|
|
-
|
158
|
|
-If you have created the D. melanogaster \Rclass{GmapGenome} package as
|
159
|
|
-detailed above and installed it, you can begin this section as
|
160
|
|
-follows:
|
|
154
|
+provide accurate alignments, particularly for RNA-seq data. In this
|
|
155
|
+section, we will align reads from an RNA-seq experiment provided in a
|
|
156
|
+fastq file to a selected region of the human genome. In this example,
|
|
157
|
+we will align to the region of the human genome containing TP53 plus
|
|
158
|
+an additional one megabase on each side of this gene.
|
|
159
|
+
|
|
160
|
+First we need to obtain the desired region of interest from the genome:
|
|
161
|
+
|
|
162
|
+<<get_TP53_coordinates>>=
|
|
163
|
+library("org.Hs.eg.db")
|
|
164
|
+library("TxDb.Hsapiens.UCSC.hg19.knownGene")
|
|
165
|
+eg <- org.Hs.eg.db::org.Hs.egSYMBOL2EG[["TP53"]]
|
|
166
|
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
|
|
167
|
+tx <- transcripts(txdb, vals = list(gene_id = eg))
|
|
168
|
+roi <- range(tx) + 1e6
|
|
169
|
+@
|
161
|
170
|
|
162
|
|
-<<<load_GmapGenome, eval=FALSE>>=
|
163
|
|
-library("GmapGenome.Dmelanogaster.UCSC.dm3")
|
164
|
|
-gmapGenome <- GmapGenome.Dmelanogaster.UCSC.dm3
|
|
171
|
+Next we get the genetic sequence and use it to create a GmapGenome
|
|
172
|
+object. (Please note that the TP53_demo GmapGenome object is used by
|
|
173
|
+many examples and tests in the gmapR and VariationTools packages. If
|
|
174
|
+the object has been created before, its subsequent creation will be
|
|
175
|
+instantaneous.)
|
|
176
|
+
|
|
177
|
+<<get_TP53_seq>>=
|
|
178
|
+library("BSgenome.Hsapiens.UCSC.hg19")
|
|
179
|
+p53Seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, roi,
|
|
180
|
+ as.character = FALSE)
|
|
181
|
+names(p53Seq) <- "TP53"
|
|
182
|
+gmapGenome <- GmapGenome(genome = p53Seq, name = "TP53_demo", create = TRUE)
|
165
|
183
|
@
|
166
|
184
|
|
167
|
|
-Otherwise, you can create the \Rcode{gmapGenome} object as detailed in the
|
168
|
|
-initial section of this vignette.
|
|
185
|
+The data package \software{LungCancerLines} contains fastqs of reads
|
|
186
|
+obtained from sequencing H1993 and H2073 cell lines. We will use these
|
|
187
|
+fastqs to demonstrate \software{GSNAP} alignment with \software{gmapR}.
|
169
|
188
|
|
170
|
|
-<<align_with_gsnap, eval=FALSE>>=
|
171
|
|
-##synthesize paired-end FASTA input to align against the genome
|
172
|
|
-library("BSgenome.Dmelanogaster.UCSC.dm3")
|
173
|
|
-dnaStringSet <- Dmelanogaster$chr4
|
174
|
|
-makeTestFasta <- function(seqStarts, pairNum) {
|
175
|
|
- seqEnds <- seqStarts + 74
|
176
|
|
- nucleotides <- sapply(seq_along(seqStarts),
|
177
|
|
- function(i) as.character(subseq(dnaStringSet, seqStarts[i], seqEnds[i]))
|
178
|
|
- )
|
179
|
|
- fastaHeaders <- paste0(">", "fakeRead", seq_along(nucleotides), "#0/", pairNum)
|
180
|
|
- fastaLines <- paste(fastaHeaders, nucleotides, sep="\n")
|
181
|
|
- tmpFasta <- paste0(tempfile(), ".", pairNum, ".fa")
|
182
|
|
- writeLines(fastaLines, con=tmpFasta)
|
183
|
|
- tmpFasta
|
184
|
|
-}
|
185
|
|
-seqStarts <- seq(1, 100000, by = 10000) + length(dnaStringSet) * 2 / 3
|
186
|
|
-tmpFasta1 <- makeTestFasta(seqStarts, '1')
|
187
|
|
-tmpFasta2 <- makeTestFasta(seqStarts + 125, '2')
|
|
189
|
+<<get_lung_cancer_fastqs>>=
|
|
190
|
+library("LungCancerLines")
|
|
191
|
+fastqs <- LungCancerFastqFiles()
|
|
192
|
+@
|
188
|
193
|
|
|
194
|
+\software{GSNAP} is highly configurable. Users create
|
|
195
|
+\Rclass{GsnapParam} objects to specify desired \software{GSNAP}
|
|
196
|
+behavior.
|
|
197
|
+
|
|
198
|
+<<create_gsnapParam>>=
|
189
|
199
|
##specify how GSNAP should behave using a GsnapParam object
|
190
|
200
|
gsnapParam <- GsnapParam(genome = gmapGenome,
|
191
|
201
|
unique_only = FALSE,
|
...
|
...
|
@@ -194,10 +204,15 @@ gsnapParam <- GsnapParam(genome = gmapGenome,
|
194
|
204
|
npaths = 10,
|
195
|
205
|
novelsplicing = FALSE, splicing = NULL,
|
196
|
206
|
nthreads = 1,
|
197
|
|
- batch = 2L)
|
|
207
|
+ batch = 2L,
|
|
208
|
+ gunzip=TRUE)
|
|
209
|
+@
|
198
|
210
|
|
199
|
|
-gsnapOutput <- gsnap(input_a=tmpFasta1,
|
200
|
|
- input_b=tmpFasta2,
|
|
211
|
+Now we are ready to align.
|
|
212
|
+
|
|
213
|
+<<align_with_gsnap, eval=FALSE>>=
|
|
214
|
+gsnapOutput <- gsnap(input_a=fastqs["H1993.first"],
|
|
215
|
+ input_b=fastqs["H1993.last"],
|
201
|
216
|
params=gsnapParam)
|
202
|
217
|
##gsnapOutput
|
203
|
218
|
##An object of class "GsnapOutput"
|
...
|
...
|
@@ -244,29 +259,41 @@ alignments that \software{GSNAP} outputs along with other utilities.
|
244
|
259
|
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
245
|
260
|
|
246
|
261
|
Running the \Rmethod{bam\_tally} method will return a \Rclass{GRanges}
|
247
|
|
-of information per nucleotide. Below is a basic example. See the
|
248
|
|
-documentation for the \Rmethod{bam\_tally} method for details.
|
|
262
|
+of information per nucleotide. Below is an example demonstrating how
|
|
263
|
+to find variants in the TP53 gene of the human genome. See the
|
|
264
|
+documentation for the \Rmethod{bam\_tally} method for more details.
|
|
265
|
+
|
|
266
|
+\software{gmapR} provides access to a demo genome for examples. This
|
|
267
|
+genome encompasses the TP53 gene along with a 1-megabase flanking
|
|
268
|
+region on each side.
|
|
269
|
+<<TP53Genome_accessor>>=
|
|
270
|
+genome <- TP53Genome()
|
|
271
|
+@
|
|
272
|
+
|
|
273
|
+The \software{LungCancerLines} R package contains a BAM file of reads
|
|
274
|
+aligned to the TP53 region of the human genome. We'll use this file to
|
|
275
|
+demonstrate the use of \software{bam_tally} through the
|
|
276
|
+\software{gmapR} package. The resulting data structure will contain
|
|
277
|
+the needed information such as number of alternative alleles, quality
|
|
278
|
+scores, and nucleotide cycle for each allele.
|
249
|
279
|
|
250
|
280
|
<<run_bamtally, eval=FALSE>>=
|
251
|
281
|
library(gmapR)
|
252
|
|
-library(GmapGenome.Hsapiens.UCSC.hg19)
|
253
|
|
-genome <- GmapGenome.Hsapiens.UCSC.hg19
|
254
|
282
|
|
255
|
|
-bam_file <- file.path(system.file(package = "gmapR", mustWork=TRUE),
|
256
|
|
- "extdata/bam_tally_test.bam")
|
|
283
|
+bam_file <- system.file("extdata/H1993.analyzed.bam",
|
|
284
|
+ package="LungCancerLines", mustWork=TRUE)
|
257
|
285
|
|
258
|
286
|
breaks <- c(0L, 15L, 60L, 75L)
|
259
|
287
|
bqual <- 56L
|
260
|
288
|
mapq <- 13L
|
261
|
289
|
param <- BamTallyParam(genome, cycle_breaks = breaks,
|
262
|
|
- high_quality_cutoff = bqual,
|
|
290
|
+ high_base_quality = bqual,
|
263
|
291
|
minimum_mapq = mapq,
|
264
|
292
|
concordant_only = FALSE, unique_only = FALSE,
|
265
|
293
|
primary_only = FALSE,
|
266
|
294
|
min_depth = 0L, variant_strand = 1L,
|
267
|
295
|
ignore_query_Ns = TRUE,
|
268
|
296
|
indels = FALSE)
|
269
|
|
-
|
270
|
297
|
tallies <-bam_tally(bam_file,
|
271
|
298
|
param)
|
272
|
299
|
@
|
...
|
...
|
@@ -330,26 +357,28 @@ repository, etc. After installation,
|
330
|
357
|
\Rcode{library("GmapGenome.Hsapiens.UCSC.hg19")} will load a
|
331
|
358
|
\Rclass{GmapGenome} object that has the same name as the package.
|
332
|
359
|
|
333
|
|
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
334
|
|
-\section{Aligning with SNP Tolerance}
|
335
|
|
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
336
|
360
|
|
337
|
|
-Both \software{GSNAP} and \software{GMAP} have the ability to perform
|
338
|
|
-alignment without penalizing against a set of known SNPs. In this
|
339
|
|
-section, we work through an example of aligning with this SNP
|
340
|
|
-tolerance. By using SNP-tolerance, we'll perform alignments without
|
341
|
|
-penalizing reads that match SNPs contained in the provided SNPs. In
|
342
|
|
-this case, we'll use dbSNP to provide the SNPs.
|
343
|
361
|
|
344
|
|
-This example will use hg19. If you have not installed
|
345
|
|
-\Rcode{GmapGenome.Hsapiens.UCSC.hg19}, please refer to the section
|
346
|
|
-\ref{create_hg19_GmapGenome}.
|
347
|
|
-
|
348
|
|
-First we load the Hsapiens \Rclass{GmapGenome} object:
|
|
362
|
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
363
|
+%\section{Aligning with SNP Tolerance}
|
|
364
|
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
349
|
365
|
|
350
|
|
-<<<load_GmapGenomeHsapiens>>=
|
351
|
|
-library("GmapGenome.Hsapiens.UCSC.hg19")
|
352
|
|
-@
|
|
366
|
+%Both \software{GSNAP} and \software{GMAP} have the ability to perform
|
|
367
|
+%alignment without penalizing against a set of known SNPs. In this
|
|
368
|
+%section, we work through an example of aligning with this SNP
|
|
369
|
+%tolerance. By using SNP-tolerance, we'll perform alignments without
|
|
370
|
+%penalizing reads that match SNPs contained in the provided SNPs. In
|
|
371
|
+%this case, we'll use dbSNP to provide the SNPs.
|
|
372
|
+%
|
|
373
|
+%This example will use hg19. If you have not installed
|
|
374
|
+%\Rcode{GmapGenome.Hsapiens.UCSC.hg19}, please refer to the section
|
|
375
|
+%\ref{create_hg19_GmapGenome}.
|
|
376
|
+%
|
|
377
|
+%First we load the Hsapiens \Rclass{GmapGenome} object:
|
|
378
|
+%
|
|
379
|
+%<<<load_GmapGenomeHsapiens>>=
|
|
380
|
+%library("GmapGenome.Hsapiens.UCSC.hg19")
|
|
381
|
+%@
|
353
|
382
|
|
354
|
383
|
<<SessionInfo>>=
|
355
|
384
|
sessionInfo()
|