... | ... |
@@ -125,8 +125,9 @@ example demonstrates: |
125 | 125 |
library(gmapR) |
126 | 126 |
|
127 | 127 |
if (!suppressWarnings(require(BSgenome.Dmelanogaster.UCSC.dm3))) { |
128 |
- source("http://bioconductor.org/biocLite.R") |
|
129 |
- biocLite("BSgenome.Dmelanogaster.UCSC.dm3") |
|
128 |
+ if (!requireNamespace("BiocManager", quietly=TRUE)) |
|
129 |
+ install.packages("BiocManager") |
|
130 |
+ BiocManager::install("BSgenome.Dmelanogaster.UCSC.dm3") |
|
130 | 131 |
library(BSgenome.Dmelanogaster.UCSC.dm3) |
131 | 132 |
} |
132 | 133 |
|
... | ... |
@@ -348,8 +349,9 @@ object: |
348 | 349 |
|
349 | 350 |
<<create_hg19_GmapGenome, eval=FALSE>>= |
350 | 351 |
if (!suppressWarnings(require(BSgenome.Hsapiens.UCSC.hg19))) { |
351 |
- source("http://bioconductor.org/biocLite.R") |
|
352 |
- biocLite("BSgenome.Hsapiens.UCSC.hg19") |
|
352 |
+ if (!requireNamespace("BiocManager", quietly=TRUE)) |
|
353 |
+ install.packages("BiocManager") |
|
354 |
+ BiocManager::install("BSgenome.Hsapiens.UCSC.hg19") |
|
353 | 355 |
library(BSgenome.Hsapiens.UCSC.hg19) |
354 | 356 |
} |
355 | 357 |
gmapGenome <- GmapGenome(genome=Hsapiens, |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@123972 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -308,10 +308,11 @@ param <- BamTallyParam(genome, |
308 | 308 |
primary_only = FALSE, |
309 | 309 |
min_depth = 0L, variant_strand = 1L, |
310 | 310 |
ignore_query_Ns = TRUE, |
311 |
- indels = FALSE, include_soft_clips = 1L, xs=TRUE) |
|
311 |
+ indels = FALSE, include_soft_clips = 1L, xs=TRUE, |
|
312 |
+ min_base_quality = 23L) |
|
312 | 313 |
tallies <-bam_tally(bam_file, |
313 | 314 |
param) |
314 |
-variantSummary(tallies, high_base_quality = 23L) |
|
315 |
+variantSummary(tallies) |
|
315 | 316 |
@ |
316 | 317 |
|
317 | 318 |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@115831 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -165,7 +165,7 @@ library("org.Hs.eg.db") |
165 | 165 |
library("TxDb.Hsapiens.UCSC.hg19.knownGene") |
166 | 166 |
eg <- org.Hs.eg.db::org.Hs.egSYMBOL2EG[["TP53"]] |
167 | 167 |
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene |
168 |
-tx <- transcripts(txdb, vals = list(gene_id = eg)) |
|
168 |
+tx <- transcripts(txdb, filter = list(gene_id = eg)) |
|
169 | 169 |
roi <- range(tx) + 1e6 |
170 | 170 |
strand(roi) <- "*" |
171 | 171 |
@ |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@110039 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -308,7 +308,7 @@ param <- BamTallyParam(genome, |
308 | 308 |
primary_only = FALSE, |
309 | 309 |
min_depth = 0L, variant_strand = 1L, |
310 | 310 |
ignore_query_Ns = TRUE, |
311 |
- indels = FALSE, include_soft_clips = 1L) |
|
311 |
+ indels = FALSE, include_soft_clips = 1L, xs=TRUE) |
|
312 | 312 |
tallies <-bam_tally(bam_file, |
313 | 313 |
param) |
314 | 314 |
variantSummary(tallies, high_base_quality = 23L) |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@102487 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -308,7 +308,7 @@ param <- BamTallyParam(genome, |
308 | 308 |
primary_only = FALSE, |
309 | 309 |
min_depth = 0L, variant_strand = 1L, |
310 | 310 |
ignore_query_Ns = TRUE, |
311 |
- indels = FALSE, include_soft_clips = 1L, xs=TRUE) |
|
311 |
+ indels = FALSE, include_soft_clips = 1L) |
|
312 | 312 |
tallies <-bam_tally(bam_file, |
313 | 313 |
param) |
314 | 314 |
variantSummary(tallies, high_base_quality = 23L) |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@102485 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -308,7 +308,7 @@ param <- BamTallyParam(genome, |
308 | 308 |
primary_only = FALSE, |
309 | 309 |
min_depth = 0L, variant_strand = 1L, |
310 | 310 |
ignore_query_Ns = TRUE, |
311 |
- indels = FALSE, include_soft_clips = 1L) |
|
311 |
+ indels = FALSE, include_soft_clips = 1L, xs=TRUE) |
|
312 | 312 |
tallies <-bam_tally(bam_file, |
313 | 313 |
param) |
314 | 314 |
variantSummary(tallies, high_base_quality = 23L) |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@93316 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -308,7 +308,7 @@ param <- BamTallyParam(genome, |
308 | 308 |
primary_only = FALSE, |
309 | 309 |
min_depth = 0L, variant_strand = 1L, |
310 | 310 |
ignore_query_Ns = TRUE, |
311 |
- indels = FALSE, include_soft_clips = 1L, count_xs = TRUE) |
|
311 |
+ indels = FALSE, include_soft_clips = 1L) |
|
312 | 312 |
tallies <-bam_tally(bam_file, |
313 | 313 |
param) |
314 | 314 |
variantSummary(tallies, high_base_quality = 23L) |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@90225 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -308,7 +308,7 @@ param <- BamTallyParam(genome, |
308 | 308 |
primary_only = FALSE, |
309 | 309 |
min_depth = 0L, variant_strand = 1L, |
310 | 310 |
ignore_query_Ns = TRUE, |
311 |
- indels = FALSE, include_soft_clips = 1L) |
|
311 |
+ indels = FALSE, include_soft_clips = 1L, count_xs = TRUE) |
|
312 | 312 |
tallies <-bam_tally(bam_file, |
313 | 313 |
param) |
314 | 314 |
variantSummary(tallies, high_base_quality = 23L) |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@88343 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -308,7 +308,7 @@ param <- BamTallyParam(genome, |
308 | 308 |
primary_only = FALSE, |
309 | 309 |
min_depth = 0L, variant_strand = 1L, |
310 | 310 |
ignore_query_Ns = TRUE, |
311 |
- indels = FALSE) |
|
311 |
+ indels = FALSE, include_soft_clips = 1L) |
|
312 | 312 |
tallies <-bam_tally(bam_file, |
313 | 313 |
param) |
314 | 314 |
variantSummary(tallies, high_base_quality = 23L) |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@87282 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -182,9 +182,12 @@ library("gmapR") |
182 | 182 |
p53Seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, roi, |
183 | 183 |
as.character = FALSE) |
184 | 184 |
names(p53Seq) <- "TP53" |
185 |
-gmapGenome <- GmapGenome(genome = p53Seq, name = "TP53_demo", create = TRUE, |
|
185 |
+gmapGenome <- GmapGenome(genome = p53Seq, |
|
186 |
+ name = paste0("TP53_demo_", |
|
187 |
+ packageVersion("TxDb.Hsapiens.UCSC.hg19.knownGene")), |
|
188 |
+ create = TRUE, |
|
186 | 189 |
k = 12L) |
187 |
-@ |
|
190 |
+@ |
|
188 | 191 |
|
189 | 192 |
We add the known transcripts (splice sites) to the genome index: |
190 | 193 |
<<set_TP53_splicesites>>= |
... | ... |
@@ -294,11 +297,8 @@ summarizing these data into R data structures. For now, there is one: |
294 | 297 |
\Rfunction{variantSummary}, which returns a \Rcode{VRanges} object |
295 | 298 |
describing putative genetic variants in the sample. |
296 | 299 |
<<run_bamtally, eval=FALSE>>= |
297 |
-library(gmapR) |
|
298 |
- |
|
299 | 300 |
bam_file <- system.file("extdata/H1993.analyzed.bam", |
300 | 301 |
package="LungCancerLines", mustWork=TRUE) |
301 |
- |
|
302 | 302 |
breaks <- c(0L, 15L, 60L, 75L) |
303 | 303 |
bqual <- 56L |
304 | 304 |
mapq <- 13L |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@86371 bc3139a8-67e5-0310-9ffc-ced21a209358
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@85509 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -137,9 +137,9 @@ gmapGenomeDirectory <- GmapGenomeDirectory(gmapGenomePath, create = TRUE) |
137 | 137 |
##path: /reshpcfs/home/coryba/projects/gmapR2/testGenome |
138 | 138 |
|
139 | 139 |
gmapGenome <- GmapGenome(genome=Dmelanogaster, |
140 |
- directory = gmapGenomeDirectory, |
|
141 |
- name = "dm3", |
|
142 |
- create = TRUE, |
|
140 |
+ directory=gmapGenomeDirectory, |
|
141 |
+ name="dm3", |
|
142 |
+ create=TRUE, |
|
143 | 143 |
k = 12L) |
144 | 144 |
##> gmapGenome |
145 | 145 |
##GmapGenome object |
... | ... |
@@ -167,6 +167,7 @@ eg <- org.Hs.eg.db::org.Hs.egSYMBOL2EG[["TP53"]] |
167 | 167 |
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene |
168 | 168 |
tx <- transcripts(txdb, vals = list(gene_id = eg)) |
169 | 169 |
roi <- range(tx) + 1e6 |
170 |
+strand(roi) <- "*" |
|
170 | 171 |
@ |
171 | 172 |
|
172 | 173 |
Next we get the genetic sequence and use it to create a GmapGenome |
... | ... |
@@ -181,7 +182,14 @@ library("gmapR") |
181 | 182 |
p53Seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, roi, |
182 | 183 |
as.character = FALSE) |
183 | 184 |
names(p53Seq) <- "TP53" |
184 |
-gmapGenome <- GmapGenome(genome = p53Seq, name = "TP53_demo", create = TRUE, k = 12L) |
|
185 |
+gmapGenome <- GmapGenome(genome = p53Seq, name = "TP53_demo", create = TRUE, |
|
186 |
+ k = 12L) |
|
187 |
+@ |
|
188 |
+ |
|
189 |
+We add the known transcripts (splice sites) to the genome index: |
|
190 |
+<<set_TP53_splicesites>>= |
|
191 |
+exons <- gmapR:::subsetRegion(exonsBy(txdb), roi, "TP53") |
|
192 |
+spliceSites(gmapGenome, "knownGene") <- exons |
|
185 | 193 |
@ |
186 | 194 |
|
187 | 195 |
The data package \software{LungCancerLines} contains fastqs of reads |
... | ... |
@@ -199,15 +207,15 @@ behavior. |
199 | 207 |
|
200 | 208 |
<<create_gsnapParam>>= |
201 | 209 |
##specify how GSNAP should behave using a GsnapParam object |
202 |
-gsnapParam <- GsnapParam(genome = gmapGenome, |
|
203 |
- unique_only = FALSE, |
|
204 |
- max_mismatches = NULL, |
|
205 |
- suboptimal_levels = 0, mode = "standard", |
|
206 |
- npaths = 10, |
|
207 |
- novelsplicing = FALSE, splicing = NULL, |
|
208 |
- nthreads = 1, |
|
209 |
- batch = 2L, |
|
210 |
- gunzip=TRUE) |
|
210 |
+gsnapParam <- GsnapParam(genome=gmapGenome, |
|
211 |
+ unique_only=FALSE, |
|
212 |
+ suboptimal_levels=2L, |
|
213 |
+ npaths=10L, |
|
214 |
+ novelsplicing=TRUE, |
|
215 |
+ splicing="knownGene", |
|
216 |
+ indel_penalty=1L, |
|
217 |
+ distant_splice_penalty=1L, |
|
218 |
+ clip_overlaps=TRUE) |
|
211 | 219 |
@ |
212 | 220 |
|
213 | 221 |
Now we are ready to align. |
... | ... |
@@ -216,6 +224,7 @@ Now we are ready to align. |
216 | 224 |
gsnapOutput <- gsnap(input_a=fastqs["H1993.first"], |
217 | 225 |
input_b=fastqs["H1993.last"], |
218 | 226 |
params=gsnapParam) |
227 |
+ |
|
219 | 228 |
##gsnapOutput |
220 | 229 |
##An object of class "GsnapOutput" |
221 | 230 |
##Slot "path": |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@80762 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -282,7 +282,7 @@ scores, and read position for each allele. |
282 | 282 |
The call to \Rfunction{bam\_tally} returns an opaque pointer to a |
283 | 283 |
C-level data structure. We anticipate having multiple means of |
284 | 284 |
summarizing these data into R data structures. For now, there is one: |
285 |
-\Rfunction{variantSummary}, which returns a \code{VRanges} object |
|
285 |
+\Rfunction{variantSummary}, which returns a \Rcode{VRanges} object |
|
286 | 286 |
describing putative genetic variants in the sample. |
287 | 287 |
<<run_bamtally, eval=FALSE>>= |
288 | 288 |
library(gmapR) |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@80611 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -279,6 +279,11 @@ demonstrate the use of \software{bam\_tally} through the |
279 | 279 |
the needed information such as number of alternative alleles, quality |
280 | 280 |
scores, and read position for each allele. |
281 | 281 |
|
282 |
+The call to \Rfunction{bam\_tally} returns an opaque pointer to a |
|
283 |
+C-level data structure. We anticipate having multiple means of |
|
284 |
+summarizing these data into R data structures. For now, there is one: |
|
285 |
+\Rfunction{variantSummary}, which returns a \code{VRanges} object |
|
286 |
+describing putative genetic variants in the sample. |
|
282 | 287 |
<<run_bamtally, eval=FALSE>>= |
283 | 288 |
library(gmapR) |
284 | 289 |
|
... | ... |
@@ -288,8 +293,7 @@ bam_file <- system.file("extdata/H1993.analyzed.bam", |
288 | 293 |
breaks <- c(0L, 15L, 60L, 75L) |
289 | 294 |
bqual <- 56L |
290 | 295 |
mapq <- 13L |
291 |
-param <- BamTallyParam(genome, read_pos_breaks = breaks, |
|
292 |
- high_base_quality = bqual, |
|
296 |
+param <- BamTallyParam(genome, |
|
293 | 297 |
minimum_mapq = mapq, |
294 | 298 |
concordant_only = FALSE, unique_only = FALSE, |
295 | 299 |
primary_only = FALSE, |
... | ... |
@@ -298,6 +302,7 @@ param <- BamTallyParam(genome, read_pos_breaks = breaks, |
298 | 302 |
indels = FALSE) |
299 | 303 |
tallies <-bam_tally(bam_file, |
300 | 304 |
param) |
305 |
+variantSummary(tallies, high_base_quality = 23L) |
|
301 | 306 |
@ |
302 | 307 |
|
303 | 308 |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@78427 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -277,7 +277,7 @@ aligned to the TP53 region of the human genome. We'll use this file to |
277 | 277 |
demonstrate the use of \software{bam\_tally} through the |
278 | 278 |
\software{gmapR} package. The resulting data structure will contain |
279 | 279 |
the needed information such as number of alternative alleles, quality |
280 |
-scores, and nucleotide cycle for each allele. |
|
280 |
+scores, and read position for each allele. |
|
281 | 281 |
|
282 | 282 |
<<run_bamtally, eval=FALSE>>= |
283 | 283 |
library(gmapR) |
... | ... |
@@ -288,7 +288,7 @@ bam_file <- system.file("extdata/H1993.analyzed.bam", |
288 | 288 |
breaks <- c(0L, 15L, 60L, 75L) |
289 | 289 |
bqual <- 56L |
290 | 290 |
mapq <- 13L |
291 |
-param <- BamTallyParam(genome, cycle_breaks = breaks, |
|
291 |
+param <- BamTallyParam(genome, read_pos_breaks = breaks, |
|
292 | 292 |
high_base_quality = bqual, |
293 | 293 |
minimum_mapq = mapq, |
294 | 294 |
concordant_only = FALSE, unique_only = FALSE, |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@71533 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -181,7 +181,7 @@ library("gmapR") |
181 | 181 |
p53Seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, roi, |
182 | 182 |
as.character = FALSE) |
183 | 183 |
names(p53Seq) <- "TP53" |
184 |
-gmapGenome <- GmapGenome(genome = p53Seq, name = "TP53_demo", create = TRUE) |
|
184 |
+gmapGenome <- GmapGenome(genome = p53Seq, name = "TP53_demo", create = TRUE, k = 12L) |
|
185 | 185 |
@ |
186 | 186 |
|
187 | 187 |
The data package \software{LungCancerLines} contains fastqs of reads |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@71493 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -139,7 +139,8 @@ gmapGenomeDirectory <- GmapGenomeDirectory(gmapGenomePath, create = TRUE) |
139 | 139 |
gmapGenome <- GmapGenome(genome=Dmelanogaster, |
140 | 140 |
directory = gmapGenomeDirectory, |
141 | 141 |
name = "dm3", |
142 |
- create = TRUE) |
|
142 |
+ create = TRUE, |
|
143 |
+ k = 12L) |
|
143 | 144 |
##> gmapGenome |
144 | 145 |
##GmapGenome object |
145 | 146 |
##genome: dm3 |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@69291 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -169,7 +169,7 @@ roi <- range(tx) + 1e6 |
169 | 169 |
@ |
170 | 170 |
|
171 | 171 |
Next we get the genetic sequence and use it to create a GmapGenome |
172 |
-object. (Please note that the TP53_demo GmapGenome object is used by |
|
172 |
+object. (Please note that the TP53\_demo GmapGenome object is used by |
|
173 | 173 |
many examples and tests in the gmapR and VariationTools packages. If |
174 | 174 |
the object has been created before, its subsequent creation will be |
175 | 175 |
instantaneous.) |
... | ... |
@@ -273,7 +273,7 @@ genome <- TP53Genome() |
273 | 273 |
|
274 | 274 |
The \software{LungCancerLines} R package contains a BAM file of reads |
275 | 275 |
aligned to the TP53 region of the human genome. We'll use this file to |
276 |
-demonstrate the use of \software{bam_tally} through the |
|
276 |
+demonstrate the use of \software{bam\_tally} through the |
|
277 | 277 |
\software{gmapR} package. The resulting data structure will contain |
278 | 278 |
the needed information such as number of alternative alleles, quality |
279 | 279 |
scores, and nucleotide cycle for each allele. |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@69290 bc3139a8-67e5-0310-9ffc-ced21a209358
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@69287 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -130,7 +130,7 @@ if (!suppressWarnings(require(BSgenome.Dmelanogaster.UCSC.dm3))) { |
130 | 130 |
library(BSgenome.Dmelanogaster.UCSC.dm3) |
131 | 131 |
} |
132 | 132 |
|
133 |
-gmapGenomePath <- file.path(getwd(), "testGenome") |
|
133 |
+gmapGenomePath <- file.path(getwd(), "flyGenome") |
|
134 | 134 |
gmapGenomeDirectory <- GmapGenomeDirectory(gmapGenomePath, create = TRUE) |
135 | 135 |
##> gmapGenomeDirectory |
136 | 136 |
##GmapGenomeDirectory object |
... | ... |
@@ -151,41 +151,51 @@ gmapGenome <- GmapGenome(genome=Dmelanogaster, |
151 | 151 |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
152 | 152 |
|
153 | 153 |
The \software{GSNAP} algorithm incorporates biological knowledge to |
154 |
-provide accurate alignments, particularly for RNA-seq data. The |
|
155 |
-following example synthesizes some input reads and aligns them using |
|
156 |
-\software{GSNAP}. |
|
157 |
- |
|
158 |
-If you have created the D. melanogaster \Rclass{GmapGenome} package as |
|
159 |
-detailed above and installed it, you can begin this section as |
|
160 |
-follows: |
|
154 |
+provide accurate alignments, particularly for RNA-seq data. In this |
|
155 |
+section, we will align reads from an RNA-seq experiment provided in a |
|
156 |
+fastq file to a selected region of the human genome. In this example, |
|
157 |
+we will align to the region of the human genome containing TP53 plus |
|
158 |
+an additional one megabase on each side of this gene. |
|
159 |
+ |
|
160 |
+First we need to obtain the desired region of interest from the genome: |
|
161 |
+ |
|
162 |
+<<get_TP53_coordinates>>= |
|
163 |
+library("org.Hs.eg.db") |
|
164 |
+library("TxDb.Hsapiens.UCSC.hg19.knownGene") |
|
165 |
+eg <- org.Hs.eg.db::org.Hs.egSYMBOL2EG[["TP53"]] |
|
166 |
+txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene |
|
167 |
+tx <- transcripts(txdb, vals = list(gene_id = eg)) |
|
168 |
+roi <- range(tx) + 1e6 |
|
169 |
+@ |
|
161 | 170 |
|
162 |
-<<<load_GmapGenome, eval=FALSE>>= |
|
163 |
-library("GmapGenome.Dmelanogaster.UCSC.dm3") |
|
164 |
-gmapGenome <- GmapGenome.Dmelanogaster.UCSC.dm3 |
|
171 |
+Next we get the genetic sequence and use it to create a GmapGenome |
|
172 |
+object. (Please note that the TP53_demo GmapGenome object is used by |
|
173 |
+many examples and tests in the gmapR and VariationTools packages. If |
|
174 |
+the object has been created before, its subsequent creation will be |
|
175 |
+instantaneous.) |
|
176 |
+ |
|
177 |
+<<get_TP53_seq>>= |
|
178 |
+library("BSgenome.Hsapiens.UCSC.hg19") |
|
179 |
+p53Seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19::Hsapiens, roi, |
|
180 |
+ as.character = FALSE) |
|
181 |
+names(p53Seq) <- "TP53" |
|
182 |
+gmapGenome <- GmapGenome(genome = p53Seq, name = "TP53_demo", create = TRUE) |
|
165 | 183 |
@ |
166 | 184 |
|
167 |
-Otherwise, you can create the \Rcode{gmapGenome} object as detailed in the |
|
168 |
-initial section of this vignette. |
|
185 |
+The data package \software{LungCancerLines} contains fastqs of reads |
|
186 |
+obtained from sequencing H1993 and H2073 cell lines. We will use these |
|
187 |
+fastqs to demonstrate \software{GSNAP} alignment with \software{gmapR}. |
|
169 | 188 |
|
170 |
-<<align_with_gsnap, eval=FALSE>>= |
|
171 |
-##synthesize paired-end FASTA input to align against the genome |
|
172 |
-library("BSgenome.Dmelanogaster.UCSC.dm3") |
|
173 |
-dnaStringSet <- Dmelanogaster$chr4 |
|
174 |
-makeTestFasta <- function(seqStarts, pairNum) { |
|
175 |
- seqEnds <- seqStarts + 74 |
|
176 |
- nucleotides <- sapply(seq_along(seqStarts), |
|
177 |
- function(i) as.character(subseq(dnaStringSet, seqStarts[i], seqEnds[i])) |
|
178 |
- ) |
|
179 |
- fastaHeaders <- paste0(">", "fakeRead", seq_along(nucleotides), "#0/", pairNum) |
|
180 |
- fastaLines <- paste(fastaHeaders, nucleotides, sep="\n") |
|
181 |
- tmpFasta <- paste0(tempfile(), ".", pairNum, ".fa") |
|
182 |
- writeLines(fastaLines, con=tmpFasta) |
|
183 |
- tmpFasta |
|
184 |
-} |
|
185 |
-seqStarts <- seq(1, 100000, by = 10000) + length(dnaStringSet) * 2 / 3 |
|
186 |
-tmpFasta1 <- makeTestFasta(seqStarts, '1') |
|
187 |
-tmpFasta2 <- makeTestFasta(seqStarts + 125, '2') |
|
189 |
+<<get_lung_cancer_fastqs>>= |
|
190 |
+library("LungCancerLines") |
|
191 |
+fastqs <- LungCancerFastqFiles() |
|
192 |
+@ |
|
188 | 193 |
|
194 |
+\software{GSNAP} is highly configurable. Users create |
|
195 |
+\Rclass{GsnapParam} objects to specify desired \software{GSNAP} |
|
196 |
+behavior. |
|
197 |
+ |
|
198 |
+<<create_gsnapParam>>= |
|
189 | 199 |
##specify how GSNAP should behave using a GsnapParam object |
190 | 200 |
gsnapParam <- GsnapParam(genome = gmapGenome, |
191 | 201 |
unique_only = FALSE, |
... | ... |
@@ -194,10 +204,15 @@ gsnapParam <- GsnapParam(genome = gmapGenome, |
194 | 204 |
npaths = 10, |
195 | 205 |
novelsplicing = FALSE, splicing = NULL, |
196 | 206 |
nthreads = 1, |
197 |
- batch = 2L) |
|
207 |
+ batch = 2L, |
|
208 |
+ gunzip=TRUE) |
|
209 |
+@ |
|
198 | 210 |
|
199 |
-gsnapOutput <- gsnap(input_a=tmpFasta1, |
|
200 |
- input_b=tmpFasta2, |
|
211 |
+Now we are ready to align. |
|
212 |
+ |
|
213 |
+<<align_with_gsnap, eval=FALSE>>= |
|
214 |
+gsnapOutput <- gsnap(input_a=fastqs["H1993.first"], |
|
215 |
+ input_b=fastqs["H1993.last"], |
|
201 | 216 |
params=gsnapParam) |
202 | 217 |
##gsnapOutput |
203 | 218 |
##An object of class "GsnapOutput" |
... | ... |
@@ -244,29 +259,41 @@ alignments that \software{GSNAP} outputs along with other utilities. |
244 | 259 |
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
245 | 260 |
|
246 | 261 |
Running the \Rmethod{bam\_tally} method will return a \Rclass{GRanges} |
247 |
-of information per nucleotide. Below is a basic example. See the |
|
248 |
-documentation for the \Rmethod{bam\_tally} method for details. |
|
262 |
+of information per nucleotide. Below is an example demonstrating how |
|
263 |
+to find variants in the TP53 gene of the human genome. See the |
|
264 |
+documentation for the \Rmethod{bam\_tally} method for more details. |
|
265 |
+ |
|
266 |
+\software{gmapR} provides access to a demo genome for examples. This |
|
267 |
+genome encompasses the TP53 gene along with a 1-megabase flanking |
|
268 |
+region on each side. |
|
269 |
+<<TP53Genome_accessor>>= |
|
270 |
+genome <- TP53Genome() |
|
271 |
+@ |
|
272 |
+ |
|
273 |
+The \software{LungCancerLines} R package contains a BAM file of reads |
|
274 |
+aligned to the TP53 region of the human genome. We'll use this file to |
|
275 |
+demonstrate the use of \software{bam_tally} through the |
|
276 |
+\software{gmapR} package. The resulting data structure will contain |
|
277 |
+the needed information such as number of alternative alleles, quality |
|
278 |
+scores, and nucleotide cycle for each allele. |
|
249 | 279 |
|
250 | 280 |
<<run_bamtally, eval=FALSE>>= |
251 | 281 |
library(gmapR) |
252 |
-library(GmapGenome.Hsapiens.UCSC.hg19) |
|
253 |
-genome <- GmapGenome.Hsapiens.UCSC.hg19 |
|
254 | 282 |
|
255 |
-bam_file <- file.path(system.file(package = "gmapR", mustWork=TRUE), |
|
256 |
- "extdata/bam_tally_test.bam") |
|
283 |
+bam_file <- system.file("extdata/H1993.analyzed.bam", |
|
284 |
+ package="LungCancerLines", mustWork=TRUE) |
|
257 | 285 |
|
258 | 286 |
breaks <- c(0L, 15L, 60L, 75L) |
259 | 287 |
bqual <- 56L |
260 | 288 |
mapq <- 13L |
261 | 289 |
param <- BamTallyParam(genome, cycle_breaks = breaks, |
262 |
- high_quality_cutoff = bqual, |
|
290 |
+ high_base_quality = bqual, |
|
263 | 291 |
minimum_mapq = mapq, |
264 | 292 |
concordant_only = FALSE, unique_only = FALSE, |
265 | 293 |
primary_only = FALSE, |
266 | 294 |
min_depth = 0L, variant_strand = 1L, |
267 | 295 |
ignore_query_Ns = TRUE, |
268 | 296 |
indels = FALSE) |
269 |
- |
|
270 | 297 |
tallies <-bam_tally(bam_file, |
271 | 298 |
param) |
272 | 299 |
@ |
... | ... |
@@ -330,26 +357,28 @@ repository, etc. After installation, |
330 | 357 |
\Rcode{library("GmapGenome.Hsapiens.UCSC.hg19")} will load a |
331 | 358 |
\Rclass{GmapGenome} object that has the same name as the package. |
332 | 359 |
|
333 |
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
334 |
-\section{Aligning with SNP Tolerance} |
|
335 |
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
336 | 360 |
|
337 |
-Both \software{GSNAP} and \software{GMAP} have the ability to perform |
|
338 |
-alignment without penalizing against a set of known SNPs. In this |
|
339 |
-section, we work through an example of aligning with this SNP |
|
340 |
-tolerance. By using SNP-tolerance, we'll perform alignments without |
|
341 |
-penalizing reads that match SNPs contained in the provided SNPs. In |
|
342 |
-this case, we'll use dbSNP to provide the SNPs. |
|
343 | 361 |
|
344 |
-This example will use hg19. If you have not installed |
|
345 |
-\Rcode{GmapGenome.Hsapiens.UCSC.hg19}, please refer to the section |
|
346 |
-\ref{create_hg19_GmapGenome}. |
|
347 |
- |
|
348 |
-First we load the Hsapiens \Rclass{GmapGenome} object: |
|
362 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
363 |
+%\section{Aligning with SNP Tolerance} |
|
364 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
349 | 365 |
|
350 |
-<<<load_GmapGenomeHsapiens>>= |
|
351 |
-library("GmapGenome.Hsapiens.UCSC.hg19") |
|
352 |
-@ |
|
366 |
+%Both \software{GSNAP} and \software{GMAP} have the ability to perform |
|
367 |
+%alignment without penalizing against a set of known SNPs. In this |
|
368 |
+%section, we work through an example of aligning with this SNP |
|
369 |
+%tolerance. By using SNP-tolerance, we'll perform alignments without |
|
370 |
+%penalizing reads that match SNPs contained in the provided SNPs. In |
|
371 |
+%this case, we'll use dbSNP to provide the SNPs. |
|
372 |
+% |
|
373 |
+%This example will use hg19. If you have not installed |
|
374 |
+%\Rcode{GmapGenome.Hsapiens.UCSC.hg19}, please refer to the section |
|
375 |
+%\ref{create_hg19_GmapGenome}. |
|
376 |
+% |
|
377 |
+%First we load the Hsapiens \Rclass{GmapGenome} object: |
|
378 |
+% |
|
379 |
+%<<<load_GmapGenomeHsapiens>>= |
|
380 |
+%library("GmapGenome.Hsapiens.UCSC.hg19") |
|
381 |
+%@ |
|
353 | 382 |
|
354 | 383 |
<<SessionInfo>>= |
355 | 384 |
sessionInfo() |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@69142 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -155,8 +155,9 @@ provide accurate alignments, particularly for RNA-seq data. The |
155 | 155 |
following example synthesizes some input reads and aligns them using |
156 | 156 |
\software{GSNAP}. |
157 | 157 |
|
158 |
-If you've created the D. melanogaster \Rclass{GmapGenome} package as detailed above and installed |
|
159 |
-it, you can begin this section as follows: |
|
158 |
+If you have created the D. melanogaster \Rclass{GmapGenome} package as |
|
159 |
+detailed above and installed it, you can begin this section as |
|
160 |
+follows: |
|
160 | 161 |
|
161 | 162 |
<<<load_GmapGenome, eval=FALSE>>= |
162 | 163 |
library("GmapGenome.Dmelanogaster.UCSC.dm3") |
... | ... |
@@ -189,11 +190,11 @@ tmpFasta2 <- makeTestFasta(seqStarts + 125, '2') |
189 | 190 |
gsnapParam <- GsnapParam(genome = gmapGenome, |
190 | 191 |
unique_only = FALSE, |
191 | 192 |
max_mismatches = NULL, |
192 |
- suboptimal_levels = 0L, mode = "standard", |
|
193 |
- npaths = 10L, |
|
193 |
+ suboptimal_levels = 0, mode = "standard", |
|
194 |
+ npaths = 10, |
|
194 | 195 |
novelsplicing = FALSE, splicing = NULL, |
195 |
- nthreads = 1L, |
|
196 |
- batch = "2") |
|
196 |
+ nthreads = 1, |
|
197 |
+ batch = 2L) |
|
197 | 198 |
|
198 | 199 |
gsnapOutput <- gsnap(input_a=tmpFasta1, |
199 | 200 |
input_b=tmpFasta2, |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@69060 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -289,7 +289,7 @@ makeGmapGenomePackage(gmapGenome=gmapGenome, |
289 | 289 |
destDir="myDestDir", |
290 | 290 |
license="Artistic-2.0", |
291 | 291 |
pkgName="GmapGenome.Dmelanogaster.UCSC.dm3") |
292 |
-@ |
|
292 |
+@ |
|
293 | 293 |
|
294 | 294 |
After creating the package, you can run \Rcode{R CMD INSTALL |
295 | 295 |
myDestDir} to install it, or run \Rcode{R CMD build myDestDir} to |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@68698 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -295,6 +295,61 @@ After creating the package, you can run \Rcode{R CMD INSTALL |
295 | 295 |
myDestDir} to install it, or run \Rcode{R CMD build myDestDir} to |
296 | 296 |
create a distribution of the package. |
297 | 297 |
|
298 |
+Many users with be working with the human genome. Many of the examples |
|
299 |
+used in \software{gmapR} make use of a particular build of the human |
|
300 |
+genome. As such, creating a \Rclass{GmapGenome} of hg19 is |
|
301 |
+recommended. Here is one way to create it, using a \Rclass{BSgenome} |
|
302 |
+object: |
|
303 |
+ |
|
304 |
+<<create_hg19_GmapGenome, eval=FALSE>>= |
|
305 |
+if (!suppressWarnings(require(BSgenome.Hsapiens.UCSC.hg19))) { |
|
306 |
+ source("http://bioconductor.org/biocLite.R") |
|
307 |
+ biocLite("BSgenome.Hsapiens.UCSC.hg19") |
|
308 |
+ library(BSgenome.Hsapiens.UCSC.hg19) |
|
309 |
+} |
|
310 |
+gmapGenome <- GmapGenome(genome=Hsapiens, |
|
311 |
+ directory = "Hsapiens", |
|
312 |
+ name = "hg19", |
|
313 |
+ create = TRUE) |
|
314 |
+destDir <- "HsapiensGmapGenome" |
|
315 |
+pkgName <- "GmapGenome.Hsapiens.UCSC.hg19" |
|
316 |
+makeGmapGenomePackage(gmapGenome=gmapGenome, |
|
317 |
+ version="1.0.0", |
|
318 |
+ maintainer="<your.name@somewhere.com>", |
|
319 |
+ author="Your Name", |
|
320 |
+ destDir=destDir, |
|
321 |
+ license="Artistic-2.0", |
|
322 |
+ pkgName="GmapGenome.Hsapiens.UCSC.hg19") |
|
323 |
+@ |
|
324 |
+ |
|
325 |
+After running the above code, you should be able to run \Rcode{R CMD |
|
326 |
+ INSTALL GmapGenome.Hsapiens.UCSC.hg19} in the appropriate directory |
|
327 |
+from the command line, submit GmapGenome.Hsapiens.UCSC.hg19 to a |
|
328 |
+repository, etc. After installation, |
|
329 |
+\Rcode{library("GmapGenome.Hsapiens.UCSC.hg19")} will load a |
|
330 |
+\Rclass{GmapGenome} object that has the same name as the package. |
|
331 |
+ |
|
332 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
333 |
+\section{Aligning with SNP Tolerance} |
|
334 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
335 |
+ |
|
336 |
+Both \software{GSNAP} and \software{GMAP} have the ability to perform |
|
337 |
+alignment without penalizing against a set of known SNPs. In this |
|
338 |
+section, we work through an example of aligning with this SNP |
|
339 |
+tolerance. By using SNP-tolerance, we'll perform alignments without |
|
340 |
+penalizing reads that match SNPs contained in the provided SNPs. In |
|
341 |
+this case, we'll use dbSNP to provide the SNPs. |
|
342 |
+ |
|
343 |
+This example will use hg19. If you have not installed |
|
344 |
+\Rcode{GmapGenome.Hsapiens.UCSC.hg19}, please refer to the section |
|
345 |
+\ref{create_hg19_GmapGenome}. |
|
346 |
+ |
|
347 |
+First we load the Hsapiens \Rclass{GmapGenome} object: |
|
348 |
+ |
|
349 |
+<<<load_GmapGenomeHsapiens>>= |
|
350 |
+library("GmapGenome.Hsapiens.UCSC.hg19") |
|
351 |
+@ |
|
352 |
+ |
|
298 | 353 |
<<SessionInfo>>= |
299 | 354 |
sessionInfo() |
300 | 355 |
@ |
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@68552 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -1,3 +1,4 @@ |
1 |
+ |
|
1 | 2 |
% \VignetteIndexEntry{gmapR} |
2 | 3 |
% \VignetteKeywords{gmap,gsnap} |
3 | 4 |
% \VignettePackage{gmapR} |
... | ... |
@@ -256,9 +257,9 @@ bam_file <- file.path(system.file(package = "gmapR", mustWork=TRUE), |
256 | 257 |
breaks <- c(0L, 15L, 60L, 75L) |
257 | 258 |
bqual <- 56L |
258 | 259 |
mapq <- 13L |
259 |
-param <- BamTallyParam(cycle_breaks = as.integer(breaks), |
|
260 |
- high_quality_cutoff = as.integer(bqual), |
|
261 |
- minimum_mapq = as.integer(mapq), |
|
260 |
+param <- BamTallyParam(genome, cycle_breaks = breaks, |
|
261 |
+ high_quality_cutoff = bqual, |
|
262 |
+ minimum_mapq = mapq, |
|
262 | 263 |
concordant_only = FALSE, unique_only = FALSE, |
263 | 264 |
primary_only = FALSE, |
264 | 265 |
min_depth = 0L, variant_strand = 1L, |
... | ... |
@@ -266,7 +267,6 @@ param <- BamTallyParam(cycle_breaks = as.integer(breaks), |
266 | 267 |
indels = FALSE) |
267 | 268 |
|
268 | 269 |
tallies <-bam_tally(bam_file, |
269 |
- genome, |
|
270 | 270 |
param) |
271 | 271 |
@ |
272 | 272 |
|
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@68172 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,301 @@ |
1 |
+% \VignetteIndexEntry{gmapR} |
|
2 |
+% \VignetteKeywords{gmap,gsnap} |
|
3 |
+% \VignettePackage{gmapR} |
|
4 |
+\documentclass[10pt]{article} |
|
5 |
+ |
|
6 |
+\usepackage{times} |
|
7 |
+\usepackage{hyperref} |
|
8 |
+\usepackage{Sweave} |
|
9 |
+ |
|
10 |
+\textwidth=6.5in |
|
11 |
+\textheight=8.5in |
|
12 |
+% \parskip=.3cm |
|
13 |
+\oddsidemargin=-.1in |
|
14 |
+\evensidemargin=-.1in |
|
15 |
+\headheight=-.3in |
|
16 |
+ |
|
17 |
+\newcommand{\Rfunction}[1]{{\texttt{#1}}} |
|
18 |
+\newcommand{\Robject}[1]{{\texttt{#1}}} |
|
19 |
+\newcommand{\Rpackage}[1]{{\textit{#1}}} |
|
20 |
+\newcommand{\Rmethod}[1]{{\texttt{#1}}} |
|
21 |
+\newcommand{\Rfunarg}[1]{{\texttt{#1}}} |
|
22 |
+\newcommand{\Rclass}[1]{{\textit{#1}}} |
|
23 |
+\newcommand{\Rcode}[1]{{\texttt{#1}}} |
|
24 |
+ |
|
25 |
+\newcommand{\software}[1]{\textsf{#1}} |
|
26 |
+\newcommand{\R}{\software{R}} |
|
27 |
+ |
|
28 |
+\title{gmapR: Use the GMAP Suite of Tools in R} |
|
29 |
+\author{Michael Lawrence, Cory Barr} |
|
30 |
+\date{\today} |
|
31 |
+ |
|
32 |
+\begin{document} |
|
33 |
+ |
|
34 |
+\maketitle |
|
35 |
+\tableofcontents |
|
36 |
+\newpage |
|
37 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
38 |
+\section{Introduction} |
|
39 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
40 |
+ |
|
41 |
+The \Rpackage{gmapR} packages provides users with a way to access |
|
42 |
+\software{GSNAP}, \software{bam\_tally}, and other utilities from the |
|
43 |
+\software{GMAP} suite of tools within an R session. In this vignette, |
|
44 |
+we briefly look at the \software{GMAP} suite of tools available |
|
45 |
+through the \Rpackage{gmapR} package and work through an example. |
|
46 |
+ |
|
47 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
48 |
+\section{What is \software{GMAP}, \software{GSNAP}, and \software{bam\_tally}?} |
|
49 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
50 |
+ |
|
51 |
+The \software{GMAP} suite offers useful tools for the following: |
|
52 |
+ |
|
53 |
+\begin{itemize} |
|
54 |
+ |
|
55 |
+ \item Genomic mapping: Given a cDNA, find where it best aligns to |
|
56 |
+ an entire genome |
|
57 |
+ |
|
58 |
+ \item Genomic alignment: Given a cDNA and a genomic segment, |
|
59 |
+ provide a nucleotide-level correspondence for the exons of the |
|
60 |
+ cDNA to the genomic segment |
|
61 |
+ |
|
62 |
+ \item Summarization via coverage plus reference and allele |
|
63 |
+ nucleotide polymorphism counts for an aligned set of sequencing |
|
64 |
+ reads over a given genomic location |
|
65 |
+ |
|
66 |
+\end{itemize} |
|
67 |
+ |
|
68 |
+\software{GMAP} (Genomic Mapping and Alignment Program) is |
|
69 |
+particularly suited to relatively long mRNA and EST sequences such as |
|
70 |
+those that are obtained from Roche 454 or Pacific Biosciences |
|
71 |
+sequencing technologies. (At present, only \software{GSNAP} is |
|
72 |
+available through the \Rpackage{gmapR}. \software{GMAP} integration is |
|
73 |
+scheduled for the near future.) |
|
74 |
+ |
|
75 |
+\software{GSNAP} (Genomic Short-read Nucleotide Alignment Program) |
|
76 |
+also provides users with genomic mapping and alignment capabilities, |
|
77 |
+but is optimized to handle issues that arise when dealing with the |
|
78 |
+alignment of short reads generated from sequencing technologies such |
|
79 |
+as those from Illumina/Solexa or ABI/SOLiD. \software{GSNAP} offers |
|
80 |
+the following functionality, as mentioned in |
|
81 |
+\href{http://bioinformatics.oxfordjournals.org/content/26/7/873.full}{Fast |
|
82 |
+ and SNP-tolerant detection of complex variants and splicing in short |
|
83 |
+ reads} by Thomas D. Wu and Serban Nacu: |
|
84 |
+ |
|
85 |
+\begin{itemize} |
|
86 |
+ |
|
87 |
+ \item fast detection of complex variants and splicing in short |
|
88 |
+ reads, based on a successively constrained search process of |
|
89 |
+ merging and filtering position lists from a genomic index |
|
90 |
+ |
|
91 |
+ \item alignment of both single- and paired-end reads as short as 14 |
|
92 |
+ nt and of arbitrarily long length |
|
93 |
+ |
|
94 |
+ \item detection of short- and long-distance splicing, including |
|
95 |
+ interchromosomal splicing, in individual reads, using |
|
96 |
+ probabilistic models or a database of known splice sites |
|
97 |
+ |
|
98 |
+ \item SNP-tolerant alignment to a reference space of all possible |
|
99 |
+ combinations of major and minor alleles |
|
100 |
+ |
|
101 |
+ \item alignment of reads from bisulfite-treated DNA for the study of |
|
102 |
+ methylation state |
|
103 |
+ |
|
104 |
+\end{itemize} |
|
105 |
+ |
|
106 |
+\software{bam\_tally} provides users with the coverage as well as |
|
107 |
+counts of both reference alleles, alternative alleles, and indels over |
|
108 |
+a genomic region. |
|
109 |
+ |
|
110 |
+For more detailed information on the \software{GMAP} suite of tools |
|
111 |
+including a detailed explication of algorithmic specifics, see |
|
112 |
+\url{http://research-pub.gene.com/gmap/}. |
|
113 |
+ |
|
114 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
115 |
+\section{Create a \Rclass{GmapGenome} Object} |
|
116 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
117 |
+To align reads with \software{GMAP} or \software{GSNAP}, or to use |
|
118 |
+\software{bam\_tally}, you will need to either obtain or create a |
|
119 |
+\Rclass{GmapGenome} object. \Rclass{GmapGenome} objects can be created |
|
120 |
+from FASTA files or \Rclass{BSgenome} objects, as the following |
|
121 |
+example demonstrates: |
|
122 |
+ |
|
123 |
+<<create_GmapGenome, eval=FALSE>>= |
|
124 |
+library(gmapR) |
|
125 |
+ |
|
126 |
+if (!suppressWarnings(require(BSgenome.Dmelanogaster.UCSC.dm3))) { |
|
127 |
+ source("http://bioconductor.org/biocLite.R") |
|
128 |
+ biocLite("BSgenome.Dmelanogaster.UCSC.dm3") |
|
129 |
+ library(BSgenome.Dmelanogaster.UCSC.dm3) |
|
130 |
+} |
|
131 |
+ |
|
132 |
+gmapGenomePath <- file.path(getwd(), "testGenome") |
|
133 |
+gmapGenomeDirectory <- GmapGenomeDirectory(gmapGenomePath, create = TRUE) |
|
134 |
+##> gmapGenomeDirectory |
|
135 |
+##GmapGenomeDirectory object |
|
136 |
+##path: /reshpcfs/home/coryba/projects/gmapR2/testGenome |
|
137 |
+ |
|
138 |
+gmapGenome <- GmapGenome(genome=Dmelanogaster, |
|
139 |
+ directory = gmapGenomeDirectory, |
|
140 |
+ name = "dm3", |
|
141 |
+ create = TRUE) |
|
142 |
+##> gmapGenome |
|
143 |
+##GmapGenome object |
|
144 |
+##genome: dm3 |
|
145 |
+##directory: /reshpcfs/home/coryba/projects/gmapR2/testGenome |
|
146 |
+@ |
|
147 |
+ |
|
148 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
149 |
+\section{Aligning with \software{GSNAP}} |
|
150 |
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
|
151 |
+ |
|
152 |
+The \software{GSNAP} algorithm incorporates biological knowledge to |
|
153 |
+provide accurate alignments, particularly for RNA-seq data. The |
|
154 |
+following example synthesizes some input reads and aligns them using |
|
155 |
+\software{GSNAP}. |
|
156 |
+ |
|
157 |
+If you've created the D. melanogaster \Rclass{GmapGenome} package as detailed above and installed |
|
158 |
+it, you can begin this section as follows: |
|
159 |
+ |
|
160 |
+<<<load_GmapGenome, eval=FALSE>>= |
|
161 |
+library("GmapGenome.Dmelanogaster.UCSC.dm3") |
|
162 |
+gmapGenome <- GmapGenome.Dmelanogaster.UCSC.dm3 |
|
163 |
+@ |
|
164 |
+ |
|
165 |
+Otherwise, you can create the \Rcode{gmapGenome} object as detailed in the |
|
166 |
+initial section of this vignette. |
|
167 |
+ |
|
168 |
+<<align_with_gsnap, eval=FALSE>>= |
|
169 |
+##synthesize paired-end FASTA input to align against the genome |
|
170 |
+library("BSgenome.Dmelanogaster.UCSC.dm3") |
|
171 |
+dnaStringSet <- Dmelanogaster$chr4 |
|
172 |
+makeTestFasta <- function(seqStarts, pairNum) { |
|
173 |
+ seqEnds <- seqStarts + 74 |
|
174 |
+ nucleotides <- sapply(seq_along(seqStarts), |
|
175 |
+ function(i) as.character(subseq(dnaStringSet, seqStarts[i], seqEnds[i])) |
|
176 |
+ ) |
|
177 |
+ fastaHeaders <- paste0(">", "fakeRead", seq_along(nucleotides), "#0/", pairNum) |
|
178 |
+ fastaLines <- paste(fastaHeaders, nucleotides, sep="\n") |
|
179 |
+ tmpFasta <- paste0(tempfile(), ".", pairNum, ".fa") |
|
180 |
+ writeLines(fastaLines, con=tmpFasta) |
|
181 |
+ tmpFasta |
|
182 |
+} |
|
183 |
+seqStarts <- seq(1, 100000, by = 10000) + length(dnaStringSet) * 2 / 3 |
|
184 |
+tmpFasta1 <- makeTestFasta(seqStarts, '1') |
|
185 |
+tmpFasta2 <- makeTestFasta(seqStarts + 125, '2') |
|
186 |
+ |
|
187 |
+##specify how GSNAP should behave using a GsnapParam object |
|
188 |
+gsnapParam <- GsnapParam(genome = gmapGenome, |
|
189 |
+ unique_only = FALSE, |
|
190 |
+ max_mismatches = NULL, |
|
191 |
+ suboptimal_levels = 0L, mode = "standard", |
|
192 |
+ npaths = 10L, |
|
193 |
+ novelsplicing = FALSE, splicing = NULL, |
|
194 |
+ nthreads = 1L, |
|
195 |
+ batch = "2") |
|
196 |
+ |
|
197 |
+gsnapOutput <- gsnap(input_a=tmpFasta1, |
|
198 |
+ input_b=tmpFasta2, |
|
199 |
+ params=gsnapParam) |
|
200 |
+##gsnapOutput |
|
201 |
+##An object of class "GsnapOutput" |
|
202 |
+##Slot "path": |
|
203 |
+##[1] "/local/Rtmporwsvr" |
|
204 |
+## |
|
205 |
+##Slot "param": |
|
206 |
+##A GsnapParams object |
|
207 |
+##genome: dm3 (/reshpcfs/home/coryba/projects/gmapR2/testGenome) |
|
208 |
+##part: NULL |
|
209 |
+##batch: 2 |
|
210 |
+##max_mismatches: NULL |
|
211 |
+##suboptimal_levels: 0 |
|
212 |
+##snps: NULL |
|
213 |
+##mode: standard |
|
214 |
+##nthreads: 1 |
|
215 |
+##novelsplicing: FALSE |
|
216 |
+##splicing: NULL |
|
217 |
+##npaths: 10 |
|
218 |
+##quiet_if_excessive: FALSE |
|
219 |
+##nofails: FALSE |
|
220 |
+##split_output: TRUE |
|
221 |
+##extra: list() |
|
222 |
+## |
|
223 |
+##Slot "version": |
|
224 |
+## [1] NA NA NA NA NA NA NA NA NA NA NA NA |
|
225 |
+## |
|
226 |
+@ |
|
227 |
+ |
|
228 |
+The \Rcode{gsnapOutput} object will be of the class |
|
229 |
+\Rclass{GsnapOutput}. It will provide access to the BAM files of |
|
230 |
+alignments that \software{GSNAP} outputs along with other utilities. |
|
231 |
+ |
|