git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@57248 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,18 @@ |
1 |
+Package: gmapR |
|
2 |
+Maintainer: <barr.cory@gene.com> |
|
3 |
+License: Artistic-2.0 |
|
4 |
+Title: Provides convenience methods to work with GMAP and GSNAP from |
|
5 |
+ within R |
|
6 |
+Type: Package |
|
7 |
+LazyLoad: yes |
|
8 |
+Author: Cory Barr |
|
9 |
+Description: GSNAP and GMAP are a pair of tools to align short-read |
|
10 |
+ data written by Tom Wu. This package provides convenience methods |
|
11 |
+ to work with GMAP and GSNAP from within R |
|
12 |
+Version: 1.0 |
|
13 |
+Date: 2011-08-01 |
|
14 |
+Depends: R (>= 2.14.0), HTSeqGenieBase (>= 1.0.22) |
|
15 |
+Collate: 'buildGmapDbSNPIndex.R' 'buildGmapGenomeIndex.R' 'buildGmap.R' |
|
16 |
+ 'buildGsnapSNPIIT.R' 'consolidateGmapFiles.R' |
|
17 |
+ 'consolidateGsnapFiles.R' 'getGsnapFileCounts.R' |
|
18 |
+ 'parallelized_gsnap.R' |
0 | 7 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,38 @@ |
1 |
+##' Downloads the GMAP source code, builds it, then installs it to a specified directory |
|
2 |
+##' |
|
3 |
+##' |
|
4 |
+##' @title Download and Install GMAP |
|
5 |
+##' @param install_dir directory where GMAP will be installed |
|
6 |
+##' @return 0 |
|
7 |
+##' @author Cory Barr |
|
8 |
+##' @export |
|
9 |
+installGmap <- function(install_dir) { |
|
10 |
+ |
|
11 |
+ install_dir <- file_path_as_absolute(install_dir) |
|
12 |
+ |
|
13 |
+ temp_dir <- 'temp' |
|
14 |
+ if(file.exists(temp_dir)) |
|
15 |
+ unlink(temp_dir, recursive=TRUE) |
|
16 |
+ dir.create(temp_dir) |
|
17 |
+ setwd(temp_dir) |
|
18 |
+ |
|
19 |
+ system('wget http://research-pub.gene.com/gmap/src/gmap-gsnap-2011-03-28.tar.gz') |
|
20 |
+ system('tar xzvf gmap-gsnap-2011-03-28.tar.gz') |
|
21 |
+ orig_dir <- getwd() |
|
22 |
+ on.exit(setwd(orig_dir)) |
|
23 |
+ setwd('gmap-2011-03-28/') |
|
24 |
+ |
|
25 |
+ gsnap_dir <- file.path(install_dir, "gmap") |
|
26 |
+ gsnap_genomes_dir <- file.path(install_dir, "gmap", "genomes") |
|
27 |
+ dir.create(gsnap_genomes_dir, recursive=TRUE) |
|
28 |
+ |
|
29 |
+ sys_call <- paste("./configure", |
|
30 |
+ paste("prefix=", gsnap_dir, sep=''), |
|
31 |
+ paste("with_gmapdb=", gsnap_genomes_dir, sep='')) |
|
32 |
+ system(sys_call) |
|
33 |
+ system('make') |
|
34 |
+ system('make install') |
|
35 |
+ |
|
36 |
+ unlink(temp_dir, recursive=TRUE) |
|
37 |
+ return(0) |
|
38 |
+} |
0 | 39 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,71 @@ |
1 |
+##' Downloads a version of dbSNP and integrates it into an installation of GMAP/GSNAP for SNP-tolerant alignment |
|
2 |
+##' |
|
3 |
+##' .. content for \details{} .. |
|
4 |
+##' @title Download and Integrate dbSNP into GMAP/GSNAP |
|
5 |
+##' @param version version of dbSNP to obtain |
|
6 |
+##' @return 0 |
|
7 |
+##' @author Cory Barr |
|
8 |
+##' @export |
|
9 |
+buildGmapDbSNPIndex <- function(version) { |
|
10 |
+ |
|
11 |
+ if(version != 'snp131') |
|
12 |
+ stop("only supports dbSNP version snp131 at present") |
|
13 |
+ |
|
14 |
+ tmp_dir <- 'snp.tmp' |
|
15 |
+ if(file.exists(tmp_dir)) |
|
16 |
+ unlink(tmp_dir, recursive=TRUE) |
|
17 |
+ dir.create(tmp_dir) |
|
18 |
+ |
|
19 |
+ start_dir <- file_path_as_absolute(getwd()) |
|
20 |
+ setwd(tmp_dir) |
|
21 |
+ system('wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp131.txt.gz') |
|
22 |
+ system('gunzip snp131.txt.gz') |
|
23 |
+ |
|
24 |
+ ##all entries must be in this format: >rs62211261 21:14379270..14379270 CG |
|
25 |
+ converted_script <- system.file("scripts/parse_dbsnp.pl", package="gmapR") |
|
26 |
+ sys_call <- paste("cat", |
|
27 |
+ "snp131.txt", |
|
28 |
+ "| perl", |
|
29 |
+ converted_script, |
|
30 |
+ ">", |
|
31 |
+ "snp131.converted") |
|
32 |
+ system(sys_call) |
|
33 |
+ |
|
34 |
+ ##cat snp131.converted | iit_store -o snp131 |
|
35 |
+ iit_store <- file.path(start_dir, |
|
36 |
+ "gmap/bin/iit_store") |
|
37 |
+ sys_call <- paste("cat", |
|
38 |
+ "snp131.converted", |
|
39 |
+ "|", |
|
40 |
+ iit_store, |
|
41 |
+ "-o", |
|
42 |
+ "snp131") |
|
43 |
+ system(sys_call) |
|
44 |
+ |
|
45 |
+ ##cp -f snp131.iit /gne/home/coryba/tools/gsnap/share/hg19_ucsc/hg19_ucsc.maps |
|
46 |
+ sys_call <- paste("cp -f", |
|
47 |
+ "snp131.iit", |
|
48 |
+ file.path(start_dir, |
|
49 |
+ "gmap", |
|
50 |
+ "genomes", |
|
51 |
+ "hg19", |
|
52 |
+ "hg19.maps")) |
|
53 |
+ res <- system(sys_call, intern=TRUE) |
|
54 |
+ |
|
55 |
+ ##snpindex -d hg19_ucsc -V /gne/home/coryba/tools/gsnap/share/hg19_ucsc -v snp131 snp131.iit |
|
56 |
+ snpindex <- file.path(start_dir, "gmap/bin/snpindex") |
|
57 |
+ sys_call <- paste(snpindex, |
|
58 |
+ "-d", |
|
59 |
+ "hg19", |
|
60 |
+ "-V", |
|
61 |
+ file.path(start_dir, |
|
62 |
+ "gmap/genomes", |
|
63 |
+ "hg19"), |
|
64 |
+ "-v snp131 snp131.iit") |
|
65 |
+ res <- system(sys_call, intern=TRUE) |
|
66 |
+ |
|
67 |
+ setwd(start_dir) |
|
68 |
+ unlink(tmp_dir, recursive=TRUE) |
|
69 |
+ |
|
70 |
+ return(0) |
|
71 |
+} |
0 | 72 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,74 @@ |
1 |
+##' Gsnap index of hg19 (with haplotypes and unassembled regions excluded) |
|
2 |
+##' |
|
3 |
+##'Incorporate a reference genome into gmap/gsnap by downloading a |
|
4 |
+##' genome and constructing an IIT. Currently only hg19 is supported. |
|
5 |
+##' @title Create Gmap Reference Genome Index |
|
6 |
+##' @param genome |
|
7 |
+##' @return 0 |
|
8 |
+##' @author Cory Barr |
|
9 |
+##' @export |
|
10 |
+buildGmapIndex <- function(genome) { |
|
11 |
+ |
|
12 |
+ retriveGenomeFasta <- function(genome) { |
|
13 |
+ if(genome != 'hg19') |
|
14 |
+ stop("only hg19 supported at the moment") |
|
15 |
+ |
|
16 |
+ tmp_dir <- tempdir() |
|
17 |
+ if(file.exists(tmp_dir)) |
|
18 |
+ unlink(tmp_dir, recursive=TRUE) |
|
19 |
+ dir.create(tmp_dir) |
|
20 |
+ |
|
21 |
+ on.exit(setwd(file_path_as_absolute(getwd()))) |
|
22 |
+ setwd(tmp_dir) |
|
23 |
+ sys_command <- paste('wget http://hgdownload.cse.ucsc.edu/goldenPath/', |
|
24 |
+ genome, |
|
25 |
+ '/bigZips/chromFa.tar.gz', |
|
26 |
+ sep='') |
|
27 |
+ system(sys_command) |
|
28 |
+ system('tar xzf chromFa.tar.gz') |
|
29 |
+ |
|
30 |
+ genome_fasta_file <- paste(genome, 'fa', sep='.') |
|
31 |
+ ##excluding haplotypes, cat genome seqs together into one fasta: |
|
32 |
+ sys_call <- paste('ls *fa | grep -v "_gl" | grep -v "_hap" | xargs -i cat {} > ', |
|
33 |
+ genome_fasta_file, |
|
34 |
+ sep='') |
|
35 |
+ system(sys_call) |
|
36 |
+ |
|
37 |
+ file_path_as_absolute(genome_fasta_file) |
|
38 |
+ } |
|
39 |
+ |
|
40 |
+ genome_fa <- retrieveGenomeFasta(genome) |
|
41 |
+ |
|
42 |
+ buildGmapIITFromFasta <- function(genome, fasta) { |
|
43 |
+ |
|
44 |
+ ##currently, hg19 for the Genie pipelines is names hg19_ucsc to |
|
45 |
+ ##distinguish origin. This will be phased out |
|
46 |
+ genome <- 'hg19_ucsc' |
|
47 |
+ |
|
48 |
+ gmap_setup <- file.path(globals()['gmap_bin'], |
|
49 |
+ "gmap_setup") |
|
50 |
+ sys_command <- paste(gmap_setup, |
|
51 |
+ "-d", |
|
52 |
+ genome, |
|
53 |
+ fasta) |
|
54 |
+ system(sys_command) |
|
55 |
+ |
|
56 |
+ sys_command <- paste("make -f", |
|
57 |
+ paste('Makefile.', genome, sep=''), |
|
58 |
+ "coords") |
|
59 |
+ system(sys_command) |
|
60 |
+ |
|
61 |
+ sys_command <- paste("make -f", |
|
62 |
+ paste('Makefile.', genome, sep=''), |
|
63 |
+ "gmapdb") |
|
64 |
+ system(sys_command) |
|
65 |
+ |
|
66 |
+ sys_command <- paste("make -f", |
|
67 |
+ paste('Makefile.', genome, sep=''), |
|
68 |
+ "install") |
|
69 |
+ system(sys_command) |
|
70 |
+ } |
|
71 |
+ |
|
72 |
+ buildGmapIITFromFasta(genome, genome_fa) |
|
73 |
+ return(0) |
|
74 |
+} |
0 | 75 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,86 @@ |
1 |
+##TODO: put in gsnap package |
|
2 |
+ |
|
3 |
+##' This function helps create the files needed for SNP-tolerant gsnap |
|
4 |
+##' alignment. |
|
5 |
+##' |
|
6 |
+##' @title Build SNP-tolerant gsnap alignment |
|
7 |
+##' @param index_file The fasta-like file containing the SNPs |
|
8 |
+##' @param gsnap_bin location of the gsnap executable. Defaults to |
|
9 |
+##' location at gsnap:::globals()$gsnap |
|
10 |
+##' @param save_dir location of the default gmap directory. Defaults |
|
11 |
+##' to location at gsnap:::globals()$gsnap_save_dir |
|
12 |
+##' @param gsnap_genome_dir name of the genome, which corresponds to |
|
13 |
+##' the subdirectory of save_dir. The IIT files go here. Typical |
|
14 |
+##' examples are "hg19" and "mm9" |
|
15 |
+##' @param snp_file This argument will specify the name of the |
|
16 |
+##' resulting final IIT of SNPs and consequently the name of the |
|
17 |
+##' database specified via gsnap's -v option. For example, if snp_file |
|
18 |
+##' is db131, then gsnap will align via gsnap -d genome -v db131 |
|
19 |
+##' input.fastq |
|
20 |
+##' @return 0 |
|
21 |
+##` @author Cory Barr |
|
22 |
+##` rdname buildGsnapSNPIIT |
|
23 |
+ |
|
24 |
+buildGsnapSNPIIT <- function(index_file, |
|
25 |
+ gsnap_bin, |
|
26 |
+ save_dir, |
|
27 |
+ gsnap_genome_dir, |
|
28 |
+ snp_file) { |
|
29 |
+ |
|
30 |
+ if(!file.exists(index_file)) |
|
31 |
+ stop(paste("Could not find the file", index_file)) |
|
32 |
+ |
|
33 |
+ if (missing(gsnap_bin)) |
|
34 |
+ gsnap_bin <- globals()$gsnap |
|
35 |
+ if (missing(save_dir)) |
|
36 |
+ save_dir <- globals()$gsnap_save_dir |
|
37 |
+ |
|
38 |
+ full_save_dir <- file.path(save_dir, gsnap_genome_dir) |
|
39 |
+ |
|
40 |
+ if(!file.exists(full_save_dir)) |
|
41 |
+ stop(paste("Error. The directory", full_save_dir, "does not exist.")) |
|
42 |
+ |
|
43 |
+ tmp_dir <- paste("tmp.gsnap_index_creation.", |
|
44 |
+ ceiling(runif(1) * 10000000), |
|
45 |
+ sep="") |
|
46 |
+ |
|
47 |
+ dir.create(tmp_dir) |
|
48 |
+ |
|
49 |
+ iit_store <- file.path(globals()$gsnap_bin_dir, "iit_store") |
|
50 |
+ |
|
51 |
+ if(missing(snp_file)) { |
|
52 |
+ snp_file <- gsub("^.*/", "", index_file, perl=T) |
|
53 |
+ } |
|
54 |
+ snp_file <- file.path(tmp_dir, snp_file) |
|
55 |
+ iit_file <- paste(snp_file, ".iit", sep="") |
|
56 |
+ |
|
57 |
+ ##ex: cat snp131.converted | iit_store -o snp131 |
|
58 |
+ sys_call <- paste("cat", index_file, "|", iit_store, "-o", iit_file) |
|
59 |
+ system(sys_call) |
|
60 |
+ |
|
61 |
+ maps_dir <- paste(globals()$gsnap_save_dir, |
|
62 |
+ "/", |
|
63 |
+ gsnap_genome_dir, |
|
64 |
+ "/", |
|
65 |
+ gsnap_genome_dir, |
|
66 |
+ ".maps", |
|
67 |
+ sep="") |
|
68 |
+ ##ex:cp -f snp131.iit /gne/home/coryba/tools/gsnap/share/hg19_ucsc/hg19_ucsc.maps |
|
69 |
+ sys_call <- paste("cp -f", iit_file, maps_dir) |
|
70 |
+ system(sys_call) |
|
71 |
+ |
|
72 |
+ snpindex <- file.path(globals()$gsnap_bin, "snpindex") |
|
73 |
+ gsnap_genome_path <- file.path(globals()$gsnap_save_dir, gsnap_genome_dir) |
|
74 |
+ ##ex:snpindex -d hg19_ucsc -V /gne/home/coryba/tools/gsnap/share/hg19_ucsc -v snp131 snp131.iit |
|
75 |
+ sys_call <- paste(snpindex, |
|
76 |
+ "-d", |
|
77 |
+ gsnap_genome_dir, |
|
78 |
+ "-V", |
|
79 |
+ gsnap_genome_path, |
|
80 |
+ "-v", |
|
81 |
+ strsplit(snp_file, "/")[[1]][2], |
|
82 |
+ paste(snp_file, ".iit", sep="")) |
|
83 |
+ system(sys_call) |
|
84 |
+ unlink(tmp_dir, recursive=TRUE) |
|
85 |
+ return(0) |
|
86 |
+} |
0 | 87 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,63 @@ |
1 |
+##' Consolidates all pieces from parallelized gsnap output into |
|
2 |
+##' appropriately merged files |
|
3 |
+##' |
|
4 |
+##' If gmap was run in single-end mode, this will consolidate all |
|
5 |
+##' parallelized into the 3 output files. If gmap was run in |
|
6 |
+##' paired-end mode, consolidates to the 7 output files. |
|
7 |
+##' @title Consolidate gmap's output files |
|
8 |
+##' @param sam_file_dir directory where gmap's sam file output is stored |
|
9 |
+##' @param paired_end indicated whether gmap was run in paired_end mode |
|
10 |
+##' @param remove_merged remove the individual pieces after they are merged |
|
11 |
+##' @return list of the names of the files created from consolidation |
|
12 |
+##' @author Cory Barr |
|
13 |
+##' @export |
|
14 |
+consolidateGmapFiles <- function(sam_file_dir, |
|
15 |
+ paired_end=FALSE, |
|
16 |
+ remove_merged=FALSE) { |
|
17 |
+ |
|
18 |
+ if (! file.exists(sam_file_dir)) { |
|
19 |
+ stop(paste("Could not find the directory", sam_file_dir)) |
|
20 |
+ } else { |
|
21 |
+ sam_file_dir <- file_path_as_absolute(sam_file_dir) |
|
22 |
+ } |
|
23 |
+ |
|
24 |
+ return_list <- list() |
|
25 |
+ |
|
26 |
+ dir_files <- dir(sam_file_dir) |
|
27 |
+ |
|
28 |
+ no_mapping_files <- dir_files[grep("gmap_out.*nomapping$", |
|
29 |
+ dir_files, |
|
30 |
+ perl=T)] |
|
31 |
+ no_mapping_files <- paste(sam_file_dir, no_mapping_files, sep="/") |
|
32 |
+ consolidated_file <- consolidateSAMFiles(no_mapping_files, |
|
33 |
+ "gmap.merged.no_mapping.bam", |
|
34 |
+ remove_merged=remove_merged) |
|
35 |
+ return_list$no_mapping <- consolidated_file |
|
36 |
+ |
|
37 |
+ unpaired_uniq_files <- dir_files[grep("gmap_out.*uniq$", |
|
38 |
+ dir_files, |
|
39 |
+ perl=T)] |
|
40 |
+ unpaired_uniq_files <- paste(sam_file_dir, unpaired_uniq_files, sep="/") |
|
41 |
+ consolidated_file <- consolidateSAMFiles(unpaired_uniq_files, |
|
42 |
+ "gmap.merged.uniq.bam", |
|
43 |
+ remove_merged=remove_merged) |
|
44 |
+ return_list$unpaired_uniq <- consolidated_file |
|
45 |
+ |
|
46 |
+ unpaired_mult_files <- dir_files[grep("gmap_out.*mult$", |
|
47 |
+ dir_files, |
|
48 |
+ perl=T)] |
|
49 |
+ unpaired_mult_files <- paste(sam_file_dir, unpaired_mult_files, sep="/") |
|
50 |
+ consolidated_file <- consolidateSAMFiles(unpaired_mult_files, |
|
51 |
+ "gmap.merged.mult.bam", |
|
52 |
+ remove_merged=remove_merged) |
|
53 |
+ return_list$unpaired_mult <- consolidated_file |
|
54 |
+ |
|
55 |
+ |
|
56 |
+ |
|
57 |
+ ##paired-end output adds an additional 4 files to those above |
|
58 |
+ if (paired_end) { |
|
59 |
+ ## not functional for 454 Gmap yet |
|
60 |
+ } |
|
61 |
+ |
|
62 |
+ return(return_list) |
|
63 |
+} |
0 | 64 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,63 @@ |
1 |
+##' Consolidates all pieces from parallelized gsnap output into |
|
2 |
+##' appropriately merged files |
|
3 |
+##' |
|
4 |
+##' If gsnap was run in single-end mode, this will consolidate all |
|
5 |
+##' parallelized into the 3 output files. If gsnap was run in |
|
6 |
+##' paired-end mode, consolidates to the 7 output files. |
|
7 |
+##' @title Consolidate gsnap's parallelized output files |
|
8 |
+##' @param sam_file_dir directory where gsnap's sam file output is stored |
|
9 |
+##' @param remove_merged remove the individual pieces after they are merged |
|
10 |
+##' @return list of the names of the files created from consolidation |
|
11 |
+##' @author Cory Barr |
|
12 |
+consolidateGsnapFiles <- function(sam_file_dir, |
|
13 |
+ remove_merged=FALSE, |
|
14 |
+ parallelized=TRUE) { |
|
15 |
+ |
|
16 |
+ if (! file.exists(sam_file_dir)) { |
|
17 |
+ stop(paste("Could not find the directory", sam_file_dir)) |
|
18 |
+ } else { |
|
19 |
+ sam_file_dir <- file_path_as_absolute(sam_file_dir) |
|
20 |
+ } |
|
21 |
+ |
|
22 |
+ return_list <- list() |
|
23 |
+ |
|
24 |
+ dir_files <- dir(sam_file_dir, full.names=TRUE) |
|
25 |
+ |
|
26 |
+ .consolidate <- function(mapping_class, |
|
27 |
+ remove_merged, |
|
28 |
+ dir_files) { |
|
29 |
+ |
|
30 |
+ base_names <- basename(dir_files) |
|
31 |
+ |
|
32 |
+ unconsolidated_files <- dir_files[grep(paste("^gsnap_out\\.*", |
|
33 |
+ "\\d+\\.", |
|
34 |
+ mapping_class, |
|
35 |
+ "$", |
|
36 |
+ sep=''), |
|
37 |
+ base_names, |
|
38 |
+ perl=TRUE)] |
|
39 |
+ |
|
40 |
+ consolidated_file <- consolidateSAMFiles(sam_files=unconsolidated_files, |
|
41 |
+ outfile=paste( |
|
42 |
+ "gsnap.merged", |
|
43 |
+ mapping_class, |
|
44 |
+ "bam", |
|
45 |
+ sep='.'), |
|
46 |
+ remove_merged=remove_merged) |
|
47 |
+ } |
|
48 |
+ |
|
49 |
+ mapping_classes <- basename(dir_files) |
|
50 |
+ mapping_classes <- mapping_classes[grep("gsnap_out\\.", mapping_classes)] |
|
51 |
+ mapping_classes <- unique(sub("^gsnap_out\\.\\d+\\.", "", mapping_classes)) |
|
52 |
+ |
|
53 |
+ apply_func <- lapply |
|
54 |
+ if(parallelized) |
|
55 |
+ apply_func <- mclapply |
|
56 |
+ |
|
57 |
+ merged_files <- lapply(mapping_classes, |
|
58 |
+ .consolidate, |
|
59 |
+ remove_merged=remove_merged, |
|
60 |
+ dir_files=dir_files) |
|
61 |
+ names(merged_files) <- mapping_classes |
|
62 |
+ return(merged_files) |
|
63 |
+} |
0 | 64 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,65 @@ |
1 |
+##' Return number of reads in each of gsnap's output files |
|
2 |
+##' |
|
3 |
+##' With the --split-output argument, gsnap can output multiple |
|
4 |
+##' files. If these files are BAM files, this function counts the |
|
5 |
+##' number of reads in each file. |
|
6 |
+##' @title Return number of reads in each of gsnap's output files |
|
7 |
+##' @param aligner_dir Directory containing output bam files from gsnap |
|
8 |
+##' @param parallelize Boolean indicating if counting should be done in parallel |
|
9 |
+##' @return named list of read counts per gsnap output file |
|
10 |
+##' @author Cory Barr |
|
11 |
+##' @export |
|
12 |
+getGsnapFileCounts <- function(aligner_dir, parallelize=TRUE) { |
|
13 |
+ |
|
14 |
+ if(!file.exists(aligner_dir)) |
|
15 |
+ stop("aligner_dir does not exist") |
|
16 |
+ |
|
17 |
+ ##have to count gsnap output by chr, or can get overflow errors |
|
18 |
+ .getNumUniqueReadsInBam <- function(bam_file) { |
|
19 |
+ chr_granges <- getSeqlensFromBAM(bam_file) |
|
20 |
+ .getUniqueReadsByChr <- function(index) { |
|
21 |
+ sb_param <- |
|
22 |
+ ScanBamParam(what=c("qname"), |
|
23 |
+ which=chr_granges[index]) |
|
24 |
+ res <- scanBam(bam_file, param=sb_param)[[1]] |
|
25 |
+ unique(res$qname) |
|
26 |
+ } |
|
27 |
+ |
|
28 |
+ apply_func <- lapply |
|
29 |
+ if (parallelize) apply_func <- mclapply |
|
30 |
+ |
|
31 |
+ res <- unlist(apply_func(seq_len(length(chr_granges)), |
|
32 |
+ .getUniqueReadsByChr)) |
|
33 |
+ length(unique(res)) |
|
34 |
+ } |
|
35 |
+ |
|
36 |
+ bam_files <- dir(aligner_dir, pattern="\\.bam$", full.names=TRUE) |
|
37 |
+ |
|
38 |
+ nomapping_index <- grep("\\.no_?mapping\\.bam$", bam_files) |
|
39 |
+ mult_mapping_index <- grep("_mult[_\\.]", bam_files) |
|
40 |
+ uniq_mapping_index <- seq_len(length(bam_files))[-c(mult_mapping_index, nomapping_index)] |
|
41 |
+ nomapping_bam <- bam_files[nomapping_index] |
|
42 |
+ mult_mapping_bams <- bam_files[mult_mapping_index] |
|
43 |
+ uniq_mapping_bams <- bam_files[uniq_mapping_index] |
|
44 |
+ |
|
45 |
+ named_names <- bam_files[-nomapping_index] |
|
46 |
+ names(named_names) <- named_names |
|
47 |
+ gsnap_counts <- lapply(named_names, |
|
48 |
+ .getNumUniqueReadsInBam) |
|
49 |
+ |
|
50 |
+ names(gsnap_counts) <- sub("^.*\\.gsnap\\.merged\\.", "", names(gsnap_counts)) |
|
51 |
+ names(gsnap_counts) <- sub("\\.bam$", "", names(gsnap_counts)) |
|
52 |
+ |
|
53 |
+ ##handling the 'nomapping' file differently, since nothing this (so |
|
54 |
+ ##nothing is returned if scanBam is given a ScanBamParam which arg |
|
55 |
+ nomapper_ids <- unlist(scanBam(nomapping_bam, |
|
56 |
+ param=ScanBamParam(what=c("qname")))[[1]], |
|
57 |
+ use.names=FALSE) |
|
58 |
+ num_nomappers <- sum(!duplicated(nomapper_ids)) |
|
59 |
+ gsnap_counts[['nomapping']] <- num_nomappers |
|
60 |
+ |
|
61 |
+ names(gsnap_counts) <- paste("gsnap", names(gsnap_counts), sep=".") |
|
62 |
+ names(gsnap_counts) <- paste(names(gsnap_counts), "_reads", sep="") |
|
63 |
+ |
|
64 |
+ return(gsnap_counts) |
|
65 |
+} |
0 | 66 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,177 @@ |
1 |
+##' This function parallelizes gsnap across a cluster. |
|
2 |
+##' |
|
3 |
+##' @title Parallelized gsnap |
|
4 |
+##' @param num_machines number of cluster machines to run gsnap across |
|
5 |
+##' @param procs_per_machine number of gsnap jobs to run on each cluster node |
|
6 |
+##' @param gsnap location of gsnap executable |
|
7 |
+##' @param gsnap_params a string of the parameters to pass to gsnap. Note "--split-output" is not allowed. |
|
8 |
+##' @param input_file location of the input fastq to align |
|
9 |
+##' @param input_file_2 location of the second input fastq to align (for paired ends) |
|
10 |
+##' @param paired_end logical indicating if gsnap should be run in paired-end mode |
|
11 |
+##' @param output_dir directory for gsnap to write its output |
|
12 |
+##' @param test_mode used for debugging. turns off parallelization. |
|
13 |
+##' @param intern indicates whether to have STDOUT the return |
|
14 |
+##' value. identical to 'intern' argument to the system() function |
|
15 |
+##' @param multifile_out tells gsnap to write to 3 output files for |
|
16 |
+##' single-end data, or 7 output files for paired-end data |
|
17 |
+##' @return a list of the return values of the system function for |
|
18 |
+##' each parallelized piece |
|
19 |
+##' @author Cory Barr |
|
20 |
+##' @export |
|
21 |
+parallelized_gsnap <- function(num_machines, |
|
22 |
+ procs_per_machine, |
|
23 |
+ gsnap, |
|
24 |
+ gsnap_params, |
|
25 |
+ input_file, |
|
26 |
+ input_file_2=NULL, |
|
27 |
+ paired_end=F, |
|
28 |
+ output_dir, |
|
29 |
+ test_mode=FALSE, |
|
30 |
+ intern=FALSE, |
|
31 |
+ multifile_out=TRUE, |
|
32 |
+ record_sys_call_dir=NA) { |
|
33 |
+ |
|
34 |
+ if (length(grep("split-output ", gsnap_params) > 0)) |
|
35 |
+ stop("This function cannot handle gsnap parameters with the --split-output switch enabled") |
|
36 |
+ |
|
37 |
+ if(procs_per_machine == 1) { |
|
38 |
+ apply_func <- lapply |
|
39 |
+ } else { |
|
40 |
+ apply_func <- mclapply |
|
41 |
+ } |
|
42 |
+ |
|
43 |
+ .gsnap_per_machine <- function(i, |
|
44 |
+ procs_per_machine, |
|
45 |
+ gsnap, |
|
46 |
+ gsnap_params, |
|
47 |
+ paired_end, |
|
48 |
+ input_file, |
|
49 |
+ input_file_2, |
|
50 |
+ output_dir, |
|
51 |
+ total_gsnap_parts, |
|
52 |
+ intern, |
|
53 |
+ multifile_out, |
|
54 |
+ record_sys_call_dir) { |
|
55 |
+ |
|
56 |
+ ##library calls needed so slaves have access |
|
57 |
+ library("multicore") |
|
58 |
+ library("HTSeqGenieBase") |
|
59 |
+ |
|
60 |
+ gsnap_parts <- |
|
61 |
+ (i - 1) * procs_per_machine + (seq_len(procs_per_machine) - 1) |
|
62 |
+ gsnap_part_params <- paste("--part=", |
|
63 |
+ gsnap_parts, |
|
64 |
+ "/", |
|
65 |
+ total_gsnap_parts, |
|
66 |
+ sep="") |
|
67 |
+ |
|
68 |
+ if (paired_end) |
|
69 |
+ gsnap_params <- paste(gsnap_params, "-a paired") |
|
70 |
+ |
|
71 |
+ ##make the prefix of gsnaps 'split-output' outs be unique |
|
72 |
+ if (multifile_out) { |
|
73 |
+ gsnap_multi_out_params <- paste("gsnap_out.", gsnap_parts, sep="") |
|
74 |
+ gsnap_multi_out_params <- paste(output_dir, gsnap_multi_out_params, sep="/") |
|
75 |
+ sys_commands <- paste(gsnap, |
|
76 |
+ gsnap_params, |
|
77 |
+ gsnap_part_params, |
|
78 |
+ "--split-output", gsnap_multi_out_params, |
|
79 |
+ input_file) |
|
80 |
+ } else { |
|
81 |
+ stop("Not tested when not using gsnap's --split-output flag") |
|
82 |
+ } |
|
83 |
+ |
|
84 |
+ if (paired_end) { |
|
85 |
+ sys_commands <- paste(sys_commands, input_file_2) |
|
86 |
+ } |
|
87 |
+ |
|
88 |
+ sys_commands <- paste("nice", sys_commands) |
|
89 |
+ sys_commands <- paste(sys_commands, "2>&1") |
|
90 |
+ |
|
91 |
+ if(!is.na(record_sys_call_dir)) { |
|
92 |
+ if(!(file.exists(record_sys_call_dir))) |
|
93 |
+ dir.create(record_sys_call_dir) |
|
94 |
+ |
|
95 |
+ sys_call_file <- file.path(record_sys_call_dir, |
|
96 |
+ paste("aligner.sys_call", |
|
97 |
+ i - 1, |
|
98 |
+ sep=".")) |
|
99 |
+ writeLines(sys_commands, |
|
100 |
+ con=sys_call_file) |
|
101 |
+ } |
|
102 |
+ |
|
103 |
+ ##TODO: this awkward bit is about to |
|
104 |
+ if (intern) { ##parsing gsnap output. also sending multimaps out through STDOUT |
|
105 |
+ stop("intern=TRUE is not working yet") |
|
106 |
+ } else { #do not alter gsnap output |
|
107 |
+ results <- apply_func(sys_commands, system, intern=TRUE) |
|
108 |
+ } |
|
109 |
+ |
|
110 |
+ ##check results to see if each process finished |
|
111 |
+ successful <- sapply(seq_len(length(results)), |
|
112 |
+ function(index) { |
|
113 |
+ returned_lines <- results[[index]] |
|
114 |
+ res_len <- length(returned_lines) |
|
115 |
+ grepl("^Processed \\d+ queries in [\\d\\.]+ seconds", |
|
116 |
+ returned_lines[res_len], |
|
117 |
+ perl=TRUE) |
|
118 |
+ }) |
|
119 |
+ if(!all(successful)) |
|
120 |
+ stop(paste("aligner did not return successfully. System call is "), |
|
121 |
+ head(sys_commands[successful == FALSE], n=1)) |
|
122 |
+ |
|
123 |
+ return(as.integer(successful) - 1) |
|
124 |
+ } |
|
125 |
+ |
|
126 |
+ if (test_mode) { |
|
127 |
+ results <- lapply(seq_len(num_machines), |
|
128 |
+ .gsnap_per_machine, |
|
129 |
+ procs_per_machine=procs_per_machine, |
|
130 |
+ gsnap=gsnap, |
|
131 |
+ gsnap_params=gsnap_params, |
|
132 |
+ paired_end=paired_end, |
|
133 |
+ input_file=input_file, |
|
134 |
+ input_file_2=input_file_2, |
|
135 |
+ output_dir=output_dir, |
|
136 |
+ total_gsnap_parts=num_machines * procs_per_machine, |
|
137 |
+ intern=intern, |
|
138 |
+ multifile_out=multifile_out, |
|
139 |
+ record_sys_call_dir=record_sys_call_dir) |
|
140 |
+ } else { |
|
141 |
+ if (num_machines > 1) { |
|
142 |
+ library("snow") |
|
143 |
+ cl <- makeCluster(num_machines, "MPI") |
|
144 |
+ results <- parLapply(cl, |
|
145 |
+ seq_len(num_machines), |
|
146 |
+ .gsnap_per_machine, |
|
147 |
+ procs_per_machine=procs_per_machine, |
|
148 |
+ gsnap=gsnap, |
|
149 |
+ gsnap_params=gsnap_params, |
|
150 |
+ paired_end=paired_end, |
|
151 |
+ input_file=input_file, |
|
152 |
+ input_file_2=input_file_2, |
|
153 |
+ output_dir=output_dir, |
|
154 |
+ total_gsnap_parts=num_machines * procs_per_machine, |
|
155 |
+ intern=intern, |
|
156 |
+ multifile_out=multifile_out, |
|
157 |
+ record_sys_call_dir=record_sys_call_dir) |
|
158 |
+ stopCluster(cl) |
|
159 |
+ } else { |
|
160 |
+ results <- apply_func(seq_len(num_machines), |
|
161 |
+ .gsnap_per_machine, |
|
162 |
+ procs_per_machine=procs_per_machine, |
|
163 |
+ gsnap=gsnap, |
|
164 |
+ gsnap_params=gsnap_params, |
|
165 |
+ paired_end=paired_end, |
|
166 |
+ input_file=input_file, |
|
167 |
+ input_file_2=input_file_2, |
|
168 |
+ output_dir=output_dir, |
|
169 |
+ total_gsnap_parts=num_machines * procs_per_machine, |
|
170 |
+ intern=intern, |
|
171 |
+ multifile_out=multifile_out, |
|
172 |
+ record_sys_call_dir=record_sys_call_dir) |
|
173 |
+ } |
|
174 |
+ } |
|
175 |
+ |
|
176 |
+ return(results) |
|
177 |
+} |
0 | 178 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,82 @@ |
1 |
+#Hey Cory, |
|
2 |
+# |
|
3 |
+#I think we should be including class 'het' and the SNPs from class 'mixed' as well. The het class is easy, parsing SNPs out of the mixed class is going to be a bit more of a trick... |
|
4 |
+# |
|
5 |
+#Jeremiah |
|
6 |
+# |
|
7 |
+#On Fri, Nov 5, 2010 at 5:36 PM, Cory Barr <barr.cory@gene.com> wrote: |
|
8 |
+# |
|
9 |
+#I’m just keeping class=’single’ and locType=’exact’. Too exclusive? |
|
10 |
+ |
|
11 |
+use warnings; |
|
12 |
+use strict; |
|
13 |
+ |
|
14 |
+sub revcomp { |
|
15 |
+ my @nucs = @_; |
|
16 |
+ @nucs = map(uc, @nucs); |
|
17 |
+ |
|
18 |
+ for (my $i=0; $i<@nucs; $i++) { |
|
19 |
+ if ($nucs[$i] eq "A") { |
|
20 |
+ $nucs[$i] = "T"; |
|
21 |
+ } |
|
22 |
+ elsif ($nucs[$i] eq "C") { |
|
23 |
+ $nucs[$i] = "G"; |
|
24 |
+ } |
|
25 |
+ elsif ($nucs[$i] eq "G") { |
|
26 |
+ $nucs[$i] = "C"; |
|
27 |
+ } |
|
28 |
+ elsif ($nucs[$i] eq "T") { |
|
29 |
+ $nucs[$i] = "A"; |
|
30 |
+ } |
|
31 |
+ } |
|
32 |
+ |
|
33 |
+ return(@nucs); |
|
34 |
+} |
|
35 |
+ |
|
36 |
+while (my $input=<>) { |
|
37 |
+ chomp($input); |
|
38 |
+ my @pieces = split("\t", $input); |
|
39 |
+ my $chr = $pieces[1]; |
|
40 |
+ my $start = $pieces[2] + 1; |
|
41 |
+ my $end = $pieces[3]; |
|
42 |
+ my $id = $pieces[4]; |
|
43 |
+ my $strand = $pieces[6]; |
|
44 |
+ my $ref = $pieces[8]; |
|
45 |
+ my $seen = $pieces[9]; |
|
46 |
+ my $type = $pieces[11]; |
|
47 |
+ |
|
48 |
+ if ($type eq 'het') { $seen =~ s|\(HETEROZYGOUS\)/?||; } |
|
49 |
+ elsif ($type ne 'single' && $type ne 'mixed') { |
|
50 |
+ next; |
|
51 |
+ } |
|
52 |
+ |
|
53 |
+ if (not $ref =~ m/^[ACGT]$/) { next; } #cannot be a SNP unless matches this |
|
54 |
+ |
|
55 |
+ my @seen_nucs = split("/", $seen); |
|
56 |
+ if ($type eq 'mixed') { |
|
57 |
+ @seen_nucs = grep(/^[ACGT]$/, @seen_nucs); |
|
58 |
+ } |
|
59 |
+ |
|
60 |
+ if ($strand eq "-") { |
|
61 |
+ @seen_nucs = revcomp(@seen_nucs); |
|
62 |
+ } |
|
63 |
+ |
|
64 |
+ @seen_nucs = grep(!/^$ref/, @seen_nucs); |
|
65 |
+ |
|
66 |
+#all entries must be in this format: >rs62211261 21:14379270..14379270 CG |
|
67 |
+ foreach my $sn (@seen_nucs) { |
|
68 |
|
|
69 |
+ ">" . |
|
70 |
+ $id . |
|
71 |
+ " " . |
|
72 |
+ $chr . |
|
73 |
+ ":" . |
|
74 |
+ $start . |
|
75 |
+ ".." . |
|
76 |
+ $end . |
|
77 |
+ " " . |
|
78 |
+ $ref . |
|
79 |
+ $sn . |
|
80 |
+ "\n"; |
|
81 |
+ } |
|
82 |
+} |
0 | 83 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,9 @@ |
1 |
+\name{buildGmapDbSNPIndex} |
|
2 |
+\alias{buildGmapDbSNPIndex} |
|
3 |
+\title{Download and Integrate dbSNP into GMAP/GSNAP} |
|
4 |
+\usage{buildGmapDbSNPIndex(version)} |
|
5 |
+\description{Downloads a version of dbSNP and integrates it into an installation of GMAP/GSNAP for SNP-tolerant alignment} |
|
6 |
+\details{.. content for \details{} ..} |
|
7 |
+\value{0} |
|
8 |
+\author{Cory Barr} |
|
9 |
+\arguments{\item{version}{version of dbSNP to obtain}} |
0 | 10 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,10 @@ |
1 |
+\name{buildGmapIndex} |
|
2 |
+\alias{buildGmapIndex} |
|
3 |
+\title{Create Gmap Reference Genome Index} |
|
4 |
+\usage{buildGmapIndex(genome)} |
|
5 |
+\description{Gsnap index of hg19 (with haplotypes and unassembled regions excluded)} |
|
6 |
+\details{a reference genome into gmap/gsnap by downloading a |
|
7 |
+genome and constructing an IIT. Currently only hg19 is supported.} |
|
8 |
+\value{0} |
|
9 |
+\author{Cory Barr} |
|
10 |
+\arguments{\item{genome}{}} |
0 | 11 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,21 @@ |
1 |
+\name{buildGsnapSNPIIT} |
|
2 |
+\alias{buildGsnapSNPIIT} |
|
3 |
+\title{Build SNP-tolerant gsnap alignment} |
|
4 |
+\usage{buildGsnapSNPIIT(index_file, gsnap_bin, save_dir, gsnap_genome_dir, |
|
5 |
+ snp_file)} |
|
6 |
+\description{This function helps create the files needed for SNP-tolerant gsnap |
|
7 |
+alignment.} |
|
8 |
+\value{0} |
|
9 |
+\arguments{\item{index_file}{The fasta-like file containing the SNPs} |
|
10 |
+\item{gsnap_bin}{location of the gsnap executable. Defaults to |
|
11 |
+location at gsnap:::globals()$gsnap} |
|
12 |
+\item{save_dir}{location of the default gmap directory. Defaults |
|
13 |
+to location at gsnap:::globals()$gsnap_save_dir} |
|
14 |
+\item{gsnap_genome_dir}{name of the genome, which corresponds to |
|
15 |
+the subdirectory of save_dir. The IIT files go here. Typical |
|
16 |
+examples are "hg19" and "mm9"} |
|
17 |
+\item{snp_file}{This argument will specify the name of the |
|
18 |
+resulting final IIT of SNPs and consequently the name of the |
|
19 |
+database specified via gsnap's -v option. For example, if snp_file |
|
20 |
+is db131, then gsnap will align via gsnap -d genome -v db131 |
|
21 |
+input.fastq}} |
0 | 22 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,15 @@ |
1 |
+\name{consolidateGmapFiles} |
|
2 |
+\alias{consolidateGmapFiles} |
|
3 |
+\title{Consolidate gmap's output files} |
|
4 |
+\usage{consolidateGmapFiles(sam_file_dir, paired_end=FALSE, |
|
5 |
+ remove_merged=FALSE)} |
|
6 |
+\description{Consolidates all pieces from parallelized gsnap output into |
|
7 |
+appropriately merged files} |
|
8 |
+\details{If gmap was run in single-end mode, this will consolidate all |
|
9 |
+parallelized into the 3 output files. If gmap was run in |
|
10 |
+paired-end mode, consolidates to the 7 output files.} |
|
11 |
+\value{list of the names of the files created from consolidation} |
|
12 |
+\author{Cory Barr} |
|
13 |
+\arguments{\item{sam_file_dir}{directory where gmap's sam file output is stored} |
|
14 |
+\item{paired_end}{indicated whether gmap was run in paired_end mode} |
|
15 |
+\item{remove_merged}{remove the individual pieces after they are merged}} |
0 | 16 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,14 @@ |
1 |
+\name{consolidateGsnapFiles} |
|
2 |
+\alias{consolidateGsnapFiles} |
|
3 |
+\title{Consolidate gsnap's parallelized output files} |
|
4 |
+\usage{consolidateGsnapFiles(sam_file_dir, remove_merged=FALSE, |
|
5 |
+ parallelized=TRUE)} |
|
6 |
+\description{Consolidates all pieces from parallelized gsnap output into |
|
7 |
+appropriately merged files} |
|
8 |
+\details{If gsnap was run in single-end mode, this will consolidate all |
|
9 |
+parallelized into the 3 output files. If gsnap was run in |
|
10 |
+paired-end mode, consolidates to the 7 output files.} |
|
11 |
+\value{list of the names of the files created from consolidation} |
|
12 |
+\author{Cory Barr} |
|
13 |
+\arguments{\item{sam_file_dir}{directory where gsnap's sam file output is stored} |
|
14 |
+\item{remove_merged}{remove the individual pieces after they are merged}} |
0 | 15 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,12 @@ |
1 |
+\name{getGsnapFileCounts} |
|
2 |
+\alias{getGsnapFileCounts} |
|
3 |
+\title{Return number of reads in each of gsnap's output files} |
|
4 |
+\usage{getGsnapFileCounts(aligner_dir, parallelize=TRUE)} |
|
5 |
+\description{Return number of reads in each of gsnap's output files} |
|
6 |
+\details{With the --split-output argument, gsnap can output multiple |
|
7 |
+files. If these files are BAM files, this function counts the |
|
8 |
+number of reads in each file.} |
|
9 |
+\value{named list of read counts per gsnap output file} |
|
10 |
+\author{Cory Barr} |
|
11 |
+\arguments{\item{aligner_dir}{Directory containing output bam files from gsnap} |
|
12 |
+\item{parallelize}{Boolean indicating if counting should be done in parallel}} |
0 | 13 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,43 @@ |
1 |
+\name{gmapR-package} |
|
2 |
+\alias{gmapR-package} |
|
3 |
+\alias{gmapR} |
|
4 |
+\docType{package} |
|
5 |
+\title{ |
|
6 |
+What the package does (short line) |
|
7 |
+~~ package title ~~ |
|
8 |
+} |
|
9 |
+\description{ |
|
10 |
+More about what it does (maybe more than one line) |
|
11 |
+~~ A concise (1-5 lines) description of the package ~~ |
|
12 |
+} |
|
13 |
+\details{ |
|
14 |
+\tabular{ll}{ |
|
15 |
+Package: \tab gmapR\cr |
|
16 |
+Type: \tab Package\cr |
|
17 |
+Version: \tab 1.0\cr |
|
18 |
+Date: \tab 2011-07-25\cr |
|
19 |
+License: \tab What license is it under?\cr |
|
20 |
+LazyLoad: \tab yes\cr |
|
21 |
+} |
|
22 |
+~~ An overview of how to use the package, including the most important ~~ |
|
23 |
+~~ functions ~~ |
|
24 |
+} |
|
25 |
+\author{ |
|
26 |
+Who wrote it |
|
27 |
+ |
|
28 |
+Maintainer: Who to complain to <yourfault@somewhere.net> |
|
29 |
+~~ The author and/or maintainer of the package ~~ |
|
30 |
+} |
|
31 |
+\references{ |
|
32 |
+~~ Literature or other references for background information ~~ |
|
33 |
+} |
|
34 |
+~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~ |
|
35 |
+~~ the R documentation directory ~~ |
|
36 |
+\keyword{ package } |
|
37 |
+\seealso{ |
|
38 |
+~~ Optional links to other man pages, e.g. ~~ |
|
39 |
+~~ \code{\link[<pkg>:<pkg>-package]{<pkg>}} ~~ |
|
40 |
+} |
|
41 |
+\examples{ |
|
42 |
+~~ simple examples of the most important functions ~~ |
|
43 |
+} |
0 | 44 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,8 @@ |
1 |
+\name{installGmap} |
|
2 |
+\alias{installGmap} |
|
3 |
+\title{Download and Install GMAP} |
|
4 |
+\usage{installGmap(install_dir)} |
|
5 |
+\description{Downloads the GMAP source code, builds it, then installs it to a specified directory} |
|
6 |
+\value{0} |
|
7 |
+\author{Cory Barr} |
|
8 |
+\arguments{\item{install_dir}{directory where GMAP will be installed}} |
0 | 9 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,24 @@ |
1 |
+\name{parallelized_gsnap} |
|
2 |
+\alias{parallelized_gsnap} |
|
3 |
+\title{Parallelized gsnap} |
|
4 |
+\usage{parallelized_gsnap(num_machines, procs_per_machine, gsnap, |
|
5 |
+ gsnap_params, input_file, input_file_2, paired_end=F, output_dir, |
|
6 |
+ test_mode=FALSE, intern=FALSE, multifile_out=TRUE, |
|
7 |
+ record_sys_call_dir=NA)} |
|
8 |
+\description{This function parallelizes gsnap across a cluster.} |
|
9 |
+\value{a list of the return values of the system function for |
|
10 |
+each parallelized piece} |
|
11 |
+\author{Cory Barr} |
|
12 |
+\arguments{\item{num_machines}{number of cluster machines to run gsnap across} |
|
13 |
+\item{procs_per_machine}{number of gsnap jobs to run on each cluster node} |
|
14 |
+\item{gsnap}{location of gsnap executable} |
|
15 |
+\item{gsnap_params}{a string of the parameters to pass to gsnap. Note "--split-output" is not allowed.} |
|
16 |
+\item{input_file}{location of the input fastq to align} |
|
17 |
+\item{input_file_2}{location of the second input fastq to align (for paired ends)} |
|
18 |
+\item{paired_end}{logical indicating if gsnap should be run in paired-end mode} |
|
19 |
+\item{output_dir}{directory for gsnap to write its output} |
|
20 |
+\item{test_mode}{used for debugging. turns off parallelization.} |
|
21 |
+\item{intern}{indicates whether to have STDOUT the return |
|
22 |
+value. identical to 'intern' argument to the system() function} |
|
23 |
+\item{multifile_out}{tells gsnap to write to 3 output files for |
|
24 |
+single-end data, or 7 output files for paired-end data}} |