Browse code

add gmapR package

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@57248 bc3139a8-67e5-0310-9ffc-ced21a209358

Herve Pages authored on 05/08/2011 07:34:10
Showing 20 changed files

1 1
new file mode 100644
... ...
@@ -0,0 +1,18 @@
1
+Package: gmapR
2
+Maintainer: <barr.cory@gene.com>
3
+License: Artistic-2.0
4
+Title: Provides convenience methods to work with GMAP and GSNAP from
5
+    within R
6
+Type: Package
7
+LazyLoad: yes
8
+Author: Cory Barr
9
+Description: GSNAP and GMAP are a pair of tools to align short-read
10
+    data written by Tom Wu.  This package provides convenience methods
11
+    to work with GMAP and GSNAP from within R
12
+Version: 1.0
13
+Date: 2011-08-01
14
+Depends: R (>= 2.14.0), HTSeqGenieBase (>= 1.0.22)
15
+Collate: 'buildGmapDbSNPIndex.R' 'buildGmapGenomeIndex.R' 'buildGmap.R'
16
+    'buildGsnapSNPIIT.R' 'consolidateGmapFiles.R'
17
+    'consolidateGsnapFiles.R' 'getGsnapFileCounts.R'
18
+    'parallelized_gsnap.R'
0 19
new file mode 100644
... ...
@@ -0,0 +1,6 @@
1
+export(buildGmapDbSNPIndex)
2
+export(buildGmapIndex)
3
+export(installGmap)
4
+export(consolidateGmapFiles)
5
+export(getGsnapFileCounts)
6
+export(parallelized_gsnap)
0 7
new file mode 100644
... ...
@@ -0,0 +1,38 @@
1
+##' Downloads the GMAP source code, builds it, then installs it to a specified directory
2
+##'
3
+##' 
4
+##' @title Download and Install GMAP
5
+##' @param install_dir directory where GMAP will be installed
6
+##' @return 0
7
+##' @author Cory Barr
8
+##' @export
9
+installGmap <- function(install_dir) {
10
+
11
+  install_dir <- file_path_as_absolute(install_dir)
12
+  
13
+  temp_dir <- 'temp'
14
+  if(file.exists(temp_dir))
15
+    unlink(temp_dir, recursive=TRUE)
16
+  dir.create(temp_dir)
17
+  setwd(temp_dir)
18
+  
19
+  system('wget http://research-pub.gene.com/gmap/src/gmap-gsnap-2011-03-28.tar.gz')
20
+  system('tar xzvf gmap-gsnap-2011-03-28.tar.gz')
21
+  orig_dir <- getwd()
22
+  on.exit(setwd(orig_dir))
23
+  setwd('gmap-2011-03-28/')
24
+  
25
+  gsnap_dir <- file.path(install_dir, "gmap")
26
+  gsnap_genomes_dir <- file.path(install_dir, "gmap", "genomes")
27
+  dir.create(gsnap_genomes_dir, recursive=TRUE)
28
+  
29
+  sys_call <- paste("./configure",
30
+                    paste("prefix=", gsnap_dir, sep=''),
31
+                    paste("with_gmapdb=", gsnap_genomes_dir, sep=''))
32
+  system(sys_call)                  
33
+  system('make')
34
+  system('make install')
35
+
36
+  unlink(temp_dir, recursive=TRUE)
37
+  return(0)
38
+}
0 39
new file mode 100644
... ...
@@ -0,0 +1,71 @@
1
+##' Downloads a version of dbSNP and integrates it into an installation of GMAP/GSNAP for SNP-tolerant alignment
2
+##'
3
+##' .. content for \details{} ..
4
+##' @title Download and Integrate dbSNP into GMAP/GSNAP
5
+##' @param version version of dbSNP to obtain
6
+##' @return 0
7
+##' @author Cory Barr
8
+##' @export
9
+buildGmapDbSNPIndex <- function(version) {
10
+
11
+  if(version != 'snp131')
12
+    stop("only supports dbSNP version snp131 at present")
13
+
14
+  tmp_dir <- 'snp.tmp'
15
+  if(file.exists(tmp_dir))
16
+    unlink(tmp_dir, recursive=TRUE)
17
+  dir.create(tmp_dir)
18
+  
19
+  start_dir <- file_path_as_absolute(getwd())
20
+  setwd(tmp_dir)
21
+  system('wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp131.txt.gz')
22
+  system('gunzip snp131.txt.gz')
23
+  
24
+  ##all entries must be in this format: >rs62211261 21:14379270..14379270 CG
25
+  converted_script <- system.file("scripts/parse_dbsnp.pl", package="gmapR")
26
+  sys_call <- paste("cat",
27
+                    "snp131.txt",
28
+                    "| perl",
29
+                    converted_script,
30
+                    ">",
31
+                    "snp131.converted")
32
+  system(sys_call)
33
+
34
+  ##cat snp131.converted | iit_store -o snp131
35
+  iit_store <- file.path(start_dir,
36
+                         "gmap/bin/iit_store")
37
+  sys_call <- paste("cat",
38
+                    "snp131.converted",
39
+                    "|",
40
+                    iit_store,
41
+                    "-o",
42
+                    "snp131")
43
+  system(sys_call)
44
+
45
+  ##cp -f snp131.iit /gne/home/coryba/tools/gsnap/share/hg19_ucsc/hg19_ucsc.maps
46
+  sys_call <- paste("cp -f",
47
+                    "snp131.iit",
48
+                    file.path(start_dir,
49
+                              "gmap",
50
+                              "genomes",
51
+                              "hg19",
52
+                              "hg19.maps"))
53
+  res <- system(sys_call, intern=TRUE)
54
+
55
+  ##snpindex -d hg19_ucsc -V /gne/home/coryba/tools/gsnap/share/hg19_ucsc -v snp131 snp131.iit
56
+  snpindex <- file.path(start_dir, "gmap/bin/snpindex")
57
+  sys_call <- paste(snpindex,
58
+                    "-d",
59
+                    "hg19",
60
+                    "-V",
61
+                    file.path(start_dir,
62
+                              "gmap/genomes",
63
+                              "hg19"),
64
+                    "-v snp131 snp131.iit")
65
+  res <- system(sys_call, intern=TRUE)
66
+
67
+  setwd(start_dir)
68
+  unlink(tmp_dir, recursive=TRUE)
69
+
70
+  return(0)
71
+}
0 72
new file mode 100644
... ...
@@ -0,0 +1,74 @@
1
+##' Gsnap index of hg19 (with haplotypes and unassembled regions excluded)
2
+##' 
3
+##'Incorporate a reference genome into gmap/gsnap by downloading a
4
+##' genome and constructing an IIT. Currently only hg19 is supported.
5
+##' @title Create Gmap Reference Genome Index
6
+##' @param genome 
7
+##' @return 0
8
+##' @author Cory Barr
9
+##' @export
10
+buildGmapIndex <- function(genome) {
11
+  
12
+  retriveGenomeFasta <- function(genome) {
13
+    if(genome != 'hg19')
14
+      stop("only hg19 supported at the moment")
15
+
16
+    tmp_dir <- tempdir()
17
+    if(file.exists(tmp_dir))
18
+      unlink(tmp_dir, recursive=TRUE)
19
+    dir.create(tmp_dir)
20
+
21
+    on.exit(setwd(file_path_as_absolute(getwd())))
22
+    setwd(tmp_dir)
23
+    sys_command <- paste('wget http://hgdownload.cse.ucsc.edu/goldenPath/',
24
+                         genome,
25
+                         '/bigZips/chromFa.tar.gz',
26
+                         sep='')
27
+    system(sys_command)
28
+    system('tar xzf chromFa.tar.gz')
29
+
30
+    genome_fasta_file <- paste(genome, 'fa', sep='.')
31
+    ##excluding haplotypes, cat genome seqs together into one fasta:
32
+    sys_call <- paste('ls *fa | grep -v "_gl" | grep -v "_hap" | xargs -i cat {} > ',
33
+                      genome_fasta_file,
34
+                      sep='')
35
+    system(sys_call)
36
+    
37
+    file_path_as_absolute(genome_fasta_file)     
38
+  }
39
+
40
+  genome_fa <- retrieveGenomeFasta(genome)
41
+
42
+  buildGmapIITFromFasta <- function(genome, fasta) {
43
+
44
+    ##currently, hg19 for the Genie pipelines is names hg19_ucsc to
45
+    ##distinguish origin. This will be phased out
46
+    genome <- 'hg19_ucsc'
47
+    
48
+    gmap_setup <- file.path(globals()['gmap_bin'],
49
+                              "gmap_setup")  
50
+    sys_command <- paste(gmap_setup,
51
+                         "-d",
52
+                         genome,
53
+                         fasta)
54
+    system(sys_command)
55
+      
56
+    sys_command <- paste("make -f",
57
+                         paste('Makefile.', genome, sep=''),
58
+                         "coords")
59
+    system(sys_command)
60
+      
61
+    sys_command <- paste("make -f",
62
+                         paste('Makefile.', genome, sep=''),
63
+                         "gmapdb")
64
+    system(sys_command)
65
+      
66
+    sys_command <- paste("make -f",
67
+                         paste('Makefile.', genome, sep=''),
68
+                         "install")
69
+    system(sys_command)
70
+  }
71
+  
72
+  buildGmapIITFromFasta(genome, genome_fa)
73
+  return(0)
74
+}
0 75
new file mode 100644
... ...
@@ -0,0 +1,86 @@
1
+##TODO: put in gsnap package
2
+
3
+##' This function helps create the files needed for SNP-tolerant gsnap
4
+##' alignment.
5
+##'
6
+##' @title Build SNP-tolerant gsnap alignment
7
+##' @param index_file The fasta-like file containing the SNPs
8
+##' @param gsnap_bin location of the gsnap executable. Defaults to
9
+##' location at gsnap:::globals()$gsnap
10
+##' @param save_dir location of the default gmap directory. Defaults
11
+##' to location at gsnap:::globals()$gsnap_save_dir
12
+##' @param gsnap_genome_dir name of the genome, which corresponds to
13
+##' the subdirectory of save_dir. The IIT files go here. Typical
14
+##' examples are "hg19" and "mm9"
15
+##' @param snp_file This argument will specify the name of the
16
+##' resulting final IIT of SNPs and consequently the name of the
17
+##' database specified via gsnap's -v option. For example, if snp_file
18
+##' is db131, then gsnap will align via gsnap -d genome -v db131
19
+##' input.fastq
20
+##' @return 0
21
+##` @author Cory Barr
22
+##` rdname buildGsnapSNPIIT
23
+
24
+buildGsnapSNPIIT <- function(index_file,
25
+                             gsnap_bin,
26
+                             save_dir,
27
+                             gsnap_genome_dir,
28
+                             snp_file) {
29
+
30
+  if(!file.exists(index_file))
31
+    stop(paste("Could not find the file", index_file))
32
+
33
+  if (missing(gsnap_bin))
34
+    gsnap_bin <- globals()$gsnap
35
+  if (missing(save_dir))
36
+    save_dir <- globals()$gsnap_save_dir
37
+
38
+  full_save_dir <- file.path(save_dir, gsnap_genome_dir)
39
+
40
+  if(!file.exists(full_save_dir))
41
+    stop(paste("Error. The directory", full_save_dir, "does not exist."))
42
+
43
+  tmp_dir <- paste("tmp.gsnap_index_creation.",
44
+                   ceiling(runif(1) * 10000000),
45
+                   sep="")
46
+
47
+  dir.create(tmp_dir)
48
+
49
+  iit_store <- file.path(globals()$gsnap_bin_dir, "iit_store")
50
+
51
+  if(missing(snp_file)) {
52
+    snp_file <- gsub("^.*/", "", index_file, perl=T)
53
+  }
54
+  snp_file <- file.path(tmp_dir, snp_file)
55
+  iit_file <- paste(snp_file, ".iit", sep="")
56
+  
57
+  ##ex: cat snp131.converted | iit_store -o snp131
58
+  sys_call <- paste("cat", index_file, "|", iit_store, "-o", iit_file)
59
+  system(sys_call)
60
+
61
+  maps_dir <- paste(globals()$gsnap_save_dir,
62
+                    "/",
63
+                    gsnap_genome_dir,
64
+                    "/",
65
+                    gsnap_genome_dir,
66
+                    ".maps",
67
+                    sep="")
68
+  ##ex:cp -f snp131.iit /gne/home/coryba/tools/gsnap/share/hg19_ucsc/hg19_ucsc.maps
69
+  sys_call <- paste("cp -f", iit_file, maps_dir)
70
+  system(sys_call)
71
+  
72
+  snpindex <- file.path(globals()$gsnap_bin, "snpindex")
73
+  gsnap_genome_path <- file.path(globals()$gsnap_save_dir, gsnap_genome_dir)
74
+  ##ex:snpindex -d hg19_ucsc -V /gne/home/coryba/tools/gsnap/share/hg19_ucsc -v snp131 snp131.iit
75
+  sys_call <- paste(snpindex,
76
+                    "-d",
77
+                    gsnap_genome_dir,
78
+                    "-V",
79
+                    gsnap_genome_path,
80
+                    "-v",
81
+                    strsplit(snp_file, "/")[[1]][2],
82
+                    paste(snp_file, ".iit", sep=""))
83
+  system(sys_call)
84
+  unlink(tmp_dir, recursive=TRUE)
85
+  return(0)
86
+}
0 87
new file mode 100644
... ...
@@ -0,0 +1,63 @@
1
+##' Consolidates all pieces from parallelized gsnap output into
2
+##' appropriately merged files
3
+##'
4
+##' If gmap was run in single-end mode, this will consolidate all
5
+##' parallelized into the 3 output files. If gmap was run in
6
+##' paired-end mode, consolidates to the 7 output files.
7
+##' @title Consolidate gmap's output files
8
+##' @param sam_file_dir directory where gmap's sam file output is stored
9
+##' @param paired_end indicated whether gmap was run in paired_end mode
10
+##' @param remove_merged remove the individual pieces after they are merged
11
+##' @return list of the names of the files created from consolidation
12
+##' @author Cory Barr
13
+##' @export
14
+consolidateGmapFiles <- function(sam_file_dir,
15
+                                  paired_end=FALSE,
16
+                                  remove_merged=FALSE) {
17
+
18
+  if (! file.exists(sam_file_dir)) {
19
+    stop(paste("Could not find the directory", sam_file_dir))
20
+  } else {
21
+    sam_file_dir <- file_path_as_absolute(sam_file_dir)
22
+  }
23
+
24
+  return_list <- list()
25
+  
26
+  dir_files <- dir(sam_file_dir)
27
+
28
+  no_mapping_files <- dir_files[grep("gmap_out.*nomapping$",
29
+                                     dir_files,
30
+                                     perl=T)]
31
+  no_mapping_files <- paste(sam_file_dir, no_mapping_files, sep="/")
32
+  consolidated_file <- consolidateSAMFiles(no_mapping_files,
33
+                                           "gmap.merged.no_mapping.bam",
34
+                                           remove_merged=remove_merged)
35
+  return_list$no_mapping <- consolidated_file
36
+
37
+  unpaired_uniq_files <- dir_files[grep("gmap_out.*uniq$",
38
+                                        dir_files,
39
+                                        perl=T)]
40
+  unpaired_uniq_files <- paste(sam_file_dir, unpaired_uniq_files, sep="/")
41
+  consolidated_file <- consolidateSAMFiles(unpaired_uniq_files,
42
+                                           "gmap.merged.uniq.bam",
43
+                                           remove_merged=remove_merged)
44
+  return_list$unpaired_uniq <- consolidated_file
45
+
46
+  unpaired_mult_files <- dir_files[grep("gmap_out.*mult$",
47
+                                        dir_files,
48
+                                        perl=T)]    
49
+  unpaired_mult_files <- paste(sam_file_dir, unpaired_mult_files, sep="/")
50
+  consolidated_file <- consolidateSAMFiles(unpaired_mult_files,
51
+                                           "gmap.merged.mult.bam",
52
+                                           remove_merged=remove_merged)
53
+  return_list$unpaired_mult <- consolidated_file
54
+
55
+  
56
+  
57
+  ##paired-end output adds an additional 4 files to those above
58
+  if (paired_end) {
59
+    ## not functional for 454 Gmap yet
60
+  }
61
+  
62
+  return(return_list)
63
+}
0 64
new file mode 100644
... ...
@@ -0,0 +1,63 @@
1
+##' Consolidates all pieces from parallelized gsnap output into
2
+##' appropriately merged files
3
+##'
4
+##' If gsnap was run in single-end mode, this will consolidate all
5
+##' parallelized into the 3 output files. If gsnap was run in
6
+##' paired-end mode, consolidates to the 7 output files.
7
+##' @title Consolidate gsnap's parallelized output files
8
+##' @param sam_file_dir directory where gsnap's sam file output is stored
9
+##' @param remove_merged remove the individual pieces after they are merged
10
+##' @return list of the names of the files created from consolidation
11
+##' @author Cory Barr
12
+consolidateGsnapFiles <- function(sam_file_dir,
13
+                                  remove_merged=FALSE,
14
+                                  parallelized=TRUE) {
15
+
16
+  if (! file.exists(sam_file_dir)) {
17
+    stop(paste("Could not find the directory", sam_file_dir))
18
+  } else {
19
+    sam_file_dir <- file_path_as_absolute(sam_file_dir)
20
+  }
21
+
22
+  return_list <- list()
23
+  
24
+  dir_files <- dir(sam_file_dir, full.names=TRUE)
25
+
26
+  .consolidate <- function(mapping_class,
27
+                           remove_merged,
28
+                           dir_files) {
29
+
30
+    base_names <- basename(dir_files)
31
+
32
+    unconsolidated_files <- dir_files[grep(paste("^gsnap_out\\.*",
33
+                                                 "\\d+\\.",
34
+                                                 mapping_class,
35
+                                                 "$",
36
+                                                 sep=''),
37
+                                           base_names,
38
+                                           perl=TRUE)]    
39
+
40
+    consolidated_file <- consolidateSAMFiles(sam_files=unconsolidated_files,
41
+                                             outfile=paste(
42
+                                               "gsnap.merged",
43
+                                               mapping_class,
44
+                                               "bam",
45
+                                               sep='.'),
46
+                                             remove_merged=remove_merged)
47
+  }
48
+
49
+  mapping_classes <- basename(dir_files)
50
+  mapping_classes <- mapping_classes[grep("gsnap_out\\.", mapping_classes)]
51
+  mapping_classes <- unique(sub("^gsnap_out\\.\\d+\\.", "", mapping_classes))
52
+
53
+  apply_func <- lapply
54
+  if(parallelized)
55
+    apply_func <- mclapply
56
+  
57
+  merged_files <- lapply(mapping_classes,
58
+                         .consolidate,
59
+                         remove_merged=remove_merged,
60
+                         dir_files=dir_files)
61
+  names(merged_files) <- mapping_classes
62
+  return(merged_files)
63
+}
0 64
new file mode 100644
... ...
@@ -0,0 +1,65 @@
1
+##' Return number of reads in each of gsnap's output files
2
+##'
3
+##' With the --split-output argument, gsnap can output multiple
4
+##' files. If these files are BAM files, this function counts the
5
+##' number of reads in each file.
6
+##' @title Return number of reads in each of gsnap's output files
7
+##' @param aligner_dir Directory containing output bam files from gsnap
8
+##' @param parallelize Boolean indicating if counting should be done in parallel
9
+##' @return named list of read counts per gsnap output file
10
+##' @author Cory Barr
11
+##' @export
12
+getGsnapFileCounts <- function(aligner_dir, parallelize=TRUE) {
13
+
14
+  if(!file.exists(aligner_dir))
15
+    stop("aligner_dir does not exist")
16
+  
17
+  ##have to count gsnap output by chr, or can get overflow errors
18
+  .getNumUniqueReadsInBam <- function(bam_file) {
19
+    chr_granges <- getSeqlensFromBAM(bam_file)
20
+      .getUniqueReadsByChr <- function(index) {
21
+        sb_param <-
22
+          ScanBamParam(what=c("qname"),
23
+                       which=chr_granges[index])
24
+        res <- scanBam(bam_file, param=sb_param)[[1]]
25
+        unique(res$qname)
26
+      }
27
+
28
+    apply_func <- lapply
29
+    if (parallelize) apply_func <- mclapply
30
+    
31
+    res <- unlist(apply_func(seq_len(length(chr_granges)),
32
+                           .getUniqueReadsByChr))
33
+    length(unique(res))
34
+  }
35
+
36
+  bam_files <- dir(aligner_dir, pattern="\\.bam$", full.names=TRUE)
37
+
38
+  nomapping_index <- grep("\\.no_?mapping\\.bam$", bam_files)
39
+  mult_mapping_index <- grep("_mult[_\\.]", bam_files)
40
+  uniq_mapping_index <- seq_len(length(bam_files))[-c(mult_mapping_index, nomapping_index)]
41
+  nomapping_bam <- bam_files[nomapping_index]
42
+  mult_mapping_bams <- bam_files[mult_mapping_index]
43
+  uniq_mapping_bams <- bam_files[uniq_mapping_index]
44
+
45
+  named_names <- bam_files[-nomapping_index]
46
+  names(named_names) <- named_names
47
+  gsnap_counts <- lapply(named_names,
48
+                         .getNumUniqueReadsInBam)
49
+                         
50
+  names(gsnap_counts) <- sub("^.*\\.gsnap\\.merged\\.", "", names(gsnap_counts))
51
+  names(gsnap_counts) <- sub("\\.bam$", "", names(gsnap_counts))
52
+
53
+  ##handling the 'nomapping' file differently, since nothing this (so
54
+  ##nothing is returned if scanBam is given a ScanBamParam which arg
55
+  nomapper_ids <- unlist(scanBam(nomapping_bam,
56
+                          param=ScanBamParam(what=c("qname")))[[1]],
57
+                         use.names=FALSE)
58
+  num_nomappers <- sum(!duplicated(nomapper_ids))
59
+  gsnap_counts[['nomapping']] <- num_nomappers
60
+
61
+  names(gsnap_counts) <- paste("gsnap", names(gsnap_counts), sep=".")
62
+  names(gsnap_counts) <- paste(names(gsnap_counts), "_reads", sep="")
63
+
64
+  return(gsnap_counts)  
65
+}
0 66
new file mode 100644
... ...
@@ -0,0 +1,177 @@
1
+##' This function parallelizes gsnap across a cluster.
2
+##'
3
+##' @title Parallelized gsnap
4
+##' @param num_machines number of cluster machines to run gsnap across
5
+##' @param procs_per_machine number of gsnap jobs to run on each cluster node
6
+##' @param gsnap location of gsnap executable
7
+##' @param gsnap_params a string of the parameters to pass to gsnap. Note "--split-output" is not allowed.
8
+##' @param input_file location of the input fastq to align
9
+##' @param input_file_2 location of the second input fastq to align (for paired ends)
10
+##' @param paired_end logical indicating if gsnap should be run in paired-end mode
11
+##' @param output_dir directory for gsnap to write its output
12
+##' @param test_mode used for debugging. turns off parallelization.
13
+##' @param intern indicates whether to have STDOUT the return
14
+##' value. identical to 'intern' argument to the system() function
15
+##' @param multifile_out tells gsnap to write to 3 output files for
16
+##' single-end data, or 7 output files for paired-end data
17
+##' @return a list of the return values of the system function for
18
+##' each parallelized piece
19
+##' @author Cory Barr
20
+##' @export
21
+parallelized_gsnap <- function(num_machines,
22
+                               procs_per_machine,
23
+                               gsnap,
24
+                               gsnap_params,
25
+                               input_file,
26
+                               input_file_2=NULL,
27
+                               paired_end=F,
28
+                               output_dir,
29
+                               test_mode=FALSE,
30
+                               intern=FALSE,
31
+                               multifile_out=TRUE,
32
+                               record_sys_call_dir=NA) {
33
+  
34
+  if (length(grep("split-output ", gsnap_params) > 0))
35
+    stop("This function cannot handle gsnap parameters with the --split-output switch enabled")
36
+
37
+  if(procs_per_machine == 1) {
38
+    apply_func <- lapply
39
+  } else {
40
+    apply_func <- mclapply
41
+  }
42
+  
43
+  .gsnap_per_machine <- function(i,
44
+                                 procs_per_machine,
45
+                                 gsnap,
46
+                                 gsnap_params,
47
+                                 paired_end,
48
+                                 input_file,
49
+                                 input_file_2,
50
+                                 output_dir,
51
+                                 total_gsnap_parts,
52
+                                 intern,
53
+                                 multifile_out,
54
+                                 record_sys_call_dir) {
55
+
56
+    ##library calls needed so slaves have access
57
+    library("multicore")
58
+    library("HTSeqGenieBase")
59
+    
60
+    gsnap_parts <-
61
+      (i - 1) * procs_per_machine + (seq_len(procs_per_machine) - 1)
62
+    gsnap_part_params <- paste("--part=",
63
+                               gsnap_parts,
64
+                               "/",
65
+                               total_gsnap_parts,
66
+                               sep="")
67
+
68
+    if (paired_end)
69
+      gsnap_params <- paste(gsnap_params, "-a paired")
70
+    
71
+    ##make the prefix of gsnaps 'split-output' outs be unique
72
+    if (multifile_out) {
73
+      gsnap_multi_out_params <- paste("gsnap_out.", gsnap_parts, sep="")
74
+      gsnap_multi_out_params <- paste(output_dir, gsnap_multi_out_params, sep="/")
75
+      sys_commands <- paste(gsnap,
76
+                            gsnap_params,
77
+                            gsnap_part_params,
78
+                            "--split-output", gsnap_multi_out_params,
79
+                            input_file)
80
+    } else {
81
+      stop("Not tested when not using gsnap's --split-output flag")
82
+    }
83
+    
84
+    if (paired_end) {
85
+      sys_commands <- paste(sys_commands, input_file_2)
86
+    }
87
+    
88
+    sys_commands <- paste("nice", sys_commands)
89
+    sys_commands <- paste(sys_commands, "2>&1")
90
+
91
+    if(!is.na(record_sys_call_dir)) {
92
+      if(!(file.exists(record_sys_call_dir)))
93
+        dir.create(record_sys_call_dir)
94
+
95
+      sys_call_file <- file.path(record_sys_call_dir,
96
+                                 paste("aligner.sys_call",
97
+                                       i - 1,
98
+                                       sep="."))
99
+      writeLines(sys_commands,
100
+                 con=sys_call_file)
101
+    }
102
+    
103
+    ##TODO: this awkward bit is about to 
104
+    if (intern) { ##parsing gsnap output. also sending multimaps out through STDOUT
105
+      stop("intern=TRUE is not working yet")
106
+    } else { #do not alter gsnap output  
107
+      results <- apply_func(sys_commands, system, intern=TRUE)
108
+    }
109
+
110
+    ##check results to see if each process finished
111
+    successful <- sapply(seq_len(length(results)),
112
+                         function(index) {
113
+                           returned_lines <- results[[index]]
114
+                           res_len <- length(returned_lines)
115
+                           grepl("^Processed \\d+ queries in [\\d\\.]+ seconds",
116
+                                 returned_lines[res_len],
117
+                                 perl=TRUE)
118
+                         })
119
+    if(!all(successful))
120
+      stop(paste("aligner did not return successfully. System call is "),
121
+           head(sys_commands[successful == FALSE], n=1))
122
+        
123
+    return(as.integer(successful) - 1)
124
+  }
125
+  
126
+  if (test_mode) {
127
+    results <- lapply(seq_len(num_machines),
128
+                      .gsnap_per_machine,
129
+                      procs_per_machine=procs_per_machine,
130
+                      gsnap=gsnap,
131
+                      gsnap_params=gsnap_params,
132
+                      paired_end=paired_end,
133
+                      input_file=input_file,
134
+                      input_file_2=input_file_2,
135
+                      output_dir=output_dir,
136
+                      total_gsnap_parts=num_machines * procs_per_machine,
137
+                      intern=intern,
138
+                      multifile_out=multifile_out,
139
+                      record_sys_call_dir=record_sys_call_dir)
140
+  } else {
141
+    if (num_machines > 1) {
142
+      library("snow")
143
+      cl <- makeCluster(num_machines, "MPI")
144
+      results <- parLapply(cl,
145
+                           seq_len(num_machines),
146
+                           .gsnap_per_machine,
147
+                           procs_per_machine=procs_per_machine,
148
+                           gsnap=gsnap,
149
+                           gsnap_params=gsnap_params,
150
+                           paired_end=paired_end,
151
+                           input_file=input_file,
152
+                           input_file_2=input_file_2,
153
+                           output_dir=output_dir,
154
+                           total_gsnap_parts=num_machines * procs_per_machine,
155
+                           intern=intern,
156
+                           multifile_out=multifile_out,
157
+                           record_sys_call_dir=record_sys_call_dir)
158
+      stopCluster(cl)
159
+    } else {      
160
+      results <- apply_func(seq_len(num_machines),
161
+                            .gsnap_per_machine,
162
+                            procs_per_machine=procs_per_machine,
163
+                            gsnap=gsnap,
164
+                            gsnap_params=gsnap_params,
165
+                            paired_end=paired_end,
166
+                            input_file=input_file,
167
+                            input_file_2=input_file_2,
168
+                            output_dir=output_dir,
169
+                            total_gsnap_parts=num_machines * procs_per_machine,
170
+                            intern=intern,
171
+                            multifile_out=multifile_out,
172
+                            record_sys_call_dir=record_sys_call_dir)
173
+    }
174
+  }  
175
+  
176
+  return(results)
177
+}
0 178
new file mode 100644
... ...
@@ -0,0 +1,82 @@
1
+#Hey Cory,
2
+#
3
+#I think we should be including class 'het' and the SNPs from class 'mixed' as well. The het class is easy, parsing SNPs out of the mixed class is going to be a bit more of a trick...
4
+#
5
+#Jeremiah
6
+#
7
+#On Fri, Nov 5, 2010 at 5:36 PM, Cory Barr <barr.cory@gene.com> wrote:
8
+#
9
+#I’m just keeping class=’single’ and locType=’exact’.  Too exclusive?
10
+
11
+use warnings;
12
+use strict;
13
+
14
+sub revcomp {
15
+    my @nucs = @_;
16
+    @nucs = map(uc, @nucs);
17
+
18
+    for (my $i=0; $i<@nucs; $i++) {
19
+	if ($nucs[$i] eq "A") {
20
+	    $nucs[$i] = "T";
21
+	}
22
+	elsif ($nucs[$i] eq "C") {
23
+	    $nucs[$i] = "G";
24
+	}
25
+	elsif ($nucs[$i] eq "G") {
26
+	    $nucs[$i] = "C";
27
+	}
28
+	elsif ($nucs[$i] eq "T") {
29
+	    $nucs[$i] = "A";
30
+	}
31
+    }
32
+    
33
+    return(@nucs);
34
+}
35
+
36
+while (my $input=<>) {
37
+    chomp($input);
38
+    my @pieces = split("\t", $input);
39
+    my $chr = $pieces[1];
40
+    my $start = $pieces[2] + 1;
41
+    my $end = $pieces[3];
42
+    my $id = $pieces[4];
43
+    my $strand = $pieces[6];
44
+    my $ref = $pieces[8];
45
+    my $seen = $pieces[9];
46
+    my $type = $pieces[11];
47
+
48
+    if ($type eq 'het') { $seen =~ s|\(HETEROZYGOUS\)/?||; }
49
+    elsif ($type ne 'single' && $type ne 'mixed') {
50
+	next;
51
+    }
52
+
53
+    if (not $ref =~ m/^[ACGT]$/) { next; } #cannot be a SNP unless matches this
54
+
55
+    my @seen_nucs = split("/", $seen);
56
+    if ($type eq 'mixed') {
57
+	@seen_nucs = grep(/^[ACGT]$/, @seen_nucs);
58
+    }
59
+
60
+    if ($strand eq "-") {
61
+	@seen_nucs = revcomp(@seen_nucs);
62
+    }
63
+
64
+    @seen_nucs = grep(!/^$ref/, @seen_nucs);
65
+        
66
+#all entries must be in this format: >rs62211261 21:14379270..14379270 CG
67
+    foreach my $sn (@seen_nucs) {
68
+	print 
69
+	    ">" .
70
+	    $id .
71
+	    " " .
72
+	    $chr .
73
+	    ":" .
74
+	    $start .
75
+	    ".." .
76
+	    $end .
77
+	    " " .
78
+	    $ref .
79
+	    $sn .
80
+	    "\n";
81
+    }
82
+}
0 83
new file mode 100644
... ...
@@ -0,0 +1,9 @@
1
+\name{buildGmapDbSNPIndex}
2
+\alias{buildGmapDbSNPIndex}
3
+\title{Download and Integrate dbSNP into GMAP/GSNAP}
4
+\usage{buildGmapDbSNPIndex(version)}
5
+\description{Downloads a version of dbSNP and integrates it into an installation of GMAP/GSNAP for SNP-tolerant alignment}
6
+\details{.. content for \details{} ..}
7
+\value{0}
8
+\author{Cory Barr}
9
+\arguments{\item{version}{version of dbSNP to obtain}}
0 10
new file mode 100644
... ...
@@ -0,0 +1,10 @@
1
+\name{buildGmapIndex}
2
+\alias{buildGmapIndex}
3
+\title{Create Gmap Reference Genome Index}
4
+\usage{buildGmapIndex(genome)}
5
+\description{Gsnap index of hg19 (with haplotypes and unassembled regions excluded)}
6
+\details{a reference genome into gmap/gsnap by downloading a
7
+genome and constructing an IIT. Currently only hg19 is supported.}
8
+\value{0}
9
+\author{Cory Barr}
10
+\arguments{\item{genome}{}}
0 11
new file mode 100644
... ...
@@ -0,0 +1,21 @@
1
+\name{buildGsnapSNPIIT}
2
+\alias{buildGsnapSNPIIT}
3
+\title{Build SNP-tolerant gsnap alignment}
4
+\usage{buildGsnapSNPIIT(index_file, gsnap_bin, save_dir, gsnap_genome_dir,
5
+    snp_file)}
6
+\description{This function helps create the files needed for SNP-tolerant gsnap
7
+alignment.}
8
+\value{0}
9
+\arguments{\item{index_file}{The fasta-like file containing the SNPs}
10
+\item{gsnap_bin}{location of the gsnap executable. Defaults to
11
+location at gsnap:::globals()$gsnap}
12
+\item{save_dir}{location of the default gmap directory. Defaults
13
+to location at gsnap:::globals()$gsnap_save_dir}
14
+\item{gsnap_genome_dir}{name of the genome, which corresponds to
15
+the subdirectory of save_dir. The IIT files go here. Typical
16
+examples are "hg19" and "mm9"}
17
+\item{snp_file}{This argument will specify the name of the
18
+resulting final IIT of SNPs and consequently the name of the
19
+database specified via gsnap's -v option. For example, if snp_file
20
+is db131, then gsnap will align via gsnap -d genome -v db131
21
+input.fastq}}
0 22
new file mode 100644
... ...
@@ -0,0 +1,15 @@
1
+\name{consolidateGmapFiles}
2
+\alias{consolidateGmapFiles}
3
+\title{Consolidate gmap's output files}
4
+\usage{consolidateGmapFiles(sam_file_dir, paired_end=FALSE,
5
+    remove_merged=FALSE)}
6
+\description{Consolidates all pieces from parallelized gsnap output into
7
+appropriately merged files}
8
+\details{If gmap was run in single-end mode, this will consolidate all
9
+parallelized into the 3 output files. If gmap was run in
10
+paired-end mode, consolidates to the 7 output files.}
11
+\value{list of the names of the files created from consolidation}
12
+\author{Cory Barr}
13
+\arguments{\item{sam_file_dir}{directory where gmap's sam file output is stored}
14
+\item{paired_end}{indicated whether gmap was run in paired_end mode}
15
+\item{remove_merged}{remove the individual pieces after they are merged}}
0 16
new file mode 100644
... ...
@@ -0,0 +1,14 @@
1
+\name{consolidateGsnapFiles}
2
+\alias{consolidateGsnapFiles}
3
+\title{Consolidate gsnap's parallelized output files}
4
+\usage{consolidateGsnapFiles(sam_file_dir, remove_merged=FALSE,
5
+    parallelized=TRUE)}
6
+\description{Consolidates all pieces from parallelized gsnap output into
7
+appropriately merged files}
8
+\details{If gsnap was run in single-end mode, this will consolidate all
9
+parallelized into the 3 output files. If gsnap was run in
10
+paired-end mode, consolidates to the 7 output files.}
11
+\value{list of the names of the files created from consolidation}
12
+\author{Cory Barr}
13
+\arguments{\item{sam_file_dir}{directory where gsnap's sam file output is stored}
14
+\item{remove_merged}{remove the individual pieces after they are merged}}
0 15
new file mode 100644
... ...
@@ -0,0 +1,12 @@
1
+\name{getGsnapFileCounts}
2
+\alias{getGsnapFileCounts}
3
+\title{Return number of reads in each of gsnap's output files}
4
+\usage{getGsnapFileCounts(aligner_dir, parallelize=TRUE)}
5
+\description{Return number of reads in each of gsnap's output files}
6
+\details{With the --split-output argument, gsnap can output multiple
7
+files. If these files are BAM files, this function counts the
8
+number of reads in each file.}
9
+\value{named list of read counts per gsnap output file}
10
+\author{Cory Barr}
11
+\arguments{\item{aligner_dir}{Directory containing output bam files from gsnap}
12
+\item{parallelize}{Boolean indicating if counting should be done in parallel}}
0 13
new file mode 100644
... ...
@@ -0,0 +1,43 @@
1
+\name{gmapR-package}
2
+\alias{gmapR-package}
3
+\alias{gmapR}
4
+\docType{package}
5
+\title{
6
+What the package does (short line)
7
+~~ package title ~~
8
+}
9
+\description{
10
+More about what it does (maybe more than one line)
11
+~~ A concise (1-5 lines) description of the package ~~
12
+}
13
+\details{
14
+\tabular{ll}{
15
+Package: \tab gmapR\cr
16
+Type: \tab Package\cr
17
+Version: \tab 1.0\cr
18
+Date: \tab 2011-07-25\cr
19
+License: \tab What license is it under?\cr
20
+LazyLoad: \tab yes\cr
21
+}
22
+~~ An overview of how to use the package, including the most important ~~
23
+~~ functions ~~
24
+}
25
+\author{
26
+Who wrote it
27
+
28
+Maintainer: Who to complain to <yourfault@somewhere.net>
29
+~~ The author and/or maintainer of the package ~~
30
+}
31
+\references{
32
+~~ Literature or other references for background information ~~
33
+}
34
+~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~
35
+~~ the R documentation directory ~~
36
+\keyword{ package }
37
+\seealso{
38
+~~ Optional links to other man pages, e.g. ~~
39
+~~ \code{\link[<pkg>:<pkg>-package]{<pkg>}} ~~
40
+}
41
+\examples{
42
+~~ simple examples of the most important functions ~~
43
+}
0 44
new file mode 100644
... ...
@@ -0,0 +1,8 @@
1
+\name{installGmap}
2
+\alias{installGmap}
3
+\title{Download and Install GMAP}
4
+\usage{installGmap(install_dir)}
5
+\description{Downloads the GMAP source code, builds it, then installs it to a specified directory}
6
+\value{0}
7
+\author{Cory Barr}
8
+\arguments{\item{install_dir}{directory where GMAP will be installed}}
0 9
new file mode 100644
... ...
@@ -0,0 +1,24 @@
1
+\name{parallelized_gsnap}
2
+\alias{parallelized_gsnap}
3
+\title{Parallelized gsnap}
4
+\usage{parallelized_gsnap(num_machines, procs_per_machine, gsnap,
5
+    gsnap_params, input_file, input_file_2, paired_end=F, output_dir,
6
+    test_mode=FALSE, intern=FALSE, multifile_out=TRUE,
7
+    record_sys_call_dir=NA)}
8
+\description{This function parallelizes gsnap across a cluster.}
9
+\value{a list of the return values of the system function for
10
+each parallelized piece}
11
+\author{Cory Barr}
12
+\arguments{\item{num_machines}{number of cluster machines to run gsnap across}
13
+\item{procs_per_machine}{number of gsnap jobs to run on each cluster node}
14
+\item{gsnap}{location of gsnap executable}
15
+\item{gsnap_params}{a string of the parameters to pass to gsnap. Note "--split-output" is not allowed.}
16
+\item{input_file}{location of the input fastq to align}
17
+\item{input_file_2}{location of the second input fastq to align (for paired ends)}
18
+\item{paired_end}{logical indicating if gsnap should be run in paired-end mode}
19
+\item{output_dir}{directory for gsnap to write its output}
20
+\item{test_mode}{used for debugging. turns off parallelization.}
21
+\item{intern}{indicates whether to have STDOUT the return
22
+value. identical to 'intern' argument to the system() function}
23
+\item{multifile_out}{tells gsnap to write to 3 output files for
24
+single-end data, or 7 output files for paired-end data}}