Browse code

Add R_Genome_getSeq for efficiently retrieving sequence from a GMAP genome index. This is super fast.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@74871 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 26/03/2013 21:38:39
Showing 6 changed files

... ...
@@ -2,7 +2,7 @@ SUBDIRS = gmap gstruct
2 2
 
3 3
 .PHONY: all clean $(SUBDIRS)
4 4
 
5
-OBJECTS = bamreader.o bamtally.o iit.o variantsummary.o R_init_gmapR.o
5
+OBJECTS = bamreader.o bamtally.o iit.o variantsummary.o genome.o R_init_gmapR.o
6 6
 
7 7
 R_SRC_DIR = ${CURDIR}
8 8
 PREFIX = ${R_SRC_DIR}/../inst/usr
... ...
@@ -7,11 +7,14 @@
7 7
 static const R_CallMethodDef callMethods[] = {
8 8
 
9 9
   /* bamtally.c */
10
-  CALLMETHOD_DEF(R_Bamtally_iit, 17),
10
+  CALLMETHOD_DEF(R_Bamtally_iit, 18),
11 11
   CALLMETHOD_DEF(R_tally_iit_parse, 4),
12 12
   
13 13
   /* bamreader.c */
14 14
   CALLMETHOD_DEF(R_Bamread_new, 1),
15
+
16
+  /* genome.c */
17
+  CALLMETHOD_DEF(R_Genome_getSeq, 6),
15 18
   
16 19
   {NULL, NULL, 0}
17 20
 };
... ...
@@ -1,17 +1,13 @@
1
-#include <stdlib.h>
2
-#include <string.h>
3
-
4 1
 #include <gstruct/bool.h>
5
-#include <gstruct/datadir.h>
6 2
 #include <gstruct/iit-read.h>
7 3
 #include <gstruct/genome.h>
8
-#include <gstruct/interval.h>
9 4
 #include <gstruct/genomicpos.h>
10 5
 #include <gstruct/bamread.h>
11 6
 #include <gstruct/bamtally.h>
12 7
 
13 8
 #include "gmapR.h"
14 9
 #include "iit.h"
10
+#include "genome.h"
15 11
 
16 12
 SEXP
17 13
 R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
... ...
@@ -20,7 +16,7 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
20 16
                 SEXP minimum_mapq_R, SEXP good_unique_mapq_R,
21 17
                 SEXP maximum_nhits_R,
22 18
                 SEXP need_concordant_p_R, SEXP need_unique_p_R,
23
-                SEXP need_primary_p_R,
19
+                SEXP need_primary_p_R, SEXP ignore_duplicates_p_R,
24 20
                 SEXP min_depth_R, SEXP variant_strands_R,
25 21
                 SEXP ignore_query_Ns_p_R,
26 22
                 SEXP print_indels_p_R,
... ...
@@ -38,30 +34,16 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
38 34
   bool need_concordant_p = asLogical(need_concordant_p_R);
39 35
   bool need_unique_p = asLogical(need_unique_p_R);
40 36
   bool need_primary_p = asLogical(need_primary_p_R);
37
+  bool ignore_duplicates_p = asLogical(ignore_duplicates_p_R);
41 38
   int min_depth = asInteger(min_depth_R);
42 39
   int variant_strands = asInteger(variant_strands_R);
43 40
   bool ignore_query_Ns_p = asLogical(ignore_query_Ns_p_R);
44 41
   bool print_indels_p = asLogical(print_indels_p_R);
45 42
   int blocksize = asInteger(blocksize_R);
46 43
   int verbosep = asLogical(verbosep_R);
47
-  
48
-  char *genomesubdir, *fileroot, *dbversion, *iitfile;
49
-  genomesubdir = Datadir_find_genomesubdir(&fileroot, &dbversion,
50
-                                           (char *)genome_dir, (char *)db);
51
-  
52
-  iitfile = (char *) calloc(strlen(genomesubdir) + strlen("/") +
53
-                            strlen(fileroot) + strlen(".chromosome.iit") + 1,
54
-                            sizeof(char));
55
-  sprintf(iitfile, "%s/%s.chromosome.iit", genomesubdir, fileroot);
56
-  IIT_T chromosome_iit = IIT_read(iitfile, /*name*/NULL, /*readonlyp*/true,
57
-                                  /*divread*/READ_ALL, /*divstring*/NULL,
58
-                                  /*add_iit_p*/false, /*labels_read_p*/true);
59
-  Genome_T genome = Genome_new(genomesubdir, fileroot, /*snps_root*/NULL,
60
-                               /*uncompressedp*/false, /*access*/USE_MMAP_ONLY);
61
-  free(iitfile);
62
-  free(fileroot);
63
-  free(dbversion);
64
-  free(genomesubdir);
44
+
45
+  Genome_T genome = createGenome(genome_dir, db);
46
+  IIT_T chromosome_iit = readChromosomeIIT(genome_dir, db);
65 47
   
66 48
   const char *chr = NULL;
67 49
   Genomicpos_T start = 0;
... ...
@@ -78,6 +60,7 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
78 60
                                  alloclength, minimum_mapq, good_unique_mapq,
79 61
                                  maximum_nhits, need_concordant_p,
80 62
                                  need_unique_p, need_primary_p,
63
+                                 ignore_duplicates_p,
81 64
                                  min_depth, variant_strands, ignore_query_Ns_p,
82 65
                                  print_indels_p, blocksize, verbosep);
83 66
   IIT_free(&chromosome_iit);
84 67
new file mode 100644
... ...
@@ -0,0 +1,116 @@
1
+#include <stdlib.h>
2
+#include <string.h>
3
+
4
+#include <gstruct/bool.h>
5
+#include <gstruct/datadir.h>
6
+#include <gstruct/iit-read.h>
7
+#include <gstruct/genome.h>
8
+#include <gstruct/interval.h>
9
+#include <gstruct/genomicpos.h>
10
+#include <gstruct/complement.h>
11
+
12
+#include "gmapR.h"
13
+#include "genome.h"
14
+
15
+IIT_T
16
+readChromosomeIIT (const char *genome_dir, const char *db) {
17
+  char *genomesubdir, *fileroot, *dbversion, *iitfile;
18
+  genomesubdir = Datadir_find_genomesubdir(&fileroot, &dbversion,
19
+                                           (char *)genome_dir, (char *)db);
20
+  
21
+  iitfile = (char *) calloc(strlen(genomesubdir) + strlen("/") +
22
+                            strlen(fileroot) + strlen(".chromosome.iit") + 1,
23
+                            sizeof(char));
24
+  sprintf(iitfile, "%s/%s.chromosome.iit", genomesubdir, fileroot);
25
+  IIT_T chromosome_iit = IIT_read(iitfile, /*name*/NULL, /*readonlyp*/true,
26
+                                  /*divread*/READ_ALL, /*divstring*/NULL,
27
+                                  /*add_iit_p*/false, /*labels_read_p*/true);
28
+  free(iitfile);
29
+  free(fileroot);
30
+  free(dbversion);
31
+  free(genomesubdir);
32
+
33
+  return chromosome_iit;
34
+}
35
+
36
+Genome_T
37
+createGenome (const char *genome_dir, const char *db) {
38
+  char *genomesubdir, *fileroot, *dbversion;
39
+  genomesubdir = Datadir_find_genomesubdir(&fileroot, &dbversion,
40
+                                           (char *)genome_dir, (char *)db);
41
+  
42
+  Genome_T genome = Genome_new(genomesubdir, fileroot, /*snps_root*/NULL,
43
+                               /*uncompressedp*/false, /*access*/USE_MMAP_ONLY);
44
+  free(fileroot);
45
+  free(dbversion);
46
+  free(genomesubdir);
47
+
48
+  return genome;
49
+}
50
+
51
+static char complCode[128] = COMPLEMENT_LC;
52
+
53
+static void
54
+make_complement_inplace (char *sequence, Genomicpos_T length) {
55
+  char temp;
56
+  int i, j;
57
+
58
+  for (i = 0, j = length-1; i < length/2; i++, j--) {
59
+    temp = complCode[(int) sequence[i]];
60
+    sequence[i] = complCode[(int) sequence[j]];
61
+    sequence[j] = temp;
62
+  }
63
+  if (i == j) {
64
+    sequence[i] = complCode[(int) sequence[i]];
65
+  }
66
+
67
+  return;
68
+}
69
+
70
+SEXP
71
+R_Genome_getSeq (SEXP genome_dir_R, SEXP db_R,
72
+                 SEXP seqnames_R, SEXP start_R, SEXP width_R, SEXP strand_R)
73
+{
74
+  const char *genome_dir =
75
+    genome_dir_R == R_NilValue ? NULL : CHAR(asChar(genome_dir_R));
76
+  const char *db = CHAR(asChar(db_R));
77
+  Genome_T genome = createGenome(genome_dir, db);
78
+  IIT_T chromosome_iit = readChromosomeIIT(genome_dir, db);
79
+  int *start = INTEGER(start_R);
80
+  int *width = INTEGER(width_R);
81
+
82
+  int max_width = 0;
83
+  for (int i = 0; i < length(width_R); i++) {
84
+    if (width[i] > max_width)
85
+      max_width = width[i];
86
+  }
87
+  char *buffer = (char *) R_alloc(sizeof(char), max_width + 1);
88
+
89
+  int seqname_index;
90
+  const char *old_seqname = NULL; // premature optimization
91
+  SEXP result;
92
+  PROTECT(result = allocVector(STRSXP, length(start_R)));
93
+  for (int i = 0; i < length(start_R); i++) {
94
+    const char *seqname = CHAR(STRING_ELT(seqnames_R, i));
95
+    if (old_seqname == NULL || strcmp(seqname, old_seqname)) {
96
+      seqname_index = IIT_find_linear(chromosome_iit, (char *)seqname);
97
+      if (seqname_index < 0) {
98
+        error("Cannot find chromosome %s in genome", seqname);
99
+      }
100
+    }
101
+    old_seqname = seqname;
102
+    int chroffset = Interval_low(IIT_interval(chromosome_iit, seqname_index));
103
+    Genome_fill_buffer_simple(genome, chroffset + start[i] - 1, width[i],
104
+                              buffer);
105
+    if (CHAR(STRING_ELT(strand_R, i))[0] == '-')
106
+      make_complement_inplace(buffer, width[i]);
107
+    SET_STRING_ELT(result, i, mkChar(buffer));
108
+  }
109
+
110
+  IIT_free(&chromosome_iit);
111
+  Genome_free(&genome);
112
+
113
+  UNPROTECT(1);
114
+  return result;
115
+}
116
+
0 117
new file mode 100644
... ...
@@ -0,0 +1,12 @@
1
+#ifndef GENOME_H
2
+#define GENOME_H
3
+
4
+#include <gstruct/iit-read.h>
5
+#include <gstruct/genome.h>
6
+
7
+/* Internal genome helper functions */
8
+
9
+IIT_T readChromosomeIIT (const char *genome_dir, const char *db);
10
+Genome_T createGenome (const char *genome_dir, const char *db);
11
+
12
+#endif
... ...
@@ -15,7 +15,7 @@ R_Bamtally_iit (SEXP bamreader_R, SEXP genome_dir_R, SEXP db_R,
15 15
                 SEXP minimum_mapq_R, SEXP good_unique_mapq_R,
16 16
                 SEXP maximum_nhits_R,
17 17
                 SEXP need_concordant_p_R, SEXP need_unique_p_R,
18
-                SEXP need_primary_p_R,
18
+                SEXP need_primary_p_R, SEXP ignore_duplicates_p_R,
19 19
                 SEXP min_depth_R, SEXP variant_strands_R,
20 20
                 SEXP ignore_query_Ns_p,
21 21
                 SEXP print_indels_p_R,
... ...
@@ -26,4 +26,8 @@ SEXP
26 26
 R_tally_iit_parse(SEXP tally_iit_R, SEXP cycle_breaks_R,
27 27
                   SEXP high_base_quality, SEXP which_R);
28 28
 
29
+SEXP
30
+R_Genome_getSeq (SEXP genome_dir_R, SEXP db_R,
31
+                 SEXP seqnames_R, SEXP start_R, SEXP width_R, SEXP strand_R);
32
+
29 33
 #endif