Browse code

Add R_Genome_getSeq for efficiently retrieving sequence from a GMAP genome index. This is super fast.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/gmapR@74871 bc3139a8-67e5-0310-9ffc-ced21a209358

Michael Lawrence authored on 26/03/2013 21:38:39
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,116 @@
1
+#include <stdlib.h>
2
+#include <string.h>
3
+
4
+#include <gstruct/bool.h>
5
+#include <gstruct/datadir.h>
6
+#include <gstruct/iit-read.h>
7
+#include <gstruct/genome.h>
8
+#include <gstruct/interval.h>
9
+#include <gstruct/genomicpos.h>
10
+#include <gstruct/complement.h>
11
+
12
+#include "gmapR.h"
13
+#include "genome.h"
14
+
15
+IIT_T
16
+readChromosomeIIT (const char *genome_dir, const char *db) {
17
+  char *genomesubdir, *fileroot, *dbversion, *iitfile;
18
+  genomesubdir = Datadir_find_genomesubdir(&fileroot, &dbversion,
19
+                                           (char *)genome_dir, (char *)db);
20
+  
21
+  iitfile = (char *) calloc(strlen(genomesubdir) + strlen("/") +
22
+                            strlen(fileroot) + strlen(".chromosome.iit") + 1,
23
+                            sizeof(char));
24
+  sprintf(iitfile, "%s/%s.chromosome.iit", genomesubdir, fileroot);
25
+  IIT_T chromosome_iit = IIT_read(iitfile, /*name*/NULL, /*readonlyp*/true,
26
+                                  /*divread*/READ_ALL, /*divstring*/NULL,
27
+                                  /*add_iit_p*/false, /*labels_read_p*/true);
28
+  free(iitfile);
29
+  free(fileroot);
30
+  free(dbversion);
31
+  free(genomesubdir);
32
+
33
+  return chromosome_iit;
34
+}
35
+
36
+Genome_T
37
+createGenome (const char *genome_dir, const char *db) {
38
+  char *genomesubdir, *fileroot, *dbversion;
39
+  genomesubdir = Datadir_find_genomesubdir(&fileroot, &dbversion,
40
+                                           (char *)genome_dir, (char *)db);
41
+  
42
+  Genome_T genome = Genome_new(genomesubdir, fileroot, /*snps_root*/NULL,
43
+                               /*uncompressedp*/false, /*access*/USE_MMAP_ONLY);
44
+  free(fileroot);
45
+  free(dbversion);
46
+  free(genomesubdir);
47
+
48
+  return genome;
49
+}
50
+
51
+static char complCode[128] = COMPLEMENT_LC;
52
+
53
+static void
54
+make_complement_inplace (char *sequence, Genomicpos_T length) {
55
+  char temp;
56
+  int i, j;
57
+
58
+  for (i = 0, j = length-1; i < length/2; i++, j--) {
59
+    temp = complCode[(int) sequence[i]];
60
+    sequence[i] = complCode[(int) sequence[j]];
61
+    sequence[j] = temp;
62
+  }
63
+  if (i == j) {
64
+    sequence[i] = complCode[(int) sequence[i]];
65
+  }
66
+
67
+  return;
68
+}
69
+
70
+SEXP
71
+R_Genome_getSeq (SEXP genome_dir_R, SEXP db_R,
72
+                 SEXP seqnames_R, SEXP start_R, SEXP width_R, SEXP strand_R)
73
+{
74
+  const char *genome_dir =
75
+    genome_dir_R == R_NilValue ? NULL : CHAR(asChar(genome_dir_R));
76
+  const char *db = CHAR(asChar(db_R));
77
+  Genome_T genome = createGenome(genome_dir, db);
78
+  IIT_T chromosome_iit = readChromosomeIIT(genome_dir, db);
79
+  int *start = INTEGER(start_R);
80
+  int *width = INTEGER(width_R);
81
+
82
+  int max_width = 0;
83
+  for (int i = 0; i < length(width_R); i++) {
84
+    if (width[i] > max_width)
85
+      max_width = width[i];
86
+  }
87
+  char *buffer = (char *) R_alloc(sizeof(char), max_width + 1);
88
+
89
+  int seqname_index;
90
+  const char *old_seqname = NULL; // premature optimization
91
+  SEXP result;
92
+  PROTECT(result = allocVector(STRSXP, length(start_R)));
93
+  for (int i = 0; i < length(start_R); i++) {
94
+    const char *seqname = CHAR(STRING_ELT(seqnames_R, i));
95
+    if (old_seqname == NULL || strcmp(seqname, old_seqname)) {
96
+      seqname_index = IIT_find_linear(chromosome_iit, (char *)seqname);
97
+      if (seqname_index < 0) {
98
+        error("Cannot find chromosome %s in genome", seqname);
99
+      }
100
+    }
101
+    old_seqname = seqname;
102
+    int chroffset = Interval_low(IIT_interval(chromosome_iit, seqname_index));
103
+    Genome_fill_buffer_simple(genome, chroffset + start[i] - 1, width[i],
104
+                              buffer);
105
+    if (CHAR(STRING_ELT(strand_R, i))[0] == '-')
106
+      make_complement_inplace(buffer, width[i]);
107
+    SET_STRING_ELT(result, i, mkChar(buffer));
108
+  }
109
+
110
+  IIT_free(&chromosome_iit);
111
+  Genome_free(&genome);
112
+
113
+  UNPROTECT(1);
114
+  return result;
115
+}
116
+