1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,217 @@ |
1 |
+#include <ctype.h> |
|
2 |
+#include "bam.h" |
|
3 |
+#include "khash.h" |
|
4 |
+typedef char *str_p; |
|
5 |
+KHASH_MAP_INIT_STR(s, int) |
|
6 |
+KHASH_MAP_INIT_STR(r2l, str_p) |
|
7 |
+ |
|
8 |
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) |
|
9 |
+{ |
|
10 |
+ int ori_len = b->data_len; |
|
11 |
+ b->data_len += 3 + len; |
|
12 |
+ b->l_aux += 3 + len; |
|
13 |
+ if (b->m_data < b->data_len) { |
|
14 |
+ b->m_data = b->data_len; |
|
15 |
+ kroundup32(b->m_data); |
|
16 |
+ b->data = (uint8_t*)realloc(b->data, b->m_data); |
|
17 |
+ } |
|
18 |
+ b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; |
|
19 |
+ b->data[ori_len + 2] = type; |
|
20 |
+ memcpy(b->data + ori_len + 3, data, len); |
|
21 |
+} |
|
22 |
+ |
|
23 |
+uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) |
|
24 |
+{ |
|
25 |
+ return bam_aux_get(b, tag); |
|
26 |
+} |
|
27 |
+ |
|
28 |
+#define __skip_tag(s) do { \ |
|
29 |
+ int type = toupper(*(s)); \ |
|
30 |
+ ++(s); \ |
|
31 |
+ if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ |
|
32 |
+ else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ |
|
33 |
+ else (s) += bam_aux_type2size(type); \ |
|
34 |
+ } while(0) |
|
35 |
+ |
|
36 |
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) |
|
37 |
+{ |
|
38 |
+ uint8_t *s; |
|
39 |
+ int y = tag[0]<<8 | tag[1]; |
|
40 |
+ s = bam1_aux(b); |
|
41 |
+ while (s < b->data + b->data_len) { |
|
42 |
+ int x = (int)s[0]<<8 | s[1]; |
|
43 |
+ s += 2; |
|
44 |
+ if (x == y) return s; |
|
45 |
+ __skip_tag(s); |
|
46 |
+ } |
|
47 |
+ return 0; |
|
48 |
+} |
|
49 |
+// s MUST BE returned by bam_aux_get() |
|
50 |
+int bam_aux_del(bam1_t *b, uint8_t *s) |
|
51 |
+{ |
|
52 |
+ uint8_t *p, *aux; |
|
53 |
+ aux = bam1_aux(b); |
|
54 |
+ p = s - 2; |
|
55 |
+ __skip_tag(s); |
|
56 |
+ memmove(p, s, b->l_aux - (s - aux)); |
|
57 |
+ b->data_len -= s - p; |
|
58 |
+ b->l_aux -= s - p; |
|
59 |
+ return 0; |
|
60 |
+} |
|
61 |
+ |
|
62 |
+int bam_aux_drop_other(bam1_t *b, uint8_t *s) |
|
63 |
+{ |
|
64 |
+ if (s) { |
|
65 |
+ uint8_t *p, *aux; |
|
66 |
+ aux = bam1_aux(b); |
|
67 |
+ p = s - 2; |
|
68 |
+ __skip_tag(s); |
|
69 |
+ memmove(aux, p, s - p); |
|
70 |
+ b->data_len -= b->l_aux - (s - p); |
|
71 |
+ b->l_aux = s - p; |
|
72 |
+ } else { |
|
73 |
+ b->data_len -= b->l_aux; |
|
74 |
+ b->l_aux = 0; |
|
75 |
+ } |
|
76 |
+ return 0; |
|
77 |
+} |
|
78 |
+ |
|
79 |
+void bam_init_header_hash(bam_header_t *header) |
|
80 |
+{ |
|
81 |
+ if (header->hash == 0) { |
|
82 |
+ int ret, i; |
|
83 |
+ khiter_t iter; |
|
84 |
+ khash_t(s) *h; |
|
85 |
+ header->hash = h = kh_init(s); |
|
86 |
+ for (i = 0; i < header->n_targets; ++i) { |
|
87 |
+ iter = kh_put(s, h, header->target_name[i], &ret); |
|
88 |
+ kh_value(h, iter) = i; |
|
89 |
+ } |
|
90 |
+ } |
|
91 |
+} |
|
92 |
+ |
|
93 |
+void bam_destroy_header_hash(bam_header_t *header) |
|
94 |
+{ |
|
95 |
+ if (header->hash) |
|
96 |
+ kh_destroy(s, (khash_t(s)*)header->hash); |
|
97 |
+} |
|
98 |
+ |
|
99 |
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) |
|
100 |
+{ |
|
101 |
+ khint_t k; |
|
102 |
+ khash_t(s) *h = (khash_t(s)*)header->hash; |
|
103 |
+ k = kh_get(s, h, seq_name); |
|
104 |
+ return k == kh_end(h)? -1 : kh_value(h, k); |
|
105 |
+} |
|
106 |
+ |
|
107 |
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) |
|
108 |
+{ |
|
109 |
+ char *s; |
|
110 |
+ int i, l, k, name_end; |
|
111 |
+ khiter_t iter; |
|
112 |
+ khash_t(s) *h; |
|
113 |
+ |
|
114 |
+ bam_init_header_hash(header); |
|
115 |
+ h = (khash_t(s)*)header->hash; |
|
116 |
+ |
|
117 |
+ *ref_id = *beg = *end = -1; |
|
118 |
+ name_end = l = strlen(str); |
|
119 |
+ s = (char*)malloc(l+1); |
|
120 |
+ // remove space |
|
121 |
+ for (i = k = 0; i < l; ++i) |
|
122 |
+ if (!isspace(str[i])) s[k++] = str[i]; |
|
123 |
+ s[k] = 0; l = k; |
|
124 |
+ // determine the sequence name |
|
125 |
+ for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end |
|
126 |
+ if (i >= 0) name_end = i; |
|
127 |
+ if (name_end < l) { // check if this is really the end |
|
128 |
+ int n_hyphen = 0; |
|
129 |
+ for (i = name_end + 1; i < l; ++i) { |
|
130 |
+ if (s[i] == '-') ++n_hyphen; |
|
131 |
+ else if (!isdigit(s[i]) && s[i] != ',') break; |
|
132 |
+ } |
|
133 |
+ if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name |
|
134 |
+ s[name_end] = 0; |
|
135 |
+ iter = kh_get(s, h, s); |
|
136 |
+ if (iter == kh_end(h)) { // cannot find the sequence name |
|
137 |
+ iter = kh_get(s, h, str); // try str as the name |
|
138 |
+ if (iter == kh_end(h)) { |
|
139 |
+ if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__); |
|
140 |
+ free(s); return -1; |
|
141 |
+ } else s[name_end] = ':', name_end = l; |
|
142 |
+ } |
|
143 |
+ } else iter = kh_get(s, h, str); |
|
144 |
+ if (iter == kh_end(h)) { |
|
145 |
+ free(s); |
|
146 |
+ return -1; |
|
147 |
+ } |
|
148 |
+ *ref_id = kh_val(h, iter); |
|
149 |
+ // parse the interval |
|
150 |
+ if (name_end < l) { |
|
151 |
+ for (i = k = name_end + 1; i < l; ++i) |
|
152 |
+ if (s[i] != ',') s[k++] = s[i]; |
|
153 |
+ s[k] = 0; |
|
154 |
+ *beg = atoi(s + name_end + 1); |
|
155 |
+ for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; |
|
156 |
+ *end = i < k? atoi(s + i + 1) : 1<<29; |
|
157 |
+ if (*beg > 0) --*beg; |
|
158 |
+ } else *beg = 0, *end = 1<<29; |
|
159 |
+ free(s); |
|
160 |
+ return *beg <= *end? 0 : -1; |
|
161 |
+} |
|
162 |
+ |
|
163 |
+int32_t bam_aux2i(const uint8_t *s) |
|
164 |
+{ |
|
165 |
+ int type; |
|
166 |
+ if (s == 0) return 0; |
|
167 |
+ type = *s++; |
|
168 |
+ if (type == 'c') return (int32_t)*(int8_t*)s; |
|
169 |
+ else if (type == 'C') return (int32_t)*(uint8_t*)s; |
|
170 |
+ else if (type == 's') return (int32_t)*(int16_t*)s; |
|
171 |
+ else if (type == 'S') return (int32_t)*(uint16_t*)s; |
|
172 |
+ else if (type == 'i' || type == 'I') return *(int32_t*)s; |
|
173 |
+ else return 0; |
|
174 |
+} |
|
175 |
+ |
|
176 |
+float bam_aux2f(const uint8_t *s) |
|
177 |
+{ |
|
178 |
+ int type; |
|
179 |
+ type = *s++; |
|
180 |
+ if (s == 0) return 0.0; |
|
181 |
+ if (type == 'f') return *(float*)s; |
|
182 |
+ else return 0.0; |
|
183 |
+} |
|
184 |
+ |
|
185 |
+double bam_aux2d(const uint8_t *s) |
|
186 |
+{ |
|
187 |
+ int type; |
|
188 |
+ type = *s++; |
|
189 |
+ if (s == 0) return 0.0; |
|
190 |
+ if (type == 'd') return *(double*)s; |
|
191 |
+ else return 0.0; |
|
192 |
+} |
|
193 |
+ |
|
194 |
+char bam_aux2A(const uint8_t *s) |
|
195 |
+{ |
|
196 |
+ int type; |
|
197 |
+ type = *s++; |
|
198 |
+ if (s == 0) return 0; |
|
199 |
+ if (type == 'A') return *(char*)s; |
|
200 |
+ else return 0; |
|
201 |
+} |
|
202 |
+ |
|
203 |
+char *bam_aux2Z(const uint8_t *s) |
|
204 |
+{ |
|
205 |
+ int type; |
|
206 |
+ type = *s++; |
|
207 |
+ if (s == 0) return 0; |
|
208 |
+ if (type == 'Z' || type == 'H') return (char*)s; |
|
209 |
+ else return 0; |
|
210 |
+} |
|
211 |
+ |
|
212 |
+#ifdef _WIN32 |
|
213 |
+double drand48() |
|
214 |
+{ |
|
215 |
+ return (double)rand() / RAND_MAX; |
|
216 |
+} |
|
217 |
+#endif |