Browse code

embed samtools sources directly, since Rsamtools has moved to htslib

Michael Lawrence authored on 11/02/2019 20:33:07
Showing37 changed files

... ...
@@ -113,5 +113,5 @@ src/gstruct/src/libgstruct_1.0_la-ucharlist.lo
113 113
 src/gstruct/src/libgstruct_1.0_la-uintlist.lo
114 114
 src/gstruct/src/libgstruct_1.0_la-uinttable.lo
115 115
 src/gstruct/src/stamp-h1
116
-src/samtools/
117 116
 inst/user
117
+inst/usr
... ...
@@ -9,7 +9,7 @@ Description: GSNAP and GMAP are a pair of tools to align short-read
9 9
         methods to work with GMAP and GSNAP from within R. In addition,
10 10
         it provides methods to tally alignment results on a
11 11
         per-nucleotide basis using the bam_tally tool.
12
-Version: 1.25.2
12
+Version: 1.25.3
13 13
 Depends: R (>= 2.15.0), methods, GenomeInfoDb (>= 1.1.3),
14 14
         GenomicRanges (>= 1.31.8), Rsamtools (>= 1.31.2)
15 15
 Imports: S4Vectors (>= 0.17.25), IRanges (>= 2.13.12), BiocGenerics (>= 0.25.1),
... ...
@@ -13,8 +13,21 @@ GSTRUCT_INCLUDE_DIR = $(INCLUDE_DIR)/gstruct
13 13
 
14 14
 SAMTOOLS_LIB = samtools/libbam.a
15 15
 
16
+PATCH_O = samtools_patch.o
17
+KNETFILE_O = knetfile.o
18
+BAMOBJ_0 = \
19
+  bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o	\
20
+  bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \
21
+  $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o $(PATCH_O)
22
+BAMOBJ=$(BAMOBJ_0:%=samtools/%)
23
+DFLAGS = -D_USE_KNETFILE -D_FILE_OFFSET_BITS=64 \
24
+  -U_FORTIFY_SOURCE -DBGZF_CACHE \
25
+  -Dfprintf=_samtools_fprintf \
26
+  -Dexit=_samtools_exit \
27
+  -Dabort=_samtools_abort
28
+
16 29
 PKG_CPPFLAGS += -I$(INCLUDE_DIR)
17
-PKG_CFLAGS += -g -O3
30
+PKG_CFLAGS += -g -O3 $(DFLAGS)
18 31
 PKG_LIBS += $(GSTRUCT_LIB) $(SAMTOOLS_LIB) -lz
19 32
 
20 33
 SHLIB = gmapR.so
... ...
@@ -28,22 +41,14 @@ $(OBJECTS): $(GSTRUCT_INCLUDE_DIR) $(OBJECTS:%.o=%.c)
28 41
 
29 42
 $(GSTRUCT_LIB) $(GSTRUCT_INCLUDE_DIR): gstruct
30 43
 
31
-RSAMTOOLS_PATH := $(shell R_LIBS=$(R_LIBRARY_DIR) $(R_HOME)/bin/Rscript \
32
-		    --vanilla -e 'cat(system.file(package="Rsamtools"))')
33
-${R_SRC_DIR}/samtools: $(RSAMTOOLS_PATH)
34
-## gmap/gstruct assume samtools headers and libs are in one directory,
35
-## so we need to create one and populate it with links to Rsamtools.
36
-	rm -rf samtools; mkdir samtools
37
-	ln -sf $(RSAMTOOLS_PATH)/usrlib/$(R_ARCH)/libbam.a samtools/libbam.a
38
-	for header in $(RSAMTOOLS_PATH)/include/samtools/*.h; do \
39
-	  ln -sf $$header samtools/$(notdir $(header)); \
40
-	done
44
+$(SAMTOOLS_LIB): $(BAMOBJ)
45
+	$(AR) -crus $@ $(BAMOBJ)
41 46
 
42
-$(SUBDIRS): %: %/Makefile
47
+$(SUBDIRS): %: %/Makefile $(SAMTOOLS_LIB)
43 48
 	cd $@; \
44 49
 	$(MAKE) install
45 50
 
46
-gstruct/Makefile: gstruct/configure ${R_SRC_DIR}/samtools 
51
+gstruct/Makefile: gstruct/configure
47 52
 	cd $(dir $@); \
48 53
         CFLAGS="-g -O3" \
49 54
 	./configure --enable-static --disable-shared \
50 55
new file mode 100644
... ...
@@ -0,0 +1,474 @@
1
+#include <stdio.h>
2
+#include <ctype.h>
3
+#include <errno.h>
4
+#include <assert.h>
5
+#include "bam.h"
6
+#include "bam_endian.h"
7
+#include "kstring.h"
8
+#include "sam_header.h"
9
+
10
+int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0;
11
+char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
12
+
13
+/**************************
14
+ * CIGAR related routines *
15
+ **************************/
16
+
17
+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
18
+{
19
+	int k, end = c->pos;
20
+	for (k = 0; k < c->n_cigar; ++k) {
21
+		int op  = bam_cigar_op(cigar[k]);
22
+		int len = bam_cigar_oplen(cigar[k]);
23
+		if (op == BAM_CBACK) { // move backward
24
+			int l, u, v;
25
+			if (k == c->n_cigar - 1) break; // skip trailing 'B'
26
+			for (l = k - 1, u = v = 0; l >= 0; --l) {
27
+				int op1  = bam_cigar_op(cigar[l]);
28
+				int len1 = bam_cigar_oplen(cigar[l]);
29
+				if (bam_cigar_type(op1)&1) { // consume query
30
+					if (u + len1 >= len) { // stop
31
+						if (bam_cigar_type(op1)&2) v += len - u;
32
+						break;
33
+					} else u += len1;
34
+				}
35
+				if (bam_cigar_type(op1)&2) v += len1;
36
+			}
37
+			end = l < 0? c->pos : end - v;
38
+		} else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]);
39
+	}
40
+	return end;
41
+}
42
+
43
+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
44
+{
45
+	uint32_t k;
46
+	int32_t l = 0;
47
+	for (k = 0; k < c->n_cigar; ++k)
48
+		if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
49
+			l += bam_cigar_oplen(cigar[k]);
50
+	return l;
51
+}
52
+
53
+/********************
54
+ * BAM I/O routines *
55
+ ********************/
56
+
57
+bam_header_t *bam_header_init()
58
+{
59
+	bam_is_be = bam_is_big_endian();
60
+	return (bam_header_t*)calloc(1, sizeof(bam_header_t));
61
+}
62
+
63
+void bam_header_destroy(bam_header_t *header)
64
+{
65
+	int32_t i;
66
+	extern void bam_destroy_header_hash(bam_header_t *header);
67
+	if (header == 0) return;
68
+	if (header->target_name) {
69
+		for (i = 0; i < header->n_targets; ++i)
70
+			free(header->target_name[i]);
71
+		free(header->target_name);
72
+		free(header->target_len);
73
+	}
74
+	free(header->text);
75
+	if (header->dict) sam_header_free(header->dict);
76
+	if (header->rg2lib) sam_tbl_destroy(header->rg2lib);
77
+	bam_destroy_header_hash(header);
78
+	free(header);
79
+}
80
+
81
+bam_header_t *bam_header_read(bamFile fp)
82
+{
83
+	bam_header_t *header;
84
+	char buf[4];
85
+	int magic_len;
86
+	int32_t i = 1, name_len;
87
+	// check EOF
88
+	i = bgzf_check_EOF(fp);
89
+	if (i < 0) {
90
+		// If the file is a pipe, checking the EOF marker will *always* fail
91
+		// with ESPIPE.  Suppress the error message in this case.
92
+		if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF");
93
+	}
94
+	else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n");
95
+	// read "BAM1"
96
+	magic_len = bam_read(fp, buf, 4);
97
+	if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) {
98
+		fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n");
99
+		return 0;
100
+	}
101
+	header = bam_header_init();
102
+	// read plain text and the number of reference sequences
103
+	bam_read(fp, &header->l_text, 4);
104
+	if (bam_is_be) bam_swap_endian_4p(&header->l_text);
105
+	header->text = (char*)calloc(header->l_text + 1, 1);
106
+	bam_read(fp, header->text, header->l_text);
107
+	bam_read(fp, &header->n_targets, 4);
108
+	if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
109
+	// read reference sequence names and lengths
110
+	header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
111
+	header->target_len = (uint32_t*)calloc(header->n_targets, 4);
112
+	for (i = 0; i != header->n_targets; ++i) {
113
+		bam_read(fp, &name_len, 4);
114
+		if (bam_is_be) bam_swap_endian_4p(&name_len);
115
+		header->target_name[i] = (char*)calloc(name_len, 1);
116
+		bam_read(fp, header->target_name[i], name_len);
117
+		bam_read(fp, &header->target_len[i], 4);
118
+		if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
119
+	}
120
+	return header;
121
+}
122
+
123
+int bam_header_write(bamFile fp, const bam_header_t *header)
124
+{
125
+	char buf[4];
126
+	int32_t i, name_len, x;
127
+	// write "BAM1"
128
+	strncpy(buf, "BAM\001", 4);
129
+	bam_write(fp, buf, 4);
130
+	// write plain text and the number of reference sequences
131
+	if (bam_is_be) {
132
+		x = bam_swap_endian_4(header->l_text);
133
+		bam_write(fp, &x, 4);
134
+		if (header->l_text) bam_write(fp, header->text, header->l_text);
135
+		x = bam_swap_endian_4(header->n_targets);
136
+		bam_write(fp, &x, 4);
137
+	} else {
138
+		bam_write(fp, &header->l_text, 4);
139
+		if (header->l_text) bam_write(fp, header->text, header->l_text);
140
+		bam_write(fp, &header->n_targets, 4);
141
+	}
142
+	// write sequence names and lengths
143
+	for (i = 0; i != header->n_targets; ++i) {
144
+		char *p = header->target_name[i];
145
+		name_len = strlen(p) + 1;
146
+		if (bam_is_be) {
147
+			x = bam_swap_endian_4(name_len);
148
+			bam_write(fp, &x, 4);
149
+		} else bam_write(fp, &name_len, 4);
150
+		bam_write(fp, p, name_len);
151
+		if (bam_is_be) {
152
+			x = bam_swap_endian_4(header->target_len[i]);
153
+			bam_write(fp, &x, 4);
154
+		} else bam_write(fp, &header->target_len[i], 4);
155
+	}
156
+	bgzf_flush(fp);
157
+	return 0;
158
+}
159
+
160
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
161
+{
162
+	uint8_t *s;
163
+	uint32_t i, *cigar = (uint32_t*)(data + c->l_qname);
164
+	s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
165
+	for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]);
166
+	while (s < data + data_len) {
167
+		uint8_t type;
168
+		s += 2; // skip key
169
+		type = toupper(*s); ++s; // skip type
170
+		if (type == 'C' || type == 'A') ++s;
171
+		else if (type == 'S') { bam_swap_endian_2p(s); s += 2; }
172
+		else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; }
173
+		else if (type == 'D') { bam_swap_endian_8p(s); s += 8; }
174
+		else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; }
175
+		else if (type == 'B') {
176
+			int32_t n, Bsize = bam_aux_type2size(*s);
177
+			memcpy(&n, s + 1, 4);
178
+			if (1 == Bsize) {
179
+			} else if (2 == Bsize) {
180
+				for (i = 0; i < n; i += 2)
181
+					bam_swap_endian_2p(s + 5 + i);
182
+			} else if (4 == Bsize) {
183
+				for (i = 0; i < n; i += 4)
184
+					bam_swap_endian_4p(s + 5 + i);
185
+			}
186
+			bam_swap_endian_4p(s+1); 
187
+		}
188
+	}
189
+}
190
+
191
+int bam_read1(bamFile fp, bam1_t *b)
192
+{
193
+	bam1_core_t *c = &b->core;
194
+	int32_t block_len, ret, i;
195
+	uint32_t x[8];
196
+
197
+	assert(BAM_CORE_SIZE == 32);
198
+	if ((ret = bam_read(fp, &block_len, 4)) != 4) {
199
+		if (ret == 0) return -1; // normal end-of-file
200
+		else return -2; // truncated
201
+	}
202
+	if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
203
+	if (bam_is_be) {
204
+		bam_swap_endian_4p(&block_len);
205
+		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
206
+	}
207
+	c->tid = x[0]; c->pos = x[1];
208
+	c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
209
+	c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
210
+	c->l_qseq = x[4];
211
+	c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
212
+	b->data_len = block_len - BAM_CORE_SIZE;
213
+	if (b->m_data < b->data_len) {
214
+		b->m_data = b->data_len;
215
+		kroundup32(b->m_data);
216
+		b->data = (uint8_t*)realloc(b->data, b->m_data);
217
+	}
218
+	if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
219
+	b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
220
+	if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
221
+	if (bam_no_B) bam_remove_B(b);
222
+	return 4 + block_len;
223
+}
224
+
225
+inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
226
+{
227
+	uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y;
228
+	int i;
229
+	assert(BAM_CORE_SIZE == 32);
230
+	x[0] = c->tid;
231
+	x[1] = c->pos;
232
+	x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
233
+	x[3] = (uint32_t)c->flag<<16 | c->n_cigar;
234
+	x[4] = c->l_qseq;
235
+	x[5] = c->mtid;
236
+	x[6] = c->mpos;
237
+	x[7] = c->isize;
238
+	bgzf_flush_try(fp, 4 + block_len);
239
+	if (bam_is_be) {
240
+		for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
241
+		y = block_len;
242
+		bam_write(fp, bam_swap_endian_4p(&y), 4);
243
+		swap_endian_data(c, data_len, data);
244
+	} else bam_write(fp, &block_len, 4);
245
+	bam_write(fp, x, BAM_CORE_SIZE);
246
+	bam_write(fp, data, data_len);
247
+	if (bam_is_be) swap_endian_data(c, data_len, data);
248
+	return 4 + block_len;
249
+}
250
+
251
+int bam_write1(bamFile fp, const bam1_t *b)
252
+{
253
+	return bam_write1_core(fp, &b->core, b->data_len, b->data);
254
+}
255
+
256
+char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of)
257
+{
258
+	uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
259
+	int i;
260
+	const bam1_core_t *c = &b->core;
261
+	kstring_t str;
262
+	str.l = str.m = 0; str.s = 0;
263
+
264
+	kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str);
265
+	if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); }
266
+	else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag);
267
+	else { // BAM_OFSTR
268
+		for (i = 0; i < 16; ++i)
269
+			if ((c->flag & 1<<i) && bam_flag2char_table[i])
270
+				kputc(bam_flag2char_table[i], &str);
271
+		kputc('\t', &str);
272
+	}
273
+	if (c->tid < 0) kputsn("*\t", 2, &str);
274
+	else {
275
+		if (header) kputs(header->target_name[c->tid] , &str);
276
+		else kputw(c->tid, &str);
277
+		kputc('\t', &str);
278
+	}
279
+	kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str);
280
+	if (c->n_cigar == 0) kputc('*', &str);
281
+	else {
282
+		uint32_t *cigar = bam1_cigar(b);
283
+		for (i = 0; i < c->n_cigar; ++i) {
284
+			kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
285
+			kputc(bam_cigar_opchr(cigar[i]), &str);
286
+		}
287
+	}
288
+	kputc('\t', &str);
289
+	if (c->mtid < 0) kputsn("*\t", 2, &str);
290
+	else if (c->mtid == c->tid) kputsn("=\t", 2, &str);
291
+	else {
292
+		if (header) kputs(header->target_name[c->mtid], &str);
293
+		else kputw(c->mtid, &str);
294
+		kputc('\t', &str);
295
+	}
296
+	kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str);
297
+	if (c->l_qseq) {
298
+		for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
299
+		kputc('\t', &str);
300
+		if (t[0] == 0xff) kputc('*', &str);
301
+		else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str);
302
+	} else kputsn("*\t*", 3, &str);
303
+	s = bam1_aux(b);
304
+	while (s < b->data + b->data_len) {
305
+		uint8_t type, key[2];
306
+		key[0] = s[0]; key[1] = s[1];
307
+		s += 2; type = *s; ++s;
308
+		kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str);
309
+		if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; }
310
+		else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; }
311
+		else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; }
312
+		else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; }
313
+		else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; }
314
+		else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; }
315
+		else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; }
316
+		else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; }
317
+		else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; }
318
+		else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; }
319
+		else if (type == 'B') {
320
+			uint8_t sub_type = *(s++);
321
+			int32_t n;
322
+			memcpy(&n, s, 4);
323
+			s += 4; // no point to the start of the array
324
+			kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing
325
+			for (i = 0; i < n; ++i) {
326
+				kputc(',', &str);
327
+				if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; }
328
+				else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; }
329
+				else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; }
330
+				else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; }
331
+				else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; }
332
+				else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; }
333
+				else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; }
334
+			}
335
+		}
336
+	}
337
+	return str.s;
338
+}
339
+
340
+char *bam_format1(const bam_header_t *header, const bam1_t *b)
341
+{
342
+	return bam_format1_core(header, b, BAM_OFDEC);
343
+}
344
+
345
+void bam_view1(const bam_header_t *header, const bam1_t *b)
346
+{
347
+	char *s = bam_format1(header, b);
348
+	puts(s);
349
+	free(s);
350
+}
351
+
352
+int bam_validate1(const bam_header_t *header, const bam1_t *b)
353
+{
354
+	char *s;
355
+
356
+	if (b->core.tid < -1 || b->core.mtid < -1) return 0;
357
+	if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0;
358
+
359
+	if (b->data_len < b->core.l_qname) return 0;
360
+	s = memchr(bam1_qname(b), '\0', b->core.l_qname);
361
+	if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0;
362
+
363
+	// FIXME: Other fields could also be checked, especially the auxiliary data
364
+
365
+	return 1;
366
+}
367
+
368
+// FIXME: we should also check the LB tag associated with each alignment
369
+const char *bam_get_library(bam_header_t *h, const bam1_t *b)
370
+{
371
+	const uint8_t *rg;
372
+	if (h->dict == 0) h->dict = sam_header_parse2(h->text);
373
+	if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
374
+	rg = bam_aux_get(b, "RG");
375
+	return (rg == 0)? 0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1));
376
+}
377
+
378
+/************
379
+ * Remove B *
380
+ ************/
381
+
382
+int bam_remove_B(bam1_t *b)
383
+{
384
+	int i, j, end_j, k, l, no_qual;
385
+	uint32_t *cigar, *new_cigar;
386
+	uint8_t *seq, *qual, *p;
387
+	// test if removal is necessary
388
+	if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing
389
+	cigar = bam1_cigar(b);
390
+	for (k = 0; k < b->core.n_cigar; ++k)
391
+		if (bam_cigar_op(cigar[k]) == BAM_CBACK) break;
392
+	if (k == b->core.n_cigar) return 0; // no 'B'
393
+	if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed
394
+	// allocate memory for the new CIGAR
395
+	if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory
396
+		b->m_data = b->data_len + b->core.n_cigar * 4;
397
+		kroundup32(b->m_data);
398
+		b->data = (uint8_t*)realloc(b->data, b->m_data);
399
+		cigar = bam1_cigar(b); // after realloc, cigar may be changed
400
+	}
401
+	new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data
402
+	// the core loop
403
+	seq = bam1_seq(b); qual = bam1_qual(b);
404
+	no_qual = (qual[0] == 0xff); // test whether base quality is available
405
+	i = j = 0; end_j = -1;
406
+	for (k = l = 0; k < b->core.n_cigar; ++k) {
407
+		int op  = bam_cigar_op(cigar[k]);
408
+		int len = bam_cigar_oplen(cigar[k]);
409
+		if (op == BAM_CBACK) { // the backward operation
410
+			int t, u;
411
+			if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR
412
+			if (len > j) goto rmB_err; // an excessively long backward
413
+			for (t = l - 1, u = 0; t >= 0; --t) { // look back
414
+				int op1  = bam_cigar_op(new_cigar[t]);
415
+				int len1 = bam_cigar_oplen(new_cigar[t]);
416
+				if (bam_cigar_type(op1)&1) { // consume the query
417
+					if (u + len1 >= len) { // stop
418
+						new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT;
419
+						break;
420
+					} else u += len1;
421
+				}
422
+			}
423
+			if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation
424
+			l = t + 1;
425
+			end_j = j; j -= len;
426
+		} else { // other CIGAR operations
427
+			new_cigar[l++] = cigar[k];
428
+			if (bam_cigar_type(op)&1) { // consume the query
429
+				if (i != j) { // no need to copy if i == j
430
+					int u, c, c0;
431
+					for (u = 0; u < len; ++u) { // construct the consensus
432
+						c = bam1_seqi(seq, i+u);
433
+						if (j + u < end_j) { // in an overlap
434
+							c0 = bam1_seqi(seq, j+u);
435
+							if (c != c0) { // a mismatch; choose the better base
436
+								if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better
437
+									bam1_seq_seti(seq, j+u, c);
438
+									qual[j+u] = qual[i+u] - qual[j+u];
439
+								} else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality
440
+							} else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u];
441
+						} else { // not in an overlap; copy over
442
+							bam1_seq_seti(seq, j+u, c);
443
+							qual[j+u] = qual[i+u];
444
+						}
445
+					}
446
+				}
447
+				i += len, j += len;
448
+			}
449
+		}
450
+	}
451
+	if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified
452
+	// merge adjacent operations if possible
453
+	for (k = 1; k < l; ++k)
454
+		if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1]))
455
+			new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf;
456
+	// kill zero length operations
457
+	for (k = i = 0; k < l; ++k)
458
+		if (new_cigar[k] >> BAM_CIGAR_SHIFT)
459
+			new_cigar[i++] = new_cigar[k];
460
+	l = i;
461
+	// update b
462
+	memcpy(cigar, new_cigar, l * 4); // set CIGAR
463
+	p = b->data + b->core.l_qname + l * 4;
464
+	memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ
465
+	memmove(p, qual, j); p += j; // set QUAL
466
+	memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields
467
+	b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length
468
+	b->data_len = p - b->data; // update record length
469
+	return 0;
470
+
471
+rmB_err:
472
+	b->core.flag |= BAM_FUNMAP;
473
+	return -1;
474
+}
0 475
new file mode 100644
... ...
@@ -0,0 +1,793 @@
1
+/* The MIT License
2
+
3
+   Copyright (c) 2008-2010 Genome Research Ltd (GRL).
4
+
5
+   Permission is hereby granted, free of charge, to any person obtaining
6
+   a copy of this software and associated documentation files (the
7
+   "Software"), to deal in the Software without restriction, including
8
+   without limitation the rights to use, copy, modify, merge, publish,
9
+   distribute, sublicense, and/or sell copies of the Software, and to
10
+   permit persons to whom the Software is furnished to do so, subject to
11
+   the following conditions:
12
+
13
+   The above copyright notice and this permission notice shall be
14
+   included in all copies or substantial portions of the Software.
15
+
16
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
+   SOFTWARE.
24
+*/
25
+
26
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
27
+
28
+#ifndef BAM_BAM_H
29
+#define BAM_BAM_H
30
+
31
+/*!
32
+  @header
33
+
34
+  BAM library provides I/O and various operations on manipulating files
35
+  in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map)
36
+  format. It now supports importing from or exporting to SAM, sorting,
37
+  merging, generating pileup, and quickly retrieval of reads overlapped
38
+  with a specified region.
39
+
40
+  @copyright Genome Research Ltd.
41
+ */
42
+
43
+#define BAM_VERSION "0.1.19-96b5f2294a"
44
+
45
+#include <stdint.h>
46
+#include <stdlib.h>
47
+#include <string.h>
48
+#include <stdio.h>
49
+
50
+#ifndef BAM_LITE
51
+#define BAM_VIRTUAL_OFFSET16
52
+#include "bgzf.h"
53
+/*! @abstract BAM file handler */
54
+typedef BGZF *bamFile;
55
+#define bam_open(fn, mode) bgzf_open(fn, mode)
56
+#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
57
+#define bam_close(fp) bgzf_close(fp)
58
+#define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
59
+#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
60
+#define bam_tell(fp) bgzf_tell(fp)
61
+#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
62
+#else
63
+#define BAM_TRUE_OFFSET
64
+#include <zlib.h>
65
+typedef gzFile bamFile;
66
+#define bam_open(fn, mode) gzopen(fn, mode)
67
+#define bam_dopen(fd, mode) gzdopen(fd, mode)
68
+#define bam_close(fp) gzclose(fp)
69
+#define bam_read(fp, buf, size) gzread(fp, buf, size)
70
+/* no bam_write/bam_tell/bam_seek() here */
71
+#endif
72
+
73
+/*! @typedef
74
+  @abstract Structure for the alignment header.
75
+  @field n_targets   number of reference sequences
76
+  @field target_name names of the reference sequences
77
+  @field target_len  lengths of the referene sequences
78
+  @field dict        header dictionary
79
+  @field hash        hash table for fast name lookup
80
+  @field rg2lib      hash table for @RG-ID -> LB lookup
81
+  @field l_text      length of the plain text in the header
82
+  @field text        plain text
83
+
84
+  @discussion Field hash points to null by default. It is a private
85
+  member.
86
+ */
87
+typedef struct {
88
+	int32_t n_targets;
89
+	char **target_name;
90
+	uint32_t *target_len;
91
+	void *dict, *hash, *rg2lib;
92
+	uint32_t l_text, n_text;
93
+	char *text;
94
+} bam_header_t;
95
+
96
+/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
97
+#define BAM_FPAIRED        1
98
+/*! @abstract the read is mapped in a proper pair */
99
+#define BAM_FPROPER_PAIR   2
100
+/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
101
+#define BAM_FUNMAP         4
102
+/*! @abstract the mate is unmapped */
103
+#define BAM_FMUNMAP        8
104
+/*! @abstract the read is mapped to the reverse strand */
105
+#define BAM_FREVERSE      16
106
+/*! @abstract the mate is mapped to the reverse strand */
107
+#define BAM_FMREVERSE     32
108
+/*! @abstract this is read1 */
109
+#define BAM_FREAD1        64
110
+/*! @abstract this is read2 */
111
+#define BAM_FREAD2       128
112
+/*! @abstract not primary alignment */
113
+#define BAM_FSECONDARY   256
114
+/*! @abstract QC failure */
115
+#define BAM_FQCFAIL      512
116
+/*! @abstract optical or PCR duplicate */
117
+#define BAM_FDUP        1024
118
+
119
+#define BAM_OFDEC          0
120
+#define BAM_OFHEX          1
121
+#define BAM_OFSTR          2
122
+
123
+/*! @abstract defautl mask for pileup */
124
+#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
125
+
126
+#define BAM_CORE_SIZE   sizeof(bam1_core_t)
127
+
128
+/**
129
+ * Describing how CIGAR operation/length is packed in a 32-bit integer.
130
+ */
131
+#define BAM_CIGAR_SHIFT 4
132
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
133
+
134
+/*
135
+  CIGAR operations.
136
+ */
137
+/*! @abstract CIGAR: M = match or mismatch*/
138
+#define BAM_CMATCH      0
139
+/*! @abstract CIGAR: I = insertion to the reference */
140
+#define BAM_CINS        1
141
+/*! @abstract CIGAR: D = deletion from the reference */
142
+#define BAM_CDEL        2
143
+/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */
144
+#define BAM_CREF_SKIP   3
145
+/*! @abstract CIGAR: S = clip on the read with clipped sequence
146
+  present in qseq */
147
+#define BAM_CSOFT_CLIP  4
148
+/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */
149
+#define BAM_CHARD_CLIP  5
150
+/*! @abstract CIGAR: P = padding */
151
+#define BAM_CPAD        6
152
+/*! @abstract CIGAR: equals = match */
153
+#define BAM_CEQUAL      7
154
+/*! @abstract CIGAR: X = mismatch */
155
+#define BAM_CDIFF       8
156
+#define BAM_CBACK       9
157
+
158
+#define BAM_CIGAR_STR  "MIDNSHP=XB"
159
+#define BAM_CIGAR_TYPE 0x3C1A7
160
+
161
+#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
162
+#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT)
163
+#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)])
164
+#define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o))
165
+#define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
166
+
167
+/*! @typedef
168
+  @abstract Structure for core alignment information.
169
+  @field  tid     chromosome ID, defined by bam_header_t
170
+  @field  pos     0-based leftmost coordinate
171
+  @field  bin     bin calculated by bam_reg2bin()
172
+  @field  qual    mapping quality
173
+  @field  l_qname length of the query name
174
+  @field  flag    bitwise flag
175
+  @field  n_cigar number of CIGAR operations
176
+  @field  l_qseq  length of the query sequence (read)
177
+ */
178
+typedef struct {
179
+	int32_t tid;
180
+	int32_t pos;
181
+	uint32_t bin:16, qual:8, l_qname:8;
182
+	uint32_t flag:16, n_cigar:16;
183
+	int32_t l_qseq;
184
+	int32_t mtid;
185
+	int32_t mpos;
186
+	int32_t isize;
187
+} bam1_core_t;
188
+
189
+/*! @typedef
190
+  @abstract Structure for one alignment.
191
+  @field  core       core information about the alignment
192
+  @field  l_aux      length of auxiliary data
193
+  @field  data_len   current length of bam1_t::data
194
+  @field  m_data     maximum length of bam1_t::data
195
+  @field  data       all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
196
+
197
+  @discussion Notes:
198
+ 
199
+   1. qname is zero tailing and core.l_qname includes the tailing '\0'.
200
+   2. l_qseq is calculated from the total length of an alignment block
201
+      on reading or from CIGAR.
202
+   3. cigar data is encoded 4 bytes per CIGAR operation.
203
+   4. seq is nybble-encoded according to bam_nt16_table.
204
+ */
205
+typedef struct {
206
+	bam1_core_t core;
207
+	int l_aux, data_len, m_data;
208
+	uint8_t *data;
209
+} bam1_t;
210
+
211
+typedef struct __bam_iter_t *bam_iter_t;
212
+
213
+#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
214
+#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
215
+
216
+/*! @function
217
+  @abstract  Get the CIGAR array
218
+  @param  b  pointer to an alignment
219
+  @return    pointer to the CIGAR array
220
+
221
+  @discussion In the CIGAR array, each element is a 32-bit integer. The
222
+  lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
223
+  length of a CIGAR.
224
+ */
225
+#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
226
+
227
+/*! @function
228
+  @abstract  Get the name of the query
229
+  @param  b  pointer to an alignment
230
+  @return    pointer to the name string, null terminated
231
+ */
232
+#define bam1_qname(b) ((char*)((b)->data))
233
+
234
+/*! @function
235
+  @abstract  Get query sequence
236
+  @param  b  pointer to an alignment
237
+  @return    pointer to sequence
238
+
239
+  @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
240
+  8 for T and 15 for N. Two bases are packed in one byte with the base
241
+  at the higher 4 bits having smaller coordinate on the read. It is
242
+  recommended to use bam1_seqi() macro to get the base.
243
+ */
244
+#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
245
+
246
+/*! @function
247
+  @abstract  Get query quality
248
+  @param  b  pointer to an alignment
249
+  @return    pointer to quality string
250
+ */
251
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
252
+
253
+/*! @function
254
+  @abstract  Get a base on read
255
+  @param  s  Query sequence returned by bam1_seq()
256
+  @param  i  The i-th position, 0-based
257
+  @return    4-bit integer representing the base.
258
+ */
259
+//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
260
+#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)
261
+
262
+#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) )
263
+
264
+/*! @function
265
+  @abstract  Get query sequence and quality
266
+  @param  b  pointer to an alignment
267
+  @return    pointer to the concatenated auxiliary data
268
+ */
269
+#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
270
+
271
+#ifndef kroundup32
272
+/*! @function
273
+  @abstract  Round an integer to the next closest power-2 integer.
274
+  @param  x  integer to be rounded (in place)
275
+  @discussion x will be modified.
276
+ */
277
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
278
+#endif
279
+
280
+/*!
281
+  @abstract Whether the machine is big-endian; modified only in
282
+  bam_header_init().
283
+ */
284
+extern int bam_is_be;
285
+
286
+/*!
287
+  @abstract Verbose level between 0 and 3; 0 is supposed to disable all
288
+  debugging information, though this may not have been implemented.
289
+ */
290
+extern int bam_verbose;
291
+
292
+extern int bam_no_B;
293
+
294
+/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
295
+extern unsigned char bam_nt16_table[256];
296
+
297
+/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
298
+extern char *bam_nt16_rev_table;
299
+
300
+extern char bam_nt16_nt4_table[];
301
+
302
+#ifdef __cplusplus
303
+extern "C" {
304
+#endif
305
+
306
+	/*********************
307
+	 * Low-level SAM I/O *
308
+	 *********************/
309
+
310
+	/*! @abstract TAM file handler */
311
+	typedef struct __tamFile_t *tamFile;
312
+
313
+	/*!
314
+	  @abstract   Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
315
+	  @param  fn  SAM file name
316
+	  @return     SAM file handler
317
+	 */
318
+	tamFile sam_open(const char *fn);
319
+
320
+	/*!
321
+	  @abstract   Close a SAM file handler
322
+	  @param  fp  SAM file handler
323
+	 */
324
+	void sam_close(tamFile fp);
325
+
326
+	/*!
327
+	  @abstract      Read one alignment from a SAM file handler
328
+	  @param  fp     SAM file handler
329
+	  @param  header header information (ordered names of chromosomes)
330
+	  @param  b      read alignment; all members in b will be updated
331
+	  @return        0 if successful; otherwise negative
332
+	 */
333
+	int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
334
+
335
+	/*!
336
+	  @abstract       Read header information from a TAB-delimited list file.
337
+	  @param  fn_list file name for the list
338
+	  @return         a pointer to the header structure
339
+
340
+	  @discussion Each line in this file consists of chromosome name and
341
+	  the length of chromosome.
342
+	 */
343
+	bam_header_t *sam_header_read2(const char *fn_list);
344
+
345
+	/*!
346
+	  @abstract       Read header from a SAM file (if present)
347
+	  @param  fp      SAM file handler
348
+	  @return         pointer to header struct; 0 if no @SQ lines available
349
+	 */
350
+	bam_header_t *sam_header_read(tamFile fp);
351
+
352
+	/*!
353
+	  @abstract       Parse @SQ lines a update a header struct
354
+	  @param  h       pointer to the header struct to be updated
355
+	  @return         number of target sequences
356
+
357
+	  @discussion bam_header_t::{n_targets,target_len,target_name} will
358
+	  be destroyed in the first place.
359
+	 */
360
+	int sam_header_parse(bam_header_t *h);
361
+	int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
362
+
363
+	/*!
364
+	  @abstract       Parse @RG lines a update a header struct
365
+	  @param  h       pointer to the header struct to be updated
366
+	  @return         number of @RG lines
367
+
368
+	  @discussion bam_header_t::rg2lib will be destroyed in the first
369
+	  place.
370
+	 */
371
+	int sam_header_parse_rg(bam_header_t *h);
372
+
373
+#define sam_write1(header, b) bam_view1(header, b)
374
+
375
+
376
+	/********************************
377
+	 * APIs for string dictionaries *
378
+	 ********************************/
379
+
380
+	int bam_strmap_put(void *strmap, const char *rg, const char *lib);
381
+	const char *bam_strmap_get(const void *strmap, const char *rg);
382
+	void *bam_strmap_dup(const void*);
383
+	void *bam_strmap_init();
384
+	void bam_strmap_destroy(void *strmap);
385
+
386
+
387
+	/*********************
388
+	 * Low-level BAM I/O *
389
+	 *********************/
390
+
391
+	/*!
392
+	  @abstract Initialize a header structure.
393
+	  @return   the pointer to the header structure
394
+
395
+	  @discussion This function also modifies the global variable
396
+	  bam_is_be.
397
+	 */
398
+	bam_header_t *bam_header_init();
399
+
400
+	/*!
401
+	  @abstract        Destroy a header structure.
402
+	  @param  header  pointer to the header
403
+	 */
404
+	void bam_header_destroy(bam_header_t *header);
405
+
406
+	/*!
407
+	  @abstract   Read a header structure from BAM.
408
+	  @param  fp  BAM file handler, opened by bam_open()
409
+	  @return     pointer to the header structure
410
+
411
+	  @discussion The file position indicator must be placed at the
412
+	  beginning of the file. Upon success, the position indicator will
413
+	  be set at the start of the first alignment.
414
+	 */
415
+	bam_header_t *bam_header_read(bamFile fp);
416
+
417
+	/*!
418
+	  @abstract      Write a header structure to BAM.
419
+	  @param  fp     BAM file handler
420
+	  @param  header pointer to the header structure
421
+	  @return        always 0 currently
422
+	 */
423
+	int bam_header_write(bamFile fp, const bam_header_t *header);
424
+
425
+	/*!
426
+	  @abstract   Read an alignment from BAM.
427
+	  @param  fp  BAM file handler
428
+	  @param  b   read alignment; all members are updated.
429
+	  @return     number of bytes read from the file
430
+
431
+	  @discussion The file position indicator must be
432
+	  placed right before an alignment. Upon success, this function
433
+	  will set the position indicator to the start of the next
434
+	  alignment. This function is not affected by the machine
435
+	  endianness.
436
+	 */
437
+	int bam_read1(bamFile fp, bam1_t *b);
438
+
439
+	int bam_remove_B(bam1_t *b);
440
+
441
+	/*!
442
+	  @abstract Write an alignment to BAM.
443
+	  @param  fp       BAM file handler
444
+	  @param  c        pointer to the bam1_core_t structure
445
+	  @param  data_len total length of variable size data related to
446
+	                   the alignment
447
+	  @param  data     pointer to the concatenated data
448
+	  @return          number of bytes written to the file
449
+
450
+	  @discussion This function is not affected by the machine
451
+	  endianness.
452
+	 */
453
+	int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
454
+
455
+	/*!
456
+	  @abstract   Write an alignment to BAM.
457
+	  @param  fp  BAM file handler
458
+	  @param  b   alignment to write
459
+	  @return     number of bytes written to the file
460
+
461
+	  @abstract It is equivalent to:
462
+	    bam_write1_core(fp, &b->core, b->data_len, b->data)
463
+	 */
464
+	int bam_write1(bamFile fp, const bam1_t *b);
465
+
466
+	/*! @function
467
+	  @abstract  Initiate a pointer to bam1_t struct
468
+	 */
469
+#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
470
+
471
+	/*! @function
472
+	  @abstract  Free the memory allocated for an alignment.
473
+	  @param  b  pointer to an alignment
474
+	 */
475
+#define bam_destroy1(b) do {					\
476
+		if (b) { free((b)->data); free(b); }	\
477
+	} while (0)
478
+
479
+	/*!
480
+	  @abstract       Format a BAM record in the SAM format
481
+	  @param  header  pointer to the header structure
482
+	  @param  b       alignment to print
483
+	  @return         a pointer to the SAM string
484
+	 */
485
+	char *bam_format1(const bam_header_t *header, const bam1_t *b);
486
+
487
+	char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
488
+
489
+	/*!
490
+	  @abstract       Check whether a BAM record is plausibly valid
491
+	  @param  header  associated header structure, or NULL if unavailable
492
+	  @param  b       alignment to validate
493
+	  @return         0 if the alignment is invalid; non-zero otherwise
494
+
495
+	  @discussion  Simple consistency check of some of the fields of the
496
+	  alignment record.  If the header is provided, several additional checks
497
+	  are made.  Not all fields are checked, so a non-zero result is not a
498
+	  guarantee that the record is valid.  However it is usually good enough
499
+	  to detect when bam_seek() has been called with a virtual file offset
500
+	  that is not the offset of an alignment record.
501
+	 */
502
+	int bam_validate1(const bam_header_t *header, const bam1_t *b);
503
+
504
+	const char *bam_get_library(bam_header_t *header, const bam1_t *b);
505
+
506
+
507
+	/***************
508
+	 * pileup APIs *
509
+	 ***************/
510
+
511
+	/*! @typedef
512
+	  @abstract Structure for one alignment covering the pileup position.
513
+	  @field  b      pointer to the alignment
514
+	  @field  qpos   position of the read base at the pileup site, 0-based
515
+	  @field  indel  indel length; 0 for no indel, positive for ins and negative for del
516
+	  @field  is_del 1 iff the base on the padded read is a deletion
517
+	  @field  level  the level of the read in the "viewer" mode
518
+
519
+	  @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
520
+	  difference between the two functions is that the former does not
521
+	  set bam_pileup1_t::level, while the later does. Level helps the
522
+	  implementation of alignment viewers, but calculating this has some
523
+	  overhead.
524
+	 */
525
+	typedef struct {
526
+		bam1_t *b;
527
+		int32_t qpos;
528
+		int indel, level;
529
+		uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28;
530
+	} bam_pileup1_t;
531
+
532
+	typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
533
+
534
+	struct __bam_plp_t;
535
+	typedef struct __bam_plp_t *bam_plp_t;
536
+
537
+	bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
538
+	int bam_plp_push(bam_plp_t iter, const bam1_t *b);
539
+	const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
540
+	const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
541
+	void bam_plp_set_mask(bam_plp_t iter, int mask);
542
+	void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
543
+	void bam_plp_reset(bam_plp_t iter);
544
+	void bam_plp_destroy(bam_plp_t iter);
545
+
546
+	struct __bam_mplp_t;
547
+	typedef struct __bam_mplp_t *bam_mplp_t;
548
+
549
+	bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
550
+	void bam_mplp_destroy(bam_mplp_t iter);
551
+	void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
552
+	int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
553
+
554
+	/*! @typedef
555
+	  @abstract    Type of function to be called by bam_plbuf_push().
556
+	  @param  tid  chromosome ID as is defined in the header
557
+	  @param  pos  start coordinate of the alignment, 0-based
558
+	  @param  n    number of elements in pl array
559
+	  @param  pl   array of alignments
560
+	  @param  data user provided data
561
+	  @discussion  See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
562
+	 */
563
+	typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
564
+
565
+	typedef struct {
566
+		bam_plp_t iter;
567
+		bam_pileup_f func;
568
+		void *data;
569
+	} bam_plbuf_t;
570
+
571
+	void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
572
+	void bam_plbuf_reset(bam_plbuf_t *buf);
573
+	bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
574
+	void bam_plbuf_destroy(bam_plbuf_t *buf);
575
+	int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
576
+
577
+	int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
578
+
579
+	struct __bam_lplbuf_t;
580
+	typedef struct __bam_lplbuf_t bam_lplbuf_t;
581
+
582
+	void bam_lplbuf_reset(bam_lplbuf_t *buf);
583
+
584
+	/*! @abstract  bam_plbuf_init() equivalent with level calculated. */
585
+	bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
586
+
587
+	/*! @abstract  bam_plbuf_destroy() equivalent with level calculated. */
588
+	void bam_lplbuf_destroy(bam_lplbuf_t *tv);
589
+
590
+	/*! @abstract  bam_plbuf_push() equivalent with level calculated. */
591
+	int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
592
+
593
+
594
+	/*********************
595
+	 * BAM indexing APIs *
596
+	 *********************/
597
+
598
+	struct __bam_index_t;
599
+	typedef struct __bam_index_t bam_index_t;
600
+
601
+	/*!
602
+	  @abstract   Build index for a BAM file.
603
+	  @discussion Index file "fn.bai" will be created.
604
+	  @param  fn  name of the BAM file
605
+	  @return     always 0 currently
606
+	 */
607
+	int bam_index_build(const char *fn);
608
+
609
+	/*!
610
+	  @abstract   Load index from file "fn.bai".
611
+	  @param  fn  name of the BAM file (NOT the index file)
612
+	  @return     pointer to the index structure
613
+	 */
614
+	bam_index_t *bam_index_load(const char *fn);
615
+
616
+	/*!
617
+	  @abstract    Destroy an index structure.
618
+	  @param  idx  pointer to the index structure
619
+	 */
620
+	void bam_index_destroy(bam_index_t *idx);
621
+
622
+	/*! @typedef
623
+	  @abstract      Type of function to be called by bam_fetch().
624
+	  @param  b     the alignment
625
+	  @param  data  user provided data
626
+	 */
627
+	typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
628
+
629
+	/*!
630
+	  @abstract Retrieve the alignments that are overlapped with the
631
+	  specified region.
632
+
633
+	  @discussion A user defined function will be called for each
634
+	  retrieved alignment ordered by its start position.
635
+
636
+	  @param  fp    BAM file handler
637
+	  @param  idx   pointer to the alignment index
638
+	  @param  tid   chromosome ID as is defined in the header
639
+	  @param  beg   start coordinate, 0-based
640
+	  @param  end   end coordinate, 0-based
641
+	  @param  data  user provided data (will be transferred to func)
642
+	  @param  func  user defined function
643
+	 */
644
+	int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
645
+
646
+	bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end);
647
+	int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);
648
+	void bam_iter_destroy(bam_iter_t iter);
649
+
650
+	/*!
651
+	  @abstract       Parse a region in the format: "chr2:100,000-200,000".
652
+	  @discussion     bam_header_t::hash will be initialized if empty.
653
+	  @param  header  pointer to the header structure
654
+	  @param  str     string to be parsed
655
+	  @param  ref_id  the returned chromosome ID
656
+	  @param  begin   the returned start coordinate
657
+	  @param  end     the returned end coordinate
658
+	  @return         0 on success; -1 on failure
659
+	 */
660
+	int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
661
+
662
+
663
+	/**************************
664
+	 * APIs for optional tags *
665
+	 **************************/
666
+
667
+	/*!
668
+	  @abstract       Retrieve data of a tag
669
+	  @param  b       pointer to an alignment struct
670
+	  @param  tag     two-character tag to be retrieved
671
+
672
+	  @return  pointer to the type and data. The first character is the
673
+	  type that can be 'iIsScCdfAZH'.
674
+
675
+	  @discussion  Use bam_aux2?() series to convert the returned data to
676
+	  the corresponding type.
677
+	*/
678
+	uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
679
+
680
+	int32_t bam_aux2i(const uint8_t *s);
681
+	float bam_aux2f(const uint8_t *s);
682
+	double bam_aux2d(const uint8_t *s);
683
+	char bam_aux2A(const uint8_t *s);
684
+	char *bam_aux2Z(const uint8_t *s);
685
+
686
+	int bam_aux_del(bam1_t *b, uint8_t *s);
687
+	void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
688
+	uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
689
+
690
+
691
+	/*****************
692
+	 * Miscellaneous *
693
+	 *****************/
694
+
695
+	/*!  
696
+	  @abstract Calculate the rightmost coordinate of an alignment on the
697
+	  reference genome.
698
+
699
+	  @param  c      pointer to the bam1_core_t structure
700
+	  @param  cigar  the corresponding CIGAR array (from bam1_t::cigar)
701
+	  @return        the rightmost coordinate, 0-based
702
+	*/
703
+	uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
704
+
705
+	/*!
706
+	  @abstract      Calculate the length of the query sequence from CIGAR.
707
+	  @param  c      pointer to the bam1_core_t structure
708
+	  @param  cigar  the corresponding CIGAR array (from bam1_t::cigar)
709
+	  @return        length of the query sequence
710
+	*/
711
+	int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
712
+
713
+#ifdef __cplusplus
714
+}
715
+#endif
716
+
717
+/*!
718
+  @abstract    Calculate the minimum bin that contains a region [beg,end).
719
+  @param  beg  start of the region, 0-based
720
+  @param  end  end of the region, 0-based
721
+  @return      bin
722
+ */
723
+static inline int bam_reg2bin(uint32_t beg, uint32_t end)
724
+{
725
+	--end;
726
+	if (beg>>14 == end>>14) return 4681 + (beg>>14);
727
+	if (beg>>17 == end>>17) return  585 + (beg>>17);
728
+	if (beg>>20 == end>>20) return   73 + (beg>>20);
729
+	if (beg>>23 == end>>23) return    9 + (beg>>23);
730
+	if (beg>>26 == end>>26) return    1 + (beg>>26);
731
+	return 0;
732
+}
733
+
734
+/*!
735
+  @abstract     Copy an alignment
736
+  @param  bdst  destination alignment struct
737
+  @param  bsrc  source alignment struct
738
+  @return       pointer to the destination alignment struct
739
+ */
740
+static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
741
+{
742
+	uint8_t *data = bdst->data;
743
+	int m_data = bdst->m_data;   // backup data and m_data
744
+	if (m_data < bsrc->data_len) { // double the capacity
745
+		m_data = bsrc->data_len; kroundup32(m_data);
746
+		data = (uint8_t*)realloc(data, m_data);
747
+	}
748
+	memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
749
+	*bdst = *bsrc; // copy the rest
750
+	// restore the backup
751
+	bdst->m_data = m_data;
752
+	bdst->data = data;
753
+	return bdst;
754
+}
755
+
756
+/*!
757
+  @abstract     Duplicate an alignment
758
+  @param  src   source alignment struct
759
+  @return       pointer to the destination alignment struct
760
+ */
761
+static inline bam1_t *bam_dup1(const bam1_t *src)
762
+{
763
+	bam1_t *b;
764
+	b = bam_init1();
765
+	*b = *src;
766
+	b->m_data = b->data_len;
767
+	b->data = (uint8_t*)calloc(b->data_len, 1);
768
+	memcpy(b->data, src->data, b->data_len);
769
+	return b;
770
+}
771
+
772
+static inline int bam_aux_type2size(int x)
773
+{
774
+	if (x == 'C' || x == 'c' || x == 'A') return 1;
775
+	else if (x == 'S' || x == 's') return 2;
776
+	else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4;
777
+	else return 0;
778
+}
779
+
780
+/*********************************
781
+ *** Compatibility with htslib ***
782
+ *********************************/
783
+
784
+typedef bam_header_t bam_hdr_t;
785
+
786
+#define bam_get_qname(b) bam1_qname(b)
787
+#define bam_get_cigar(b) bam1_cigar(b)
788
+
789
+#define bam_hdr_read(fp) bam_header_read(fp)
790
+#define bam_hdr_write(fp, h) bam_header_write(fp, h)
791
+#define bam_hdr_destroy(fp) bam_header_destroy(fp)
792
+
793
+#endif
0 794
new file mode 100644
... ...
@@ -0,0 +1,217 @@
1
+#include <ctype.h>
2
+#include "bam.h"
3
+#include "khash.h"
4
+typedef char *str_p;
5
+KHASH_MAP_INIT_STR(s, int)
6
+KHASH_MAP_INIT_STR(r2l, str_p)
7
+
8
+void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
9
+{
10
+	int ori_len = b->data_len;
11
+	b->data_len += 3 + len;
12
+	b->l_aux += 3 + len;
13
+	if (b->m_data < b->data_len) {
14
+		b->m_data = b->data_len;
15
+		kroundup32(b->m_data);
16
+		b->data = (uint8_t*)realloc(b->data, b->m_data);
17
+	}
18
+	b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
19
+	b->data[ori_len + 2] = type;
20
+	memcpy(b->data + ori_len + 3, data, len);
21
+}
22
+
23
+uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
24
+{
25
+	return bam_aux_get(b, tag);
26
+}
27
+
28
+#define __skip_tag(s) do { \
29
+		int type = toupper(*(s)); \
30
+		++(s); \
31
+		if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \
32
+		else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \
33
+		else (s) += bam_aux_type2size(type); \
34
+	} while(0)
35
+
36
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
37
+{
38
+	uint8_t *s;
39
+	int y = tag[0]<<8 | tag[1];
40
+	s = bam1_aux(b);
41
+	while (s < b->data + b->data_len) {
42
+		int x = (int)s[0]<<8 | s[1];
43
+		s += 2;
44
+		if (x == y) return s;
45
+		__skip_tag(s);
46
+	}
47
+	return 0;
48
+}
49
+// s MUST BE returned by bam_aux_get()
50
+int bam_aux_del(bam1_t *b, uint8_t *s)
51
+{
52
+	uint8_t *p, *aux;
53
+	aux = bam1_aux(b);
54
+	p = s - 2;
55
+	__skip_tag(s);
56
+	memmove(p, s, b->l_aux - (s - aux));
57
+	b->data_len -= s - p;
58
+	b->l_aux -= s - p;
59
+	return 0;
60
+}
61
+
62
+int bam_aux_drop_other(bam1_t *b, uint8_t *s)
63
+{
64
+	if (s) {
65
+		uint8_t *p, *aux;
66
+		aux = bam1_aux(b);
67
+		p = s - 2;
68
+		__skip_tag(s);
69
+		memmove(aux, p, s - p);
70
+		b->data_len -= b->l_aux - (s - p);
71
+		b->l_aux = s - p;
72
+	} else {
73
+		b->data_len -= b->l_aux;
74
+		b->l_aux = 0;
75
+	}
76
+	return 0;
77
+}
78
+
79
+void bam_init_header_hash(bam_header_t *header)
80
+{
81
+	if (header->hash == 0) {
82
+		int ret, i;
83
+		khiter_t iter;
84
+		khash_t(s) *h;
85
+		header->hash = h = kh_init(s);
86
+		for (i = 0; i < header->n_targets; ++i) {
87
+			iter = kh_put(s, h, header->target_name[i], &ret);
88
+			kh_value(h, iter) = i;
89
+		}
90
+	}
91
+}
92
+
93
+void bam_destroy_header_hash(bam_header_t *header)
94
+{
95
+	if (header->hash)
96
+		kh_destroy(s, (khash_t(s)*)header->hash);
97
+}
98
+
99
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
100
+{
101
+	khint_t k;
102
+	khash_t(s) *h = (khash_t(s)*)header->hash;
103
+	k = kh_get(s, h, seq_name);
104
+	return k == kh_end(h)? -1 : kh_value(h, k);
105
+}
106
+
107
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
108
+{
109
+	char *s;
110
+	int i, l, k, name_end;
111
+	khiter_t iter;
112
+	khash_t(s) *h;
113
+
114
+	bam_init_header_hash(header);
115
+	h = (khash_t(s)*)header->hash;
116
+
117
+	*ref_id = *beg = *end = -1;
118
+	name_end = l = strlen(str);
119
+	s = (char*)malloc(l+1);
120
+	// remove space
121
+	for (i = k = 0; i < l; ++i)
122
+		if (!isspace(str[i])) s[k++] = str[i];
123
+	s[k] = 0; l = k;
124
+	// determine the sequence name
125
+	for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
126
+	if (i >= 0) name_end = i;
127
+	if (name_end < l) { // check if this is really the end
128
+		int n_hyphen = 0;
129
+		for (i = name_end + 1; i < l; ++i) {
130
+			if (s[i] == '-') ++n_hyphen;
131
+			else if (!isdigit(s[i]) && s[i] != ',') break;
132
+		}
133
+		if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
134
+		s[name_end] = 0;
135
+		iter = kh_get(s, h, s);
136
+		if (iter == kh_end(h)) { // cannot find the sequence name
137
+			iter = kh_get(s, h, str); // try str as the name
138
+			if (iter == kh_end(h)) {
139
+				if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
140
+				free(s); return -1;
141
+			} else s[name_end] = ':', name_end = l;
142
+		}
143
+	} else iter = kh_get(s, h, str);
144
+        if (iter == kh_end(h)) {
145
+          free(s); 
146
+          return -1;
147
+        }
148
+	*ref_id = kh_val(h, iter);
149
+	// parse the interval
150
+	if (name_end < l) {
151
+		for (i = k = name_end + 1; i < l; ++i)
152
+			if (s[i] != ',') s[k++] = s[i];
153
+		s[k] = 0;
154
+		*beg = atoi(s + name_end + 1);
155
+		for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
156
+		*end = i < k? atoi(s + i + 1) : 1<<29;
157
+		if (*beg > 0) --*beg;
158
+	} else *beg = 0, *end = 1<<29;
159
+	free(s);
160
+	return *beg <= *end? 0 : -1;
161
+}
162
+
163
+int32_t bam_aux2i(const uint8_t *s)
164
+{
165
+	int type;
166
+	if (s == 0) return 0;
167
+	type = *s++;
168
+	if (type == 'c') return (int32_t)*(int8_t*)s;
169
+	else if (type == 'C') return (int32_t)*(uint8_t*)s;
170
+	else if (type == 's') return (int32_t)*(int16_t*)s;
171
+	else if (type == 'S') return (int32_t)*(uint16_t*)s;
172
+	else if (type == 'i' || type == 'I') return *(int32_t*)s;
173
+	else return 0;
174
+}
175
+
176
+float bam_aux2f(const uint8_t *s)
177
+{
178
+	int type;
179
+	type = *s++;
180
+	if (s == 0) return 0.0;
181
+	if (type == 'f') return *(float*)s;
182
+	else return 0.0;
183
+}
184
+
185
+double bam_aux2d(const uint8_t *s)
186
+{
187
+	int type;
188
+	type = *s++;
189
+	if (s == 0) return 0.0;
190
+	if (type == 'd') return *(double*)s;
191
+	else return 0.0;
192
+}
193
+
194
+char bam_aux2A(const uint8_t *s)
195
+{
196
+	int type;
197
+	type = *s++;
198
+	if (s == 0) return 0;
199
+	if (type == 'A') return *(char*)s;
200
+	else return 0;
201
+}
202
+
203
+char *bam_aux2Z(const uint8_t *s)
204
+{
205
+	int type;
206
+	type = *s++;
207
+	if (s == 0) return 0;
208
+	if (type == 'Z' || type == 'H') return (char*)s;
209
+	else return 0;
210
+}
211
+
212
+#ifdef _WIN32
213
+double drand48()
214
+{
215
+	return (double)rand() / RAND_MAX;
216
+}
217
+#endif
0 218
new file mode 100644
... ...
@@ -0,0 +1,42 @@
1
+#ifndef BAM_ENDIAN_H
2
+#define BAM_ENDIAN_H
3
+
4
+#include <stdint.h>
5
+
6
+static inline int bam_is_big_endian()
7
+{
8
+	long one= 1;
9
+	return !(*((char *)(&one)));
10
+}
11
+static inline uint16_t bam_swap_endian_2(uint16_t v)
12
+{
13
+	return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
14
+}
15
+static inline void *bam_swap_endian_2p(void *x)
16
+{
17
+	*(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x);
18
+	return x;
19
+}
20
+static inline uint32_t bam_swap_endian_4(uint32_t v)
21
+{
22
+	v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
23
+	return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
24
+}
25
+static inline void *bam_swap_endian_4p(void *x)
26
+{
27
+	*(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x);
28
+	return x;
29
+}
30
+static inline uint64_t bam_swap_endian_8(uint64_t v)
31
+{
32
+	v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
33
+	v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
34
+	return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
35
+}
36
+static inline void *bam_swap_endian_8p(void *x)
37
+{
38
+	*(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x);
39
+	return x;
40
+}
41
+
42
+#endif
0 43
new file mode 100644
... ...
@@ -0,0 +1,491 @@
1
+#define _POSIX_C_SOURCE 200112L /* Rsamtools: c99 fileno */
2
+#define _SVID_SOURCE            /* Rsamtools: c99 strdup */
3
+#include <zlib.h>
4
+#include <stdio.h>
5
+#include <ctype.h>
6
+#include <string.h>
7
+#include <stdlib.h>
8
+#include <unistd.h>
9
+#include <assert.h>
10
+#ifdef _WIN32
11
+#include <fcntl.h>
12
+#endif
13
+#include "kstring.h"
14
+#include "bam.h"
15
+#include "sam_header.h"
16
+#include "kseq.h"
17
+#include "khash.h"
18
+
19
+KSTREAM_INIT(gzFile, gzread, 16384)
20
+KHASH_MAP_INIT_STR(ref, uint64_t)
21
+
22
+void bam_init_header_hash(bam_header_t *header);
23
+void bam_destroy_header_hash(bam_header_t *header);
24
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
25
+
26
+unsigned char bam_nt16_table[256] = {
27
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
28
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
29
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
30
+	 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
31
+	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
32
+	15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
33
+	15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
34
+	15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
35
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
36
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
37
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
38
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
39
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
40
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
41
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
42
+	15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
43
+};
44
+
45
+unsigned short bam_char2flag_table[256] = {
46
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
47
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
48
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
49
+	0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
50
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
51
+	BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0,
52
+	0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0,
53
+	BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0,
54
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
55
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
56
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
57
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
58
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
59
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
60
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
61
+	0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
62
+};
63
+
64
+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
65
+
66
+struct __tamFile_t {
67
+	gzFile fp;
68
+	kstream_t *ks;
69
+	kstring_t *str;
70
+	uint64_t n_lines;
71
+	int is_first;
72
+};
73
+
74
+char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only
75
+{
76
+	char **list = 0, *s;
77
+	int n = 0, dret, m = 0;
78
+	gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
79
+	kstream_t *ks;
80
+	kstring_t *str;
81
+	str = (kstring_t*)calloc(1, sizeof(kstring_t));
82
+	ks = ks_init(fp);
83
+	while (ks_getuntil(ks, '\n', str, &dret) > 0) {
84
+		if (n == m) {
85
+			m = m? m << 1 : 16;
86
+			list = (char**)realloc(list, m * sizeof(char*));
87
+		}
88
+		if (str->s[str->l-1] == '\r')
89
+			str->s[--str->l] = '\0';
90
+		s = list[n++] = (char*)calloc(str->l + 1, 1);
91
+		strcpy(s, str->s);
92
+	}
93
+	ks_destroy(ks);
94
+	gzclose(fp);
95
+	free(str->s); free(str);
96
+	*_n = n;
97
+	return list;
98
+}
99
+
100
+static bam_header_t *hash2header(const kh_ref_t *hash)
101
+{
102
+	bam_header_t *header;
103
+	khiter_t k;
104
+	header = bam_header_init();
105
+	header->n_targets = kh_size(hash);
106
+	header->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
107
+	header->target_len = (uint32_t*)calloc(kh_size(hash), 4);
108
+	for (k = kh_begin(hash); k != kh_end(hash); ++k) {
109
+		if (kh_exist(hash, k)) {
110
+			int i = (int)kh_value(hash, k);
111
+			header->target_name[i] = (char*)kh_key(hash, k);
112
+			header->target_len[i] = kh_value(hash, k)>>32;
113
+		}
114
+	}
115
+	bam_init_header_hash(header);
116
+	return header;
117
+}
118
+bam_header_t *sam_header_read2(const char *fn)
119
+{
120
+	bam_header_t *header;
121
+	int c, dret, ret, error = 0;
122
+	gzFile fp;
123
+	kstream_t *ks;
124
+	kstring_t *str;
125
+	kh_ref_t *hash;
126
+	khiter_t k;
127
+	if (fn == 0) return 0;
128
+	fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
129
+	if (fp == 0) return 0;
130
+	hash = kh_init(ref);
131
+	ks = ks_init(fp);
132
+	str = (kstring_t*)calloc(1, sizeof(kstring_t));
133
+	while (ks_getuntil(ks, 0, str, &dret) > 0) {
134
+		char *s = strdup(str->s);
135
+		int len, i;
136
+		i = kh_size(hash);
137
+		ks_getuntil(ks, 0, str, &dret);
138
+		len = atoi(str->s);
139
+		k = kh_put(ref, hash, s, &ret);
140
+		if (ret == 0) {
141
+			fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s);
142
+			error = 1;
143
+		}
144
+		kh_value(hash, k) = (uint64_t)len<<32 | i;
145
+		if (dret != '\n')
146
+			while ((c = ks_getc(ks)) != '\n' && c != -1);
147
+	}
148
+	ks_destroy(ks);
149
+	gzclose(fp);
150
+	free(str->s); free(str);
151
+	fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
152
+	if (error) return 0;
153
+	header = hash2header(hash);
154
+	kh_destroy(ref, hash);
155
+	return header;
156
+}
157
+static inline uint8_t *alloc_data(bam1_t *b, int size)
158
+{
159
+	if (b->m_data < size) {
160
+		b->m_data = size;
161
+		kroundup32(b->m_data);
162
+		b->data = (uint8_t*)realloc(b->data, b->m_data);
163
+	}
164
+	return b->data;
165
+}
166
+static inline void parse_error(int64_t n_lines, const char * __restrict msg)
167
+{
168
+	fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg);
169
+	abort();
170
+}
171
+static inline void append_text(bam_header_t *header, kstring_t *str)
172
+{
173
+	size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null
174
+	kroundup32(x); kroundup32(y);
175
+	if (x < y) 
176
+    {
177
+        header->n_text = y;
178
+        header->text = (char*)realloc(header->text, y);
179
+        if ( !header->text ) 
180
+        {
181
+            fprintf(stderr,"realloc failed to alloc %ld bytes\n", y);
182
+            abort();
183
+        }
184
+    }
185
+    // Sanity check
186
+    if ( header->l_text+str->l+1 >= header->n_text )
187
+    {
188
+        fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n",  header->l_text+str->l+1,(long)header->n_text,x,y);
189
+        abort();
190
+    }
191
+	strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here.
192
+	header->l_text += str->l + 1;
193
+	header->text[header->l_text] = 0;
194
+}
195
+
196
+int sam_header_parse(bam_header_t *h)
197
+{
198
+	char **tmp;
199
+	int i;
200
+	free(h->target_len); free(h->target_name);
201
+	h->n_targets = 0; h->target_len = 0; h->target_name = 0;
202
+	if (h->l_text < 3) return 0;
203
+	if (h->dict == 0) h->dict = sam_header_parse2(h->text);
204
+	tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets);
205
+	if (h->n_targets == 0) return 0;
206
+	h->target_name = calloc(h->n_targets, sizeof(void*));
207
+	for (i = 0; i < h->n_targets; ++i)
208
+		h->target_name[i] = strdup(tmp[i]);
209
+	free(tmp);
210
+	tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets);
211
+	h->target_len = calloc(h->n_targets, 4);
212
+	for (i = 0; i < h->n_targets; ++i)
213
+		h->target_len[i] = atoi(tmp[i]);
214
+	free(tmp);
215
+	return h->n_targets;
216
+}
217
+
218
+bam_header_t *sam_header_read(tamFile fp)
219
+{
220
+	int ret, dret;
221
+	bam_header_t *header = bam_header_init();
222
+	kstring_t *str = fp->str;
223
+	while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
224
+		str->s[str->l] = dret; // note that str->s is NOT null terminated!!
225
+		append_text(header, str);
226
+		if (dret != '\n') {
227
+			ret = ks_getuntil(fp->ks, '\n', str, &dret);
228
+			str->s[str->l] = '\n'; // NOT null terminated!!
229
+			append_text(header, str);
230
+		}
231
+		++fp->n_lines;
232
+	}
233
+	sam_header_parse(header);
234
+	bam_init_header_hash(header);
235
+	fp->is_first = ret >= 0; /* MTM: not header-only SAM */
236
+	return header;
237
+}
238
+
239
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
240
+{
241
+	int ret, doff, doff0, dret, z = 0;
242
+	bam1_core_t *c = &b->core;
243
+	kstring_t *str = fp->str;
244
+	kstream_t *ks = fp->ks;
245
+
246
+	if (fp->is_first) {
247
+		fp->is_first = 0;
248
+		ret = str->l;
249
+	} else {
250
+		do { // special consideration for empty lines
251
+			ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret);
252
+			if (ret >= 0) z += str->l + 1;
253
+		} while (ret == 0);
254
+	}
255
+	if (ret < 0) return -1;
256
+	++fp->n_lines;
257
+	doff = 0;
258
+
259
+	{ // name
260
+		c->l_qname = strlen(str->s) + 1;
261
+		memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
262
+		doff += c->l_qname;
263
+	}
264
+	{ // flag
265
+		long flag;
266
+		char *s;
267
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
268
+		flag = strtol((char*)str->s, &s, 0);
269
+		if (*s) { // not the end of the string
270
+			flag = 0;
271
+			for (s = str->s; *s; ++s)
272
+				flag |= bam_char2flag_table[(int)*s];
273
+		}
274
+		c->flag = flag;
275
+	}
276
+	{ // tid, pos, qual
277
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s);
278
+		if (c->tid < 0 && strcmp(str->s, "*")) {
279
+			if (header->n_targets == 0) {
280
+				fprintf(stderr, "[sam_read1] missing header? Abort!\n");
281
+				exit(1);
282
+			} else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s);
283
+		}
284
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
285
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? atoi(str->s) : 0;
286
+		if (ret < 0) return -2;
287
+	}
288
+	{ // cigar
289
+		char *s, *t;
290
+		int i, op;
291
+		long x;
292
+		c->n_cigar = 0;
293
+		if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3;
294
+		z += str->l + 1;
295
+		if (str->s[0] != '*') {
296
+			uint32_t *cigar;
297
+			for (s = str->s; *s; ++s) {
298
+				if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar;
299
+				else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character");
300
+			}
301
+			b->data = alloc_data(b, doff + c->n_cigar * 4);
302
+			cigar = bam1_cigar(b);
303
+			for (i = 0, s = str->s; i != c->n_cigar; ++i) {
304
+				x = strtol(s, &t, 10);
305
+				op = toupper(*t);
306
+				if (op == 'M') op = BAM_CMATCH;
307
+				else if (op == 'I') op = BAM_CINS;
308
+				else if (op == 'D') op = BAM_CDEL;
309
+				else if (op == 'N') op = BAM_CREF_SKIP;
310
+				else if (op == 'S') op = BAM_CSOFT_CLIP;
311
+				else if (op == 'H') op = BAM_CHARD_CLIP;
312
+				else if (op == 'P') op = BAM_CPAD;
313
+				else if (op == '=') op = BAM_CEQUAL;
314
+				else if (op == 'X') op = BAM_CDIFF;
315
+				else if (op == 'B') op = BAM_CBACK;
316
+				else parse_error(fp->n_lines, "invalid CIGAR operation");
317
+				s = t + 1;
318
+				cigar[i] = bam_cigar_gen(x, op);
319
+			}
320
+			if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
321
+			c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar));
322
+			doff += c->n_cigar * 4;
323
+		} else {
324
+			if (!(c->flag&BAM_FUNMAP)) {
325
+				fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines);
326
+				c->flag |= BAM_FUNMAP;
327
+			}
328
+			c->bin = bam_reg2bin(c->pos, c->pos + 1);
329
+		}
330
+	}
331
+	{ // mtid, mpos, isize
332
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
333
+		c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
334
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
335
+		c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1;
336
+		ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1;
337
+		c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? atoi(str->s) : 0;
338
+		if (ret < 0) return -4;
339
+	}
340
+	{ // seq and qual
341
+		int i;
342
+		uint8_t *p = 0;
343
+		if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq
344
+		z += str->l + 1;
345
+		if (strcmp(str->s, "*")) {
346
+			c->l_qseq = strlen(str->s);
347
+			if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
348
+				fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n",
349
+						(long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
350
+				parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
351
+			}
352
+			p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
353
+			memset(p, 0, (c->l_qseq+1)/2);
354
+			for (i = 0; i < c->l_qseq; ++i)
355
+				p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2);
356
+		} else c->l_qseq = 0;
357
+		if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual
358
+		z += str->l + 1;
359
+		if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s))
360
+			parse_error(fp->n_lines, "sequence and quality are inconsistent");
361
+		p += (c->l_qseq+1)/2;
362
+		if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff;
363
+		else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33;
364
+		doff += c->l_qseq + (c->l_qseq+1)/2;
365
+	}
366
+	doff0 = doff;
367
+	if (dret != '\n' && dret != '\r') { // aux
368
+		while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) {
369
+			uint8_t *s, type, key[2];