Browse code

Added the seqbias package to the repository.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqbias@52013 bc3139a8-67e5-0310-9ffc-ced21a209358

Chao-Jen Wong authored on 22/01/2011 00:26:36
Showing 1 changed file
1 1
new file mode 100644
... ...
@@ -0,0 +1,454 @@
1
+#include "stream.h"
2
+#include <iostream>
3
+#include "exp.h"
4
+
5
+#ifndef YAML_PREFETCH_SIZE
6
+#define YAML_PREFETCH_SIZE 2048
7
+#endif
8
+
9
+#define S_ARRAY_SIZE( A ) (sizeof(A)/sizeof(*(A)))
10
+#define S_ARRAY_END( A ) ((A) + S_ARRAY_SIZE(A))
11
+
12
+#define CP_REPLACEMENT_CHARACTER (0xFFFD)
13
+
14
+namespace YAML
15
+{
16
// States of the byte-order-mark / encoding sniffing automaton that the Stream
// constructor runs over the first few input bytes.  The enumerator order is
// significant: the values index s_introFinalState, s_introTransitions and
// s_introUngetCount.
enum UtfIntroState {
	uis_start,         // nothing read yet
	uis_utfbe_b1,      // read 00
	uis_utf32be_b2,    // read 00 00
	uis_utf32be_bom3,  // read 00 00 FE
	uis_utf32be,       // final: UTF-32 big-endian
	uis_utf16be,       // final: UTF-16 big-endian
	uis_utf16be_bom1,  // read FE
	uis_utfle_bom1,    // read FF
	uis_utf16le_bom2,  // read FF FE
	uis_utf32le_bom3,  // read FF FE 00
	uis_utf16le,       // final: UTF-16 little-endian
	uis_utf32le,       // final: UTF-32 little-endian
	uis_utf8_imp,      // read one "ascii"-class byte, no BOM
	uis_utf16le_imp,   // read "ascii"-class byte, 00
	uis_utf32le_imp3,  // read "ascii"-class byte, 00 00
	uis_utf8_bom1,     // read EF
	uis_utf8_bom2,     // read EF BB
	uis_utf8,          // final: UTF-8
	uis_error          // final: never produced by the transition table
};
37
+
38
// Input classes used as the column index of the intro tables.  uictMax is
// not a class; it is the number of columns.
enum UtfIntroCharType {
	uict00,     // byte 0x00
	uictBB,     // byte 0xBB
	uictBF,     // byte 0xBF
	uictEF,     // byte 0xEF
	uictFE,     // byte 0xFE
	uictFF,     // byte 0xFF
	uictAscii,  // any other byte strictly between 0x00 and 0xFF
	uictOther,  // everything else, including end-of-file
	uictMax
};
49
+
50
// For each UtfIntroState (used as the index), whether the sniffing loop in
// the Stream constructor stops once that state is reached.
static bool s_introFinalState[] = {
	/*  0 uis_start        */ false,
	/*  1 uis_utfbe_b1     */ false,
	/*  2 uis_utf32be_b2   */ false,
	/*  3 uis_utf32be_bom3 */ false,
	/*  4 uis_utf32be      */ true,
	/*  5 uis_utf16be      */ true,
	/*  6 uis_utf16be_bom1 */ false,
	/*  7 uis_utfle_bom1   */ false,
	/*  8 uis_utf16le_bom2 */ false,
	/*  9 uis_utf32le_bom3 */ false,
	/* 10 uis_utf16le      */ true,
	/* 11 uis_utf32le      */ true,
	/* 12 uis_utf8_imp     */ false,
	/* 13 uis_utf16le_imp  */ false,
	/* 14 uis_utf32le_imp3 */ false,
	/* 15 uis_utf8_bom1    */ false,
	/* 16 uis_utf8_bom2    */ false,
	/* 17 uis_utf8         */ true,
	/* 18 uis_error        */ true,
};
71
+
72
+	static UtfIntroState s_introTransitions[][uictMax] = {
73
+		// uict00,           uictBB,           uictBF,           uictEF,           uictFE,           uictFF,           uictAscii,        uictOther
74
+		  {uis_utfbe_b1,     uis_utf8,         uis_utf8,         uis_utf8_bom1,    uis_utf16be_bom1, uis_utfle_bom1,   uis_utf8_imp,     uis_utf8},
75
+		  {uis_utf32be_b2,   uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16be,      uis_utf8},
76
+		  {uis_utf32be,      uis_utf8,         uis_utf8,         uis_utf8,         uis_utf32be_bom3, uis_utf8,         uis_utf8,         uis_utf8},
77
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf32be,      uis_utf8,         uis_utf8},
78
+		  {uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be,      uis_utf32be},
79
+		  {uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be,      uis_utf16be},
80
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16be,      uis_utf8,         uis_utf8},
81
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf16le_bom2, uis_utf8,         uis_utf8,         uis_utf8},
82
+		  {uis_utf32le_bom3, uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
83
+		  {uis_utf32le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
84
+		  {uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
85
+		  {uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le,      uis_utf32le},
86
+		  {uis_utf16le_imp,  uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8},
87
+		  {uis_utf32le_imp3, uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
88
+		  {uis_utf32le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le,      uis_utf16le},
89
+		  {uis_utf8,         uis_utf8_bom2,    uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8},
90
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8},
91
+		  {uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8,         uis_utf8},
92
+	};
93
+
94
+	static char s_introUngetCount[][uictMax] = {
95
+		// uict00, uictBB, uictBF, uictEF, uictFE, uictFF, uictAscii, uictOther
96
+		  {0,      1,      1,      0,      0,      0,      0,         1},
97
+		  {0,      2,      2,      2,      2,      2,      2,         2},
98
+		  {3,      3,      3,      3,      0,      3,      3,         3},
99
+		  {4,      4,      4,      4,      4,      0,      4,         4},
100
+		  {1,      1,      1,      1,      1,      1,      1,         1},
101
+		  {1,      1,      1,      1,      1,      1,      1,         1},
102
+		  {2,      2,      2,      2,      2,      0,      2,         2},
103
+		  {2,      2,      2,      2,      0,      2,      2,         2},
104
+		  {0,      1,      1,      1,      1,      1,      1,         1},
105
+		  {0,      2,      2,      2,      2,      2,      2,         2},
106
+		  {1,      1,      1,      1,      1,      1,      1,         1},
107
+		  {1,      1,      1,      1,      1,      1,      1,         1},
108
+		  {0,      2,      2,      2,      2,      2,      2,         2},
109
+		  {0,      3,      3,      3,      3,      3,      3,         3},
110
+		  {4,      4,      4,      4,      4,      4,      4,         4},
111
+		  {2,      0,      2,      2,      2,      2,      2,         2},
112
+		  {3,      3,      0,      3,      3,      3,      3,         3},
113
+		  {1,      1,      1,      1,      1,      1,      1,         1},
114
+	};
115
+
116
+	inline UtfIntroCharType IntroCharTypeOf(std::istream::int_type ch)
117
+	{
118
+		if (std::istream::traits_type::eof() == ch) {
119
+			return uictOther;
120
+		}
121
+
122
+		switch (ch) {
123
+		case 0: return uict00;
124
+		case 0xBB: return uictBB;
125
+		case 0xBF: return uictBF;
126
+		case 0xEF: return uictEF;
127
+		case 0xFE: return uictFE;
128
+		case 0xFF: return uictFF;
129
+		}
130
+
131
+		if ((ch > 0) && (ch < 0xFF)) {
132
+			return uictAscii;
133
+		}
134
+
135
+		return uictOther;
136
+	}
137
+
138
// Builds one byte of a UTF-8 sequence.
//   ch        - the code point being encoded
//   lead_bits - number of leading 1 bits in the produced byte (0 for a
//               single-byte character, 1 for a continuation byte, 2..4 for
//               the first byte of a multi-byte sequence)
//   rshift    - how far ch is shifted right before its low bits are taken
// The byte is the lead_bits ones, a zero bit, then the low (7 - lead_bits)
// bits of (ch >> rshift).
inline char Utf8Adjust(unsigned long ch, unsigned char lead_bits, unsigned char rshift)
{
	const unsigned char prefix =
		static_cast<unsigned char>(((1 << lead_bits) - 1) << (8 - lead_bits));
	const unsigned char payloadMask =
		static_cast<unsigned char>(0xFF >> (lead_bits + 1));
	const unsigned long payload = (ch >> rshift) & payloadMask;

	return static_cast<char>(static_cast<unsigned char>(prefix | payload));
}
146
+
147
+	inline void QueueUnicodeCodepoint(std::deque<char>& q, unsigned long ch)
148
+	{
149
+		// We are not allowed to queue the Stream::eof() codepoint, so
150
+		// replace it with CP_REPLACEMENT_CHARACTER
151
+		if (static_cast<unsigned long>(Stream::eof()) == ch)
152
+		{
153
+			ch = CP_REPLACEMENT_CHARACTER;
154
+		}
155
+
156
+		if (ch < 0x80)
157
+		{
158
+			q.push_back(Utf8Adjust(ch, 0, 0));
159
+		}
160
+		else if (ch < 0x800)
161
+		{
162
+			q.push_back(Utf8Adjust(ch, 2, 6));
163
+			q.push_back(Utf8Adjust(ch, 1, 0));
164
+		}
165
+		else if (ch < 0x10000)
166
+		{
167
+			q.push_back(Utf8Adjust(ch, 3, 12));
168
+			q.push_back(Utf8Adjust(ch, 1, 6));
169
+			q.push_back(Utf8Adjust(ch, 1, 0));
170
+		}
171
+		else
172
+		{
173
+			q.push_back(Utf8Adjust(ch, 4, 18));
174
+			q.push_back(Utf8Adjust(ch, 1, 12));
175
+			q.push_back(Utf8Adjust(ch, 1, 6));
176
+			q.push_back(Utf8Adjust(ch, 1, 0));
177
+		}
178
+	}
179
+
180
+	Stream::Stream(std::istream& input)
181
+		: m_input(input), m_nPushedBack(0),
182
+		m_pPrefetched(new unsigned char[YAML_PREFETCH_SIZE]), 
183
+		m_nPrefetchedAvailable(0), m_nPrefetchedUsed(0)
184
+	{
185
+		typedef std::istream::traits_type char_traits;
186
+
187
+		if(!input)
188
+			return;
189
+
190
+		// Determine (or guess) the character-set by reading the BOM, if any.  See
191
+		// the YAML specification for the determination algorithm.
192
+		char_traits::int_type intro[4];
193
+		int nIntroUsed = 0;
194
+		UtfIntroState state = uis_start;
195
+		for (; !s_introFinalState[state]; ) {
196
+			std::istream::int_type ch = input.get();
197
+			intro[nIntroUsed++] = ch;
198
+			UtfIntroCharType charType = IntroCharTypeOf(ch);
199
+			UtfIntroState newState = s_introTransitions[state][charType];
200
+			int nUngets = s_introUngetCount[state][charType];
201
+			if (nUngets > 0) {
202
+				for (; nUngets > 0; --nUngets) {
203
+					if (char_traits::eof() != intro[--nIntroUsed]) {
204
+						m_bufPushback[m_nPushedBack++] = 
205
+							char_traits::to_char_type(intro[nIntroUsed]);
206
+					}
207
+				}
208
+			}
209
+			state = newState;
210
+		}
211
+
212
+		switch (state) {
213
+		case uis_utf8: m_charSet = utf8; break;
214
+		case uis_utf16le: m_charSet = utf16le; break;
215
+		case uis_utf16be: m_charSet = utf16be; break;
216
+		case uis_utf32le: m_charSet = utf32le; break;
217
+		case uis_utf32be: m_charSet = utf32be; break;
218
+		default: m_charSet = utf8; break;
219
+		}
220
+
221
+		ReadAheadTo(0);
222
+	}
223
+
224
+	Stream::~Stream()
225
+	{
226
+		delete[] m_pPrefetched;
227
+	}
228
+
229
+	char Stream::peek() const
230
+	{
231
+		if (m_readahead.empty())
232
+		{
233
+			return Stream::eof();
234
+		}
235
+
236
+		return m_readahead[0];
237
+	}
238
+	
239
+	Stream::operator bool() const
240
+	{
241
+		return m_input.good() || (!m_readahead.empty() && m_readahead[0] != Stream::eof());
242
+	}
243
+
244
+	// get
245
+	// . Extracts a character from the stream and updates our position
246
+	char Stream::get()
247
+	{
248
+		char ch = peek();
249
+		AdvanceCurrent();
250
+		m_mark.column++;
251
+		
252
+		if(ch == '\n') {
253
+			m_mark.column = 0;
254
+			m_mark.line++;
255
+		}
256
+		
257
+		return ch;
258
+	}
259
+
260
+	// get
261
+	// . Extracts 'n' characters from the stream and updates our position
262
+	std::string Stream::get(int n)
263
+	{
264
+		std::string ret;
265
+		ret.reserve(n);
266
+		for(int i=0;i<n;i++)
267
+			ret += get();
268
+		return ret;
269
+	}
270
+
271
+	// eat
272
+	// . Eats 'n' characters and updates our position.
273
+	void Stream::eat(int n)
274
+	{
275
+		for(int i=0;i<n;i++)
276
+			get();
277
+	}
278
+
279
+	void Stream::AdvanceCurrent()
280
+	{
281
+		if (!m_readahead.empty())
282
+		{
283
+			m_readahead.pop_front();
284
+			m_mark.pos++;
285
+		}
286
+
287
+		ReadAheadTo(0);
288
+	}
289
+
290
+	bool Stream::_ReadAheadTo(size_t i) const
291
+	{
292
+		while (m_input.good() && (m_readahead.size() <= i))
293
+		{
294
+			switch (m_charSet)
295
+			{
296
+			case utf8: StreamInUtf8(); break;
297
+			case utf16le: StreamInUtf16(); break;
298
+			case utf16be: StreamInUtf16(); break;
299
+			case utf32le: StreamInUtf32(); break;
300
+			case utf32be: StreamInUtf32(); break;
301
+			}
302
+		}
303
+		
304
+		// signal end of stream
305
+		if(!m_input.good())
306
+			m_readahead.push_back(Stream::eof());
307
+
308
+		return m_readahead.size() > i;
309
+	}
310
+
311
+	void Stream::StreamInUtf8() const
312
+	{
313
+		unsigned char b = GetNextByte();
314
+		if (m_input.good())
315
+		{
316
+			m_readahead.push_back(b);
317
+		}
318
+	}
319
+
320
+	void Stream::StreamInUtf16() const
321
+	{
322
+		unsigned long ch = 0;
323
+		unsigned char bytes[2];
324
+		int nBigEnd = (m_charSet == utf16be) ? 0 : 1;
325
+
326
+		bytes[0] = GetNextByte();
327
+		bytes[1] = GetNextByte();
328
+		if (!m_input.good())
329
+		{
330
+			return;
331
+		}
332
+		ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
333
+			static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
334
+
335
+		if (ch >= 0xDC00 && ch < 0xE000)
336
+		{
337
+			// Trailing (low) surrogate...ugh, wrong order
338
+			QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
339
+			return;
340
+		}
341
+		else if (ch >= 0xD800 && ch < 0xDC00)
342
+		{
343
+			// ch is a leading (high) surrogate
344
+
345
+			// Four byte UTF-8 code point
346
+
347
+			// Read the trailing (low) surrogate
348
+			for (;;)
349
+			{
350
+				bytes[0] = GetNextByte();
351
+				bytes[1] = GetNextByte();
352
+				if (!m_input.good())
353
+				{
354
+					QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
355
+					return;
356
+				}
357
+				unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
358
+					static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
359
+				if (chLow < 0xDC00 || ch >= 0xE000)
360
+				{
361
+					// Trouble...not a low surrogate.  Dump a REPLACEMENT CHARACTER into the stream.
362
+					QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
363
+
364
+					// Deal with the next UTF-16 unit
365
+					if (chLow < 0xD800 || ch >= 0xE000)
366
+					{
367
+						// Easiest case: queue the codepoint and return
368
+						QueueUnicodeCodepoint(m_readahead, ch);
369
+						return;
370
+					}
371
+					else
372
+					{
373
+						// Start the loop over with the new high surrogate
374
+						ch = chLow;
375
+						continue;
376
+					}
377
+				}
378
+
379
+				// Select the payload bits from the high surrogate
380
+				ch &= 0x3FF;
381
+				ch <<= 10;
382
+
383
+				// Include bits from low surrogate
384
+				ch |= (chLow & 0x3FF);
385
+
386
+				// Add the surrogacy offset
387
+				ch += 0x10000;
388
+			}
389
+		}
390
+
391
+		QueueUnicodeCodepoint(m_readahead, ch);
392
+	}
393
+
394
// Reinterprets an unsigned-char buffer as the char buffer expected by
// std::streambuf::sgetn.  No data is copied.
inline char* ReadBuffer(unsigned char* pBuffer)
{
	char* const asChars = reinterpret_cast<char*>(pBuffer);
	return asChars;
}
398
+
399
+	unsigned char Stream::GetNextByte() const
400
+	{
401
+		if (m_nPushedBack)
402
+		{
403
+			return m_bufPushback[--m_nPushedBack];
404
+		}
405
+
406
+		if (m_nPrefetchedUsed >= m_nPrefetchedAvailable)
407
+		{
408
+			std::streambuf *pBuf = m_input.rdbuf();
409
+			m_nPrefetchedAvailable = pBuf->sgetn(ReadBuffer(m_pPrefetched), 
410
+				YAML_PREFETCH_SIZE);
411
+			m_nPrefetchedUsed = 0;
412
+			if (!m_nPrefetchedAvailable)
413
+			{
414
+				m_input.setstate(std::ios_base::eofbit);
415
+			}
416
+
417
+			if (0 == m_nPrefetchedAvailable)
418
+			{
419
+				return 0;
420
+			}
421
+		}
422
+
423
+		return m_pPrefetched[m_nPrefetchedUsed++];
424
+	}
425
+
426
+	void Stream::StreamInUtf32() const
427
+	{
428
+		static int indexes[2][4] = {
429
+			{3, 2, 1, 0},
430
+			{0, 1, 2, 3}
431
+		};
432
+
433
+		unsigned long ch = 0;
434
+		unsigned char bytes[4];
435
+		int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];
436
+
437
+		bytes[0] = GetNextByte();
438
+		bytes[1] = GetNextByte();
439
+		bytes[2] = GetNextByte();
440
+		bytes[3] = GetNextByte();
441
+		if (!m_input.good())
442
+		{
443
+			return;
444
+		}
445
+
446
+		for (int i = 0; i < 4; ++i)
447
+		{
448
+			ch <<= 8;
449
+			ch |= bytes[pIndexes[i]];
450
+		}
451
+
452
+		QueueUnicodeCodepoint(m_readahead, ch);
453
+	}
454
+}