Browse code

add seqTools package

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@95415 bc3139a8-67e5-0310-9ffc-ced21a209358

Herve Pages authored on 13/10/2014 19:02:34
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,713 @@
1
+/*
2
+ * dna_fasta_stream.h
3
+ *
4
+ *  Created on: 02.10.2013
5
+ *      Author: wolfgang
6
+ */
7
+
8
+#ifndef DNA_FASTA_STREAM_H_
9
+#define DNA_FASTA_STREAM_H_
10
+
11
+# include <zlib.h>
12
+# include "stat_defs.h"
13
+
14
+// Assumes that line size >= k
15
+
16
+/*
17
+ *
18
+ * http://en.wikipedia.org/wiki/FASTA_format
19
+ *
20
+ * The description line is distinguished from the sequence data by a greater-than (">")
21
+ * symbol in the first column.
22
+ * The word following the ">" symbol is the identifier of the sequence
23
+ * and the rest of the line is the description (both are optional).
24
+ * There should be no space between the ">" and the first letter of the identifier.
25
+ * It is recommended that all lines of text be shorter than 80 characters.
26
+ */
27
+
28
+
29
+
30
+///////////////////////////////////////////////////////////////////////////////////////////////////
31
+//
32
+// File stream (can handle text and gz files)
33
+//
34
+///////////////////////////////////////////////////////////////////////////////////////////////////
35
+
36
+
37
+static const unsigned faf_file_open		= 0;
38
+static const unsigned faf_file_eof		= 1;
39
+static const unsigned faf_file_closed	= 2;
40
+static const unsigned faf_txt    		= 4;
41
+static const unsigned faf_gz     		= 8;
42
+static const unsigned faf_error  		= 16;
43
+
44
+typedef struct fasta_file_stream
45
+{
46
+	unsigned type;
47
+	unsigned status;
48
+
49
+	FILE *f;
50
+	gzFile gz;
51
+
52
+} fafStream;
53
+
54
+static inline unsigned faf_isEof(fafStream *faf) { return faf->status & faf_file_eof; }
55
+static inline unsigned faf_isOpen(fafStream *faf)
56
+{
57
+	if(faf->status == faf_file_open)
58
+		return 1;
59
+	return 0;
60
+}
61
+
62
+static fafStream* faf_stream_init(const char* filename, unsigned mode)
63
+{
64
+	// Construct fafStream object from opened file
65
+	fafStream *faf = calloc(sizeof(fafStream), 1);
66
+	if(mode == faf_gz)
67
+	{
68
+		faf->type = faf_gz;
69
+		faf->gz = gzopen(filename,"rb");
70
+		if(faf->gz)
71
+			faf->status = faf_file_open;
72
+		else
73
+			faf->status = faf_file_closed;
74
+	}
75
+	else
76
+	{
77
+		faf->type = faf_txt;
78
+		faf->f = fopen(filename, "r");
79
+		if(faf->f)
80
+			faf->status = faf_file_open;
81
+		else
82
+			faf->status = faf_file_closed;
83
+	}
84
+	return faf;
85
+}
86
+
87
+static void faf_destroy(fafStream *faf)
88
+{
89
+	if(!(faf->status & faf_file_closed))
90
+	{
91
+		if(faf->type == faf_gz)
92
+			gzclose(faf->gz);
93
+		else
94
+			fclose(faf->f);
95
+	}
96
+	free(faf);
97
+}
98
+
99
+static size_t inline faf_read(fafStream *faf, char *dest, unsigned size)
100
+{
101
+	if(faf->status == faf_file_open)
102
+	{
103
+		unsigned nchar;
104
+		if(faf->type == faf_gz)
105
+		{
106
+			nchar = gzread(faf->gz, dest, sizeof(char) * size);
107
+
108
+			if(gzeof(faf->gz))
109
+			{
110
+				faf->status = faf_file_eof;
111
+				gzclose(faf->gz);
112
+				faf->gz = 0;
113
+				faf->status |= faf_file_closed;
114
+			}
115
+		}
116
+		else
117
+		{
118
+			nchar = fread(dest, sizeof(char), size, faf->f);
119
+			if(feof(faf->f))
120
+			{
121
+				faf->status = faf_file_eof;
122
+				fclose(faf->f);
123
+				faf->f = 0;
124
+				faf->status |= faf_file_closed;
125
+			}
126
+		}
127
+		return nchar;
128
+	}
129
+	return 0;
130
+}
131
+
132
+
133
+///////////////////////////////////////////////////////////////////////////////////////////////////
134
+//
135
+// fasta Stream
136
+//
137
+///////////////////////////////////////////////////////////////////////////////////////////////////
138
+
139
+
140
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
141
+//
142
+// Define buffer size and delimiting characters
143
+//
144
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
145
+
146
+static const unsigned long	fas_loc_Nchar	= 10;
147
+static const unsigned long	fas_size		= 10+1;
148
+static const char			fas_seq_delim	= '>';
149
+static const char			fas_comment		= ';';
150
+static const char			fas_loc_NewLine	= '\n';
151
+static const char			fas_eoc			= '\0';
152
+
153
+
154
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
155
+//
156
+// Define status flags
157
+//
158
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
159
+
160
+static const int fas_err				= -1;	// Unrecoverable error state
161
+static const int fas_ok					= 0;	// Must be 0 !!!
162
+
163
+static const int fas_loc_kReady			= 1;
164
+static const int fas_nuc_ready          = 2;
165
+// Recoverable reasons for transfer interruption
166
+static const int fas_loc_newLine		= 4;
167
+static const int fas_loc_newSeq			= 8;
168
+static const int fas_loc_comment		= 16;
169
+static const int fas_loc_N				= 32;
170
+// File and stream running out of content
171
+static const int fas_stream_eof			= 64;
172
+static const int fas_stream_empty	    = 128;
173
+static const int fas_nuc_empty		    = 256;
174
+
175
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
176
+
177
+
178
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
179
+//
180
+// 	Struct definition
181
+//
182
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
183
+
184
+typedef struct fasta_stream
185
+{
186
+
187
+	fafStream *fasta;
188
+
189
+	char * fas;			// Raw fasta content
190
+	char * nuc;			// Processed pure Nucleotide sequence
191
+
192
+	char * iter;		// iterator for fas
193
+	char * nuc_iter;	// iterator for nuc
194
+
195
+	unsigned ffc;		// future fas content = fas_loc_Nchar - (iter-fas)
196
+	unsigned pfc;		// past   fas content = iter-fas (on demand)
197
+
198
+	unsigned fnc;		// future nuc content
199
+	unsigned pnc;		// past   nuc content
200
+
201
+	int stream_state;	// Carries stream state flags
202
+
203
+	unsigned nN;		// Number of N's so far seen
204
+	unsigned nSeq;		// Number of seq's so far seen
205
+	unsigned k;			// K-mer length
206
+
207
+} faStream;
208
+
209
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
210
+//
211
+// Constructing and and File operations
212
+//
213
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
214
+
215
+void faStream_destroy(faStream *fa)
216
+{
217
+	faf_destroy(fa->fasta);
218
+	fa->fasta = 0;
219
+
220
+	free(fa->fas);
221
+	fa->fas = 0;
222
+
223
+	free(fa->nuc);
224
+	fa->nuc = 0;
225
+
226
+	free(fa);
227
+}
228
+
229
+
230
+int fas_fill(faStream *fa)
231
+{
232
+	if(!faf_isEof(fa->fasta))
233
+	{
234
+		// Unused suffix in array?
235
+		if(fa->ffc > 0)
236
+		{
237
+			fa->pfc = fa->iter - fa->fas;
238
+
239
+			// Enough space at begin of array?
240
+			if(fa->pfc < fa->ffc)
241
+			{
242
+				fa->stream_state = fas_err;
243
+				return fa->stream_state;
244
+			}
245
+
246
+			// Shift unused suffix to begin
247
+			memcpy(fa->fas, fa->iter, fa->ffc);
248
+
249
+			// Fill rest of array from file
250
+			fa->ffc += faf_read(fa->fasta, fa->fas + fa->ffc, fa->pfc);
251
+		}
252
+		else
253
+		{
254
+			// Refill whole array
255
+			fa->ffc = faf_read(fa->fasta, fa->fas, fas_loc_Nchar);
256
+		}
257
+		fa->iter = fa->fas;
258
+	}
259
+
260
+	// Set flags
261
+	if(faf_isEof(fa->fasta))
262
+		fa->stream_state |= fas_stream_eof;
263
+
264
+	if(fa->ffc == 0)
265
+	{
266
+		fa->stream_state |= fas_stream_empty;
267
+		return fa->stream_state;
268
+	}
269
+
270
+	// Return success
271
+	fa->stream_state &= (~fas_stream_empty);
272
+	return fas_ok;
273
+}
274
+
275
+faStream * faStream_init(const char* filename, unsigned k, unsigned mode)
276
+{
277
+	if(k > fas_size)
278
+	{
279
+		printf("[faStream_init] k > fas_size!\n");
280
+		return 0;
281
+	}
282
+
283
+	faStream *fa = calloc(sizeof(faStream), 1);
284
+
285
+	if(mode == faf_gz)
286
+		fa->fasta = faf_stream_init(filename, faf_gz);
287
+	else
288
+		fa->fasta = faf_stream_init(filename, faf_txt);
289
+
290
+	if(!faf_isOpen(fa->fasta))
291
+	{
292
+		printf("[faStream_init] Cannot open file '%s'!\n", filename);
293
+		faStream_destroy(fa);
294
+		return 0;
295
+	}
296
+
297
+	fa->k=k;
298
+	fa->fas = calloc(fas_size, sizeof(char));
299
+	fa->nuc = calloc(fas_size, sizeof(char));
300
+
301
+	// fas_fill will read whole array
302
+	fa->stream_state = fas_stream_empty;
303
+	fa->ffc = 0;
304
+
305
+	// Provide initial filling
306
+	if(!fas_fill(fa))
307
+	{
308
+		printf("[faStream_init] Initial array filling failed!\n");
309
+		faStream_destroy(fa);
310
+		return 0;
311
+	}
312
+	return fa;
313
+}
314
+
315
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
316
+//
317
+// Check routines
318
+//
319
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
320
+
321
+static inline int fas_fas_end(faStream *fa)		{ return fa->ffc == 0; }
322
+static inline int fas_nuc_end(faStream *fa)		{ return fa->fnc == 0; }
323
+
324
+
325
+
326
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
327
+//
328
+// Skipping routines for Line feed, entire line and 'N' Nucleotides
329
+//
330
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
331
+
332
+
333
+static inline int fas_checkNewLine(faStream *fa)
334
+{
335
+	if(*fa->iter == fas_loc_NewLine)
336
+	{
337
+		fa->stream_state |= fas_loc_newLine;
338
+		return fa->stream_state;
339
+	}
340
+	return fas_ok;
341
+}
342
+
343
+static inline int fas_skipNewLine(faStream *fa)
344
+{
345
+	// Skips LF ('\n') character and
346
+	// eventually tries to fill array
347
+
348
+	if(*fa->iter == fas_loc_NewLine)
349
+	{
350
+		if(fa->ffc == 0)
351
+		{
352
+			if(!fas_fill(fa))
353
+				return fa->stream_state;
354
+		}
355
+		// Eat newline
356
+		++fa->iter;
357
+		--fa->ffc;
358
+		// Unset flag
359
+		fa->stream_state &= (~fas_loc_newLine);
360
+		return fas_ok;
361
+	}
362
+	return fa->stream_state;
363
+}
364
+
365
+
366
+static inline int fas_skipLine(faStream *fa)
367
+{
368
+	// Traverses until first position of next line
369
+	// and eventually tries to refill array
370
+	while(*fa->iter != fas_loc_NewLine)
371
+	{
372
+		if(fa->ffc > 0)
373
+		{
374
+			++fa->iter;
375
+			--fa->ffc;
376
+		}
377
+		else
378
+		{
379
+			if(!fas_fill(fa))
380
+				return fa->stream_state;
381
+		}
382
+	}
383
+	return fas_skipNewLine(fa);
384
+}
385
+
386
+static inline int fas_checkNewSeq(faStream *fa)
387
+{
388
+	// Checks for Seq-delimiter at current position
389
+	// and eventually sets flag
390
+	// (does not proceed)
391
+	if(*fa->iter == fas_seq_delim)
392
+	{
393
+		fa->stream_state |= fas_loc_newSeq;
394
+		return fa->stream_state;
395
+	}
396
+	return fas_ok;
397
+}
398
+
399
+static inline int fas_skipSeqHeader(faStream *fa)
400
+{
401
+	if(*fa->iter==fas_seq_delim)
402
+	{
403
+		if(!fas_skipLine(fa))
404
+		{
405
+			fa->stream_state&=(~fas_loc_newSeq);
406
+			return fas_ok;
407
+		}
408
+	}
409
+	return fa->stream_state;
410
+}
411
+
412
+static inline int fas_checkComment(faStream *fa)
413
+{
414
+	if(*fa->iter == fas_comment)
415
+	{
416
+		fa->stream_state |= fas_loc_comment;
417
+		return fa->stream_state;
418
+	}
419
+	return fas_ok;
420
+}
421
+
422
+static inline int fas_skipComment(faStream *fa)
423
+{
424
+	if(*fa->iter == fas_comment)
425
+	{
426
+		if(!fas_skipLine(fa))
427
+		{
428
+			fa->stream_state &= (~fas_loc_comment);
429
+			return fas_ok;
430
+		}
431
+	}
432
+	return fa->stream_state;
433
+}
434
+
435
+static inline int fas_checkN(faStream *fa)
436
+{
437
+	if(ACGTN[ (unsigned)*fa->iter ] == nval)
438
+	{
439
+		fa->stream_state |= fas_loc_N;
440
+		return fa->stream_state;
441
+	}
442
+	return fas_ok;
443
+}
444
+
445
+static inline int fas_skipN(faStream *fa)
446
+{
447
+	// Proceeds in array as long as 'N' nucleotide is present.
448
+	// Tries to skip Newline (LF) characters
449
+
450
+	// Eventually checks for new sequence and eventually
451
+	// sets flag (No initializing on new sequence).
452
+
453
+	// Eventually tries to refill array.
454
+
455
+	while(ACGTN[(unsigned)*fa->iter] == nval)
456
+	{
457
+		++fa->iter;
458
+		--fa->ffc;
459
+		++fa->nN;
460
+
461
+		// Array exhausted
462
+		if(fa->ffc == 0)
463
+		{
464
+			if(!fas_fill(fa))
465
+				return fa->stream_state;
466
+		}
467
+		// Newline found
468
+		if(!fas_checkNewLine(fa))
469
+		{
470
+			if(!fas_skipNewLine(fa))
471
+				return fa->stream_state;
472
+
473
+			while(!fas_checkComment(fa))
474
+			{
475
+				if(!fas_skipComment(fa))
476
+					return fa->stream_state;
477
+			}
478
+			if(!fas_checkNewSeq(fa))
479
+				return fa->stream_state;
480
+		}
481
+	}
482
+	// Unset N-flag
483
+	fa->stream_state &= (~fas_loc_N);
484
+	return fas_ok;
485
+}
486
+
487
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
488
+//
489
+// Routines:
490
+// 			fas_getSeqName
491
+//			fas_findNextKmer
492
+//			fas_initSeq
493
+//
494
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
495
+
496
+char * fas_getSeqName(faStream *fa)
497
+{
498
+	// From actual sequence delimiter:
499
+	// Copies text until empty space (or newline)
500
+	// into new char array.
501
+
502
+	// Char array will be returned und must be free'd from outside.
503
+
504
+	// Eventually tries to refill array
505
+
506
+	if(*fa->iter == fas_seq_delim)
507
+	{
508
+		char *seq_name = fa->iter;
509
+		unsigned ffc = fa->ffc;
510
+
511
+		while(!( (*seq_name==' ') || (*seq_name==fas_loc_NewLine) || (ffc==0) ) )
512
+		{
513
+			++seq_name;
514
+			--ffc;
515
+
516
+			// End of array reached
517
+			if(ffc==0)
518
+			{
519
+				// Try to refill
520
+				if(!(fa->stream_state & fas_stream_eof))
521
+					fas_fill(fa);
522
+				else
523
+					return 0;
524
+
525
+				// Restart search
526
+				seq_name = fa->iter;
527
+				ffc = fa->ffc;
528
+			}
529
+		}
530
+
531
+		// Skip first char='>'
532
+		unsigned i, nchar;
533
+
534
+		nchar = seq_name-fa->iter - 1;
535
+
536
+		// Must be free'd from outside
537
+		char *ret = calloc(sizeof(char), nchar + 1);
538
+
539
+		seq_name = fa->iter;
540
+		++seq_name;
541
+		for(i=0; i<nchar; ++i)
542
+		{
543
+			ret[i] = *seq_name;
544
+			++seq_name;
545
+		}
546
+		return ret;
547
+	}
548
+	return 0;
549
+}
550
+
551
+int fas_initSeq(faStream *fa)
552
+{
553
+	// + + + + + + + + + + + + + + + + + + + + + //
554
+	// Performs initializing steps when new
555
+	// sequence delimiter ('>') is found:
556
+	//
557
+	// A) Clears newSeq flag
558
+	// B) Skip actual line
559
+	// C) Skip following comment lines
560
+	// D) Calls findNextKmer
561
+	// (there is no testing for comments
562
+	// at other positions).
563
+	// + + + + + + + + + + + + + + + + + + + + + //
564
+
565
+	// Clear newSeq flag
566
+	fa->stream_state &= (~fas_loc_newSeq);
567
+
568
+	if(*fa->iter == fas_seq_delim)
569
+	{
570
+		// Skip header line
571
+		if(fas_skipLine(fa) != fas_ok)
572
+			return fa->stream_state;
573
+
574
+		// Skip comment lines
575
+		while(*fa->iter == fas_comment)
576
+		{
577
+			if(fas_skipLine(fa) != fas_ok)
578
+				return fa->stream_state;
579
+		}
580
+	}
581
+	return fas_ok;
582
+}
583
+
584
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
585
+//
586
+// Routines:
587
+// 			fas_returnFromTranfer
588
+//			fas_TransferNucArray
589
+//
590
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
591
+
592
+static inline int fas_returnFromTranfer(faStream *fa)
593
+{
594
+	// Joint terminating routine for
595
+	// TransferNucArray function.
596
+
597
+	if((fa->nuc_iter - fa->nuc) >= fa->k)
598
+	{
599
+		// A)	At least k nucleotides copied
600
+		// 		-> Success:
601
+		//		Add string terminator.
602
+		if(fa->fnc > 0)
603
+			*fa->nuc_iter = fas_eoc;
604
+
605
+		//		Set nuc_ready state.
606
+		fa->stream_state |= fas_nuc_ready;
607
+		fa->stream_state &= (~fas_nuc_empty);
608
+		return fas_ok;
609
+	}
610
+	else
611
+	{
612
+		// B)	Not enough nucleotides
613
+		//		four counting
614
+		//		-> Failure:
615
+		//		Reset nuc array.
616
+		fa->nuc_iter = fa->nuc;
617
+		*fa->nuc = '\0';
618
+
619
+		//		Set nuc empty state
620
+		fa->stream_state &= (~fas_nuc_ready);
621
+		fa->stream_state |= fas_nuc_empty;
622
+		return fa->stream_state;
623
+	}
624
+}
625
+
626
+int fas_TransferNucArray(faStream *fa)
627
+{
628
+	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
629
+	// Copies continuous Nucleotide sequence from
630
+	// input  fasta stream		(fas array) to
631
+	// output nucleotide stream	(nuc array).
632
+	//
633
+	// Tries to recover when newline character is found or
634
+	// end of fas-array is reached.
635
+	//
636
+	// Transaction skips newline characters and terminates
637
+	// when either 'N' or '>' (sequence delimiter)
638
+	// is found.
639
+	//
640
+	// Copying always starts at begin of nuc array
641
+	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
642
+
643
+	// Goto begin of nuc array.
644
+	fa->nuc_iter = fa->nuc;
645
+	fa->fnc = fas_loc_Nchar;
646
+
647
+
648
+	while(!fas_nuc_end(fa))
649
+	{
650
+		// Copy nucleotides
651
+		while( (ACGT[ (unsigned)*fa->iter ] != zval) && (fa->ffc > 0) && (fa->fnc > 0) )
652
+		{
653
+			*fa->nuc_iter = *fa->iter;
654
+			++fa->iter;
655
+			++fa->nuc_iter;
656
+			--fa->ffc;
657
+			--fa->fnc;
658
+		}
659
+
660
+		if(fas_checkNewLine(fa))
661
+		{
662
+			if(!fas_skipNewLine(fa))
663
+				return fas_returnFromTranfer(fa);
664
+
665
+			if(fas_checkNewSeq(fa))
666
+				return fas_returnFromTranfer(fa);
667
+
668
+			while(fas_checkComment(fa))
669
+			{
670
+				if(fas_skipComment(fa))
671
+					return fas_returnFromTranfer(fa);
672
+			}
673
+			// Here: NewLine readily skipped
674
+		}
675
+		else if(fas_fas_end(fa))
676
+		{
677
+			if(!fas_fill(fa))
678
+				return fas_returnFromTranfer(fa);
679
+		}
680
+		else if(fas_checkN(fa))
681
+			return fas_returnFromTranfer(fa);
682
+	}
683
+
684
+	// Nuc array is full
685
+	fa->stream_state |= fas_nuc_ready;
686
+	return fas_ok;
687
+}
688
+
689
+
690
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
691
+//
692
+// Routines: Accessors for stream flags.
693
+//
694
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
695
+
696
+
697
+int fa_empty(faStream *fa)    { return (fa->stream_state & fas_stream_empty); }
698
+int fa_NucReady(faStream *fa) { return (fa->stream_state & fas_nuc_ready);    }
699
+int fa_K_Ready(faStream *fa)  { return (fa->stream_state & fas_loc_kReady);   }
700
+int fa_NewSeq(faStream *fa)   { return (fa->stream_state & fas_loc_newSeq);   }
701
+int fa_N_Nuc(faStream *fa)    { return (ACGTN[(unsigned)*fa->iter] == nval);  }
702
+
703
+// Is unset when Nuc-Array is processed.
704
+void fa_unsetNucReady(faStream *fa)
705
+{
706
+	fa->stream_state &= (~fas_nuc_ready);
707
+	fa->stream_state |= fas_nuc_empty;
708
+}
709
+
710
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
711
+
712
+
713
+#endif /* DNA_FASTA_STREAM_H_ */