Browse code

add seqTools package

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@95415 bc3139a8-67e5-0310-9ffc-ced21a209358

Herve Pages authored on 13/10/2014 19:02:34
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,570 @@
1
+/*
2
+ * fa_traverse.h
3
+ *
4
+ *  Created on: 08.10.2013
5
+ *      Author: kaisers
6
+ */
7
+
8
+#ifndef FA_TRAVERSE_H_
9
+#define FA_TRAVERSE_H_
10
+
11
+#include "stat_defs.h"
12
+#include "dna_astream.h"
13
+
14
+///////////////////////////////////////////////////////////////////////////////////////////////////
15
+//
16
+// 	Fasta traverse:
17
+//				Contains dna_astream which is traversed.
18
+//				Identified Nucleotide sequences of length >= k will be copied to
19
+//				das.pos array.
20
+//
21
+//				Defines subroutines for skipping header-lines, N-array and comment lines.
22
+//
23
+///////////////////////////////////////////////////////////////////////////////////////////////////
24
+
25
+
26
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
27
+//
28
+// 	DNA file stream:
29
+//				Struct layer for reading text into char * buffer
30
+//				from either uncompressed or compressed files.
31
+//
32
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
33
+
34
+
35
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
36
+//
37
+// Definition of static constants
38
+//
39
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
40
+
41
+static const unsigned  		fat_array_size  =0x1000;	// char array capacity
42
+
43
+static const char			fat_char_newSeq	='>';		// Fasta sequence delimiter
44
+static const char			fat_char_cmt	=';'; 		// Fasta comment  delimiter
45
+static const char			fat_char_lf		='\n';		// Line feed
46
+static const char			fat_char_eos	='\0';		// End of string
47
+
48
+static const int			fat_ok			=0;
49
+static const int			fat_empty		=1;
50
+static const int			fat_loc			=2;
51
+static const int			fat_newSeq		=4;
52
+static const int			fat_nucReady	=8;
53
+
54
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
55
+//
56
+// 	Struct definition, basic accessors; constructor and destructors:
57
+//		struct faTraverse
58
+//		fatEmpty, fatNewSeq, fatNucReady
59
+//		fat_destroy, fat_init
60
+//
61
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
62
+
63
+typedef struct fasta_traverse
64
+{
65
+	daStream *das;
66
+
67
+	int state;
68
+	unsigned k;
69
+
70
+	unsigned nN;		// Number of N's so far seen
71
+	unsigned nSeq;		// Number of sequences
72
+	unsigned nCom;		// Number of comment lines
73
+	unsigned nFill;		// Number of fill operations.
74
+
75
+} faTraverse;
76
+
77
+int fatEmpty (faTraverse* fat)  { return dasEmpty (fat->das); }
78
+int fatIsOpen(faTraverse* fat)	{ return dasIsOpen(fat->das); }
79
+int fatIsEof (faTraverse* fat)	{ return dasIsEof (fat->das); }
80
+int fatNewSeq(faTraverse *fat)  { return fat->state & fat_newSeq; }
81
+
82
+// Only purposed for use in top-level:
83
+// resets nucReady when called.
84
+int fatNucReady(faTraverse *fat)
85
+{
86
+	if(fat->state & fat_nucReady)
87
+	{
88
+		fat->state &= (~fat_nucReady);
89
+		return 1;
90
+	}
91
+	return 0;
92
+}
93
+
94
+
95
+void fat_destroy(faTraverse *fat)
96
+{
97
+	if(fat)
98
+	{
99
+		das_destroy(fat->das);
100
+		free(fat);
101
+	}
102
+}
103
+
104
+faTraverse * fat_init(const char* filename, int k)
105
+{
106
+	faTraverse * fat = (faTraverse*) calloc(sizeof(faTraverse), 1);
107
+	if(!fat)
108
+		return 0;
109
+
110
+	fat->das = das_init(filename, fat_array_size);
111
+	if(!fat->das)
112
+	{
113
+		//printf("[fat_init] das_init returned 0!\n");
114
+		free(fat);
115
+		return 0;
116
+	}
117
+	fat->k = (unsigned) k;
118
+
119
+	// Returns memory initialized object.
120
+	// File may be closed.
121
+	// File may be empty.
122
+	// Stream is not yet initialized.
123
+	return fat;
124
+}
125
+
126
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
127
+// Check-Fill: Checks for empty rfc (frs==0) and eventually fills das.
128
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
129
+
130
+int fatCheckFill(faTraverse *fat)
131
+{
132
+	//if(fat->das->frs==0)
133
+	if(dasEmpty(fat->das) )
134
+	{
135
+		//printf("[fatCheckFill] ...\n");
136
+		if(das_fill(fat->das))
137
+		{
138
+			++(fat->nFill);
139
+			fat->state |= fat_empty;
140
+			//printf("[fatCheckFill] return empty.\n");
141
+			return fat->state;
142
+		}
143
+		fat->state &= (~fat_empty);
144
+	}
145
+	return fat_ok;
146
+}
147
+
148
+
149
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
150
+// Check section (Must be placed beforehand skip-routines)
151
+//		fat_checkNewLine
152
+//		fat_checkNewSeq
153
+//		fat_checkComment
154
+//		fat_checkN
155
+//		fat_check_nuc
156
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
157
+
158
+static R_INLINE int fat_checkNewLine(faTraverse *fat)
159
+{
160
+	if(LFCR[(unsigned)*fat->das->r_iter] == lfcv)
161
+	{
162
+		fat->state |= fat_loc;
163
+		return fat->state;
164
+	}
165
+	return fat_ok;
166
+}
167
+
168
+static R_INLINE int fat_checkNewSeq(faTraverse *fat)
169
+{
170
+	if(*fat->das->r_iter == fat_char_newSeq)
171
+	{
172
+		fat->state |= fat_newSeq;
173
+		return fat->state;
174
+	}
175
+	return fat_ok;
176
+}
177
+
178
+static R_INLINE int fat_checkComment(faTraverse *fat)
179
+{
180
+	if(*fat->das->r_iter==fat_char_cmt)
181
+	{
182
+		//printf("[checkCmt] Comment found!\n");
183
+		fat->state|=fat_loc;
184
+		return fat->state;
185
+	}
186
+	return fat_ok;
187
+}
188
+
189
+static R_INLINE int fat_checkN(faTraverse *fat)
190
+{
191
+	if(ACGTN[(unsigned)*fat->das->r_iter] == nval)
192
+	{
193
+
194
+		fat->state |= fat_loc;
195
+		return fat->state;
196
+	}
197
+	return fat_ok;
198
+}
199
+
200
+static R_INLINE int fat_check_nuc(faTraverse *fat)
201
+{
202
+	if(ACGT[ ((unsigned) *fat->das->r_iter) ] == zval)
203
+	{
204
+		fat->state |= fat_loc;
205
+		return fat->state;
206
+	}
207
+	return fat_ok;
208
+}
209
+
210
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
211
+// Skip section (Do not reorder...)
212
+//			fat_skipNewLine
213
+//			fat_skipLine
214
+//			fat_skipComment
215
+//			fat_skipN
216
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
217
+
218
+static R_INLINE int fat_skipNewLine(faTraverse *fat)
219
+{
220
+	// Skips LF ('\n', ASCII 10)  and CR (ASCII 13) character and
221
+	// eventually tries to fill array
222
+	if(LFCR[ (unsigned)*fat->das->r_iter ] == lfcv)
223
+	{
224
+		if(fatCheckFill(fat))
225
+			return fat->state;
226
+
227
+		// Eat newline
228
+		++fat->das->r_iter;
229
+
230
+		// Unset flag
231
+		fat->state &= (~fat_loc);
232
+
233
+		return fat_ok;
234
+	}
235
+	return fat->state;
236
+}
237
+
238
+static R_INLINE int fat_skipLine(faTraverse *fat)
239
+{
240
+	// Traverses until first position of next line
241
+	// and eventually tries to refill array
242
+
243
+	daStream *das = fat->das;
244
+	while(LFCR[ (unsigned)*fat->das->r_iter ] == zval)
245
+	{
246
+		++das->r_iter;
247
+		if(fatCheckFill(fat))
248
+			return fat->state;
249
+	}
250
+	//printf("[fat_skipLine] post iter: '%s'\n",das->r_iter);
251
+	return fat_skipNewLine(fat);
252
+}
253
+
254
+static R_INLINE int fat_skipComment(faTraverse *fat)
255
+{
256
+	if(*fat->das->r_iter == fat_char_cmt)
257
+	{
258
+		++fat->nCom;
259
+		if(fat_skipLine(fat))
260
+		{
261
+			fat->state &= (~fat_loc);
262
+			return fat_ok;
263
+		}
264
+	}
265
+	return fat->state;
266
+}
267
+
268
+static R_INLINE int fat_skipN(faTraverse *fat)
269
+{
270
+	// Proceeds in array as long as 'N' nucleotide is present.
271
+	// Tries to skip Newline (LF) characters
272
+
273
+	// Eventually checks for new sequence and eventually
274
+	// sets flag (No initializing on new sequence).
275
+
276
+	// Eventually tries to refill array.
277
+
278
+	if(fatEmpty(fat))
279
+		return fat->state;
280
+
281
+	daStream *das=fat->das;
282
+	while(ACGTN[ (unsigned)*das->r_iter ] == nval)
283
+	{
284
+		++das->r_iter;
285
+		++fat->nN;
286
+
287
+		if(fatCheckFill(fat))
288
+			return fat->state;
289
+
290
+		//printf("[skip  N] Post fill '%s'\n",das->r_iter);
291
+
292
+		// Function does not return
293
+		// with r_iter on newline
294
+		if(fat_checkNewLine(fat))
295
+			if(fat_skipNewLine(fat))
296
+				return fat->state;
297
+	}
298
+
299
+	// Unset N-flag
300
+	fat->state &= (~fat_loc);
301
+	return fat_ok;
302
+}
303
+
304
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
305
+//
306
+// Routines:
307
+// 			fat_getSeqName
308
+//			fat_findNextKmer
309
+//			fat_initSeq
310
+//
311
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
312
+
313
+char * fat_getSeqName(faTraverse *fat)
314
+{
315
+	// From actual sequence delimiter:
316
+	// Copies text until empty space (or newline)
317
+	// into new char array.
318
+
319
+	// Char array will be returned und must be free'd from outside.
320
+
321
+	// Eventually tries to refill array
322
+	daStream *das = fat->das;
323
+	unsigned nchar = 0;
324
+
325
+	//printf("[fat_getSeqName] '%s'\n",fat->das->r_iter);
326
+
327
+	if(*das->r_iter == fat_char_newSeq)
328
+	{
329
+		char *seq_name = das->r_iter;
330
+		while( !( (*seq_name==' ') || (*seq_name==fat_char_lf) ) )
331
+		{
332
+			++seq_name;
333
+			++nchar;
334
+
335
+			// End of array reached
336
+			//printf("[fat_getSeqName]  seq_name: '%s'\n",seq_name);
337
+			if(seq_name == das->r_end)
338
+			{
339
+				if(das_fill(das))
340
+				{
341
+					//printf("[fat_getSeqName] empty seq name.\n");
342
+					return 0;
343
+				}
344
+				// Restart search
345
+				seq_name = fat->das->r_iter;
346
+				nchar = 0;
347
+			}
348
+		}
349
+
350
+		if(nchar == 0)
351
+		{
352
+			//printf("[fat_getSeqName] empty seq name.\n");
353
+			return 0;
354
+		}
355
+
356
+		// Skip first char='>'
357
+		--nchar;
358
+		seq_name=fat->das->r_iter;
359
+		++seq_name;
360
+
361
+		//printf("[fat_getSeqName] copy: '%s'\tnchar=%u\n",das->r_iter,nchar);
362
+
363
+		// Must be free'd from outside
364
+		char *ret = calloc(sizeof(char), nchar + 1);
365
+		unsigned i;
366
+
367
+		for(i=0; i<nchar; ++i)
368
+		{
369
+			ret[i] = *seq_name;
370
+			++seq_name;
371
+		}
372
+		//printf("[seq name] success.\t'%s'\n",fat->das->r_iter);
373
+		return ret;
374
+	}
375
+	return 0;
376
+}
377
+
378
+
379
+// ToDo: Change into loop entry point from top_level
380
+int fat_skipSeqHeader(faTraverse *fat)
381
+{
382
+	if(fat->state & fat_newSeq)
383
+	{
384
+		++fat->nSeq;
385
+		fat->state &= (~fat_newSeq);
386
+		// Skip header line
387
+		if(fat_skipLine(fat))
388
+			return fat->state;
389
+	}
390
+	return fat_ok;
391
+}
392
+
393
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
394
+//
395
+// Routines:
396
+// 			fat_returnFromTranfer
397
+//			fat_procNuc
398
+//
399
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
400
+
401
+static R_INLINE int fat_retProcNuc(faTraverse *fat)
402
+{
403
+	// Joint terminating routine for
404
+	// procNuc function.
405
+	daStream *das=fat->das;
406
+	//printf("[ret proc] plen: %lu\t'%s'\n",(das->p_iter-das->pos),das->pos);
407
+
408
+	if( (das->p_iter-das->pos) >= (fat->k) )
409
+	{
410
+		// A)	At least k nucleotides copied
411
+		// 		-> Success:
412
+		//		Add string terminator.
413
+		if((das->p_iter-das->pos) > 0)
414
+		{
415
+			das->npPos = (int)(das->p_iter - das->pos);
416
+			*das->p_iter = fat_char_eos;
417
+		}
418
+
419
+		//		Set nuc_ready state.
420
+		fat->state |= fat_nucReady;
421
+		fat->state &= (~fat_loc);
422
+
423
+		//printf("[proc nuc] NUC READY: '%s'\n",das->pos);
424
+		return fat->state;
425
+	}
426
+	else
427
+	{
428
+		// B)	Not enough nucleotides
429
+		//		four counting
430
+		//		-> Failure:
431
+		//		Reset pos array.
432
+		das->p_iter = das->pos;
433
+		*das->p_iter = '\0';
434
+		//		Set pos empty state
435
+		fat->state &= (~fat_nucReady);
436
+		fat->state |= fat_loc;
437
+		//printf("[proc nuc] Nuc_empty\n");
438
+		return fat_ok;
439
+	}
440
+}
441
+
442
+
443
+int fat_procNuc(faTraverse *fat)
444
+{
445
+	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
446
+	// Copies continuous Nucleotide sequence from
447
+	// input  fasta stream		(rfc array) to
448
+	// output nucleotide stream	(pos array).
449
+	//
450
+	// Tries to recover when newline character is found or
451
+	// end of rfc-array is reached.
452
+	//
453
+	// Copying always starts at begin of nuc array
454
+	//
455
+	// Returns fat_ok (=0) when *NO* output is created.
456
+	//
457
+	// Allows 'if(fat_procNuc) => process output'
458
+	// testing.
459
+	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
460
+
461
+	daStream *das = fat->das;
462
+	fat->state &= (~fat_nucReady);
463
+
464
+	//printf("[proc Nuc] start: '%s'\n",das->r_iter);
465
+	if((ACGT[ (unsigned)*das->r_iter ] != zval))
466
+	{
467
+		// Goto begin of proc array.
468
+		das->p_iter = das->pos;
469
+
470
+		//unsigned i=0,j;
471
+		while(!dasEmpty(das))
472
+		{
473
+			// Copy nucleotides
474
+			while( !( (ACGT[ (unsigned)*das->r_iter ] == zval) || dasEmpty(das) || dasProcEmpty(das) ) )
475
+			{
476
+				*das->p_iter = *das->r_iter;
477
+				++das->r_iter;
478
+				++das->p_iter;
479
+			}
480
+
481
+			// Two recoverable conditions:
482
+			if(fat_checkNewLine(fat))
483
+			{
484
+				//printf("[proc Nuc] New Line.\n");
485
+				if(fat_skipNewLine(fat))
486
+					return fat_retProcNuc(fat);
487
+			}
488
+			else if(fatEmpty(fat)) // Can't be checked with fatCheckFill
489
+			{
490
+				//printf("[proc Nuc] fatEmpty.\n");
491
+				if(das_fill(das))
492
+					return fat_retProcNuc(fat);
493
+			}
494
+			else // Anything else -> break
495
+			{
496
+				//printf("[proc Nuc] else ...\n");
497
+				return fat_retProcNuc(fat);
498
+			}
499
+			//printf("[proc Nuc] End while pos: '%s'\n",das->pos);
500
+		}
501
+
502
+		// Pos array is full
503
+		fat->state |= fat_nucReady;
504
+		fat->state &= (~fat_loc);
505
+		//printf("[proc Nuc] return fat_ok\n");
506
+		return fat_retProcNuc(fat);
507
+		//return fat->state;
508
+	}
509
+	else //if(fat_checkNewLine(fat))
510
+	{
511
+		//printf("[proc Nuc] AGCT->zval\n");
512
+		if(fat_checkNewLine(fat))
513
+			fat_skipNewLine(fat);
514
+
515
+			/*
516
+			if(fat_skipNewLine(fat))
517
+			{
518
+				Rprintf("[proc Nuc] Newline skipped: '%s'\n", das->r_iter);
519
+			}
520
+			else
521
+				Rprintf("[proc Nuc] Found ASCII %u\n", (unsigned char)(*fat->das->r_iter));
522
+		*/
523
+	}
524
+
525
+	fat->state |= fat_loc;
526
+	return fat->state;
527
+}
528
+
529
+
530
+
531
+int fat_findKarray(faTraverse *fat)
532
+{
533
+	//fat->state&=(~fat_nucReady);
534
+	//printf("[fat_findKarray] Start: '%s'\t%u\tstrlen: %lu\n",fat->das->r_iter,fatEmpty(fat),strlen(fat->das->r_iter));
535
+	if(!fatEmpty(fat))
536
+	{
537
+		if(fat_checkN(fat))
538
+		{
539
+			//printf("[fat_findKarray] skipN.\n");
540
+			fat_skipN(fat);
541
+		}
542
+		if(fat_checkNewSeq(fat))
543
+		{
544
+			//printf("[fat_findKarray] New seq found.\n");
545
+			return fat->state;
546
+		}
547
+		if(fat_checkComment(fat))
548
+		{
549
+			fat_skipComment(fat);
550
+			//printf("[fat_findKarray] post cmt: '%s'\n",fat->das->r_iter);
551
+		}
552
+		if(fat_procNuc(fat))
553
+		{
554
+			//printf("[fat_findKarray] procNuc return state: %i\n",fat->state);
555
+			return fat->state;
556
+		}
557
+		/*
558
+		if(fat_checkNewLine(fat))
559
+		{
560
+			printf("[fat_findKarray] CheckNewLine!\n");
561
+		}
562
+		*/
563
+	}
564
+	//printf("[find Kar] Something else: '%s'\n",fat->das->r_iter);
565
+	return fat_ok;
566
+}
567
+
568
+
569
+
570
+#endif /* FA_TRAVERSE_H_ */