Browse code

add seqTools package

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/seqTools@95415 bc3139a8-67e5-0310-9ffc-ced21a209358

Herve Pages authored on 13/10/2014 19:02:34
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,384 @@
1
+/*
2
+ * fq_traverse.h
3
+ *
4
+ *  Created on: 12.10.2013
5
+ *      Author: wolfgang
6
+ */
7
+
8
+#ifndef FQ_TRAVERSE_H_
9
+#define FQ_TRAVERSE_H_
10
+
11
+#include "stat_defs.h"
12
+#include "dna_astream.h"
13
+
14
+///////////////////////////////////////////////////////////////////////////////////////////////////
15
+//
16
+// 	Fastq traverse:
17
+//				Contains dna_astream which is traversed.
18
+//				The <seqname> following '+' is optional,
19
+//					but if it appears right after '+', it should be identical to the <seqname> following '@'.
20
+//				The length of <seq> is identical the length of <qual>. Each character in <qual> represents the phred quality of the corresponding nucleotide in <seq>.
21
+
22
+//
23
+///////////////////////////////////////////////////////////////////////////////////////////////////
24
+
25
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
26
+//
27
+// Definition of static constants
28
+//
29
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
30
+
31
+static const unsigned long  faq_array_size  	=0xFFFFFF;	// char array capacity
32
+
33
+static const char			faq_char_sqDelim 	='@';	// Fastq sequence delimiter
34
+static const char			faq_char_qlDelim 	='+';	// Fastq quality  delimiter
35
+static const char			faq_char_lf			='\n';	// Line feed
36
+static const char			faq_char_eos		='\0';	// End of string
37
+
38
+static const int			faq_ok				=0;
39
+static const int			faq_empty			=1;
40
+static const int			faq_newSeq			=4;
41
+
42
+
43
+typedef struct fastq_traverse
44
+{
45
+	daStream *das;
46
+
47
+	int state;
48
+	unsigned nSeq;		// Number of sequences
49
+	unsigned nFill;		// Number of fill operations.
50
+	unsigned nProcFull;	// Number of times where proc array was filled up.
51
+
52
+	unsigned minSeqLen;
53
+	unsigned maxSeqLen;
54
+
55
+	unsigned lastSeqLen;
56
+	unsigned nUneqLeqLen;
57
+
58
+} fqTraverse;
59
+
60
+
61
+int faqEmpty(fqTraverse* faq)	{ return dasEmpty(faq->das);  }
62
+int faqIsOpen(fqTraverse* faq)	{ return dasIsOpen(faq->das); }
63
+int faqIsEof(fqTraverse* faq)	{ return dasIsEof(faq->das); }
64
+
65
+void faq_destroy(fqTraverse *faq)
66
+{
67
+	if(faq)
68
+	{
69
+		das_destroy(faq->das);
70
+		free(faq);
71
+	}
72
+}
73
+
74
+
75
+fqTraverse * faq_init(const char* filename, unsigned mode)
76
+{
77
+	fqTraverse * faq=(fqTraverse*)calloc(sizeof(fqTraverse),1);
78
+
79
+	if(!faq)
80
+		return 0;
81
+
82
+	faq->das=das_init(filename,mode,faq_array_size);
83
+
84
+	if(!faq->das)
85
+	{
86
+		//printf("[faq_init] das_init returned 0!\n");
87
+		free(faq);
88
+		return 0;
89
+	}
90
+	// initialize with large value
91
+	--faq->minSeqLen;
92
+
93
+	// Returns memory initialized object.
94
+	// File may be closed.
95
+	// File may be empty.
96
+	// Stream is not yet initialized.
97
+	return faq;
98
+}
99
+
100
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
101
+// Check-Fill: Checks for empty rfc (frs==0) and eventually fills das.
102
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
103
+
104
+int faqCheckFill(fqTraverse *faq)
105
+{
106
+	//printf("[faqCheckFill].\n");
107
+	if(dasEmpty(faq->das))
108
+	{
109
+		//printf("[faqCheckFill] ->das_fill\n");
110
+		if(das_fill(faq->das))
111
+		{
112
+			++(faq->nFill);
113
+			faq->state|=faq_empty;
114
+			return faq->state;
115
+		}
116
+		faq->state&=(~faq_empty);
117
+	}
118
+	return faq_ok;
119
+}
120
+
121
+int faqCheckCapFill(fqTraverse *faq,int capacity)
122
+{
123
+	daStream *das=faq->das;
124
+	if(das->r_end-das->r_iter<capacity)
125
+	{
126
+		if(das_fill(das))
127
+		{
128
+			++(faq->nFill);
129
+			faq->state|=faq_empty;
130
+			return faq->state;
131
+		}
132
+		faq->state&=(~faq_empty);
133
+	}
134
+	return faq_ok;
135
+}
136
+
137
+
138
+
139
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
140
+// Check section (Must be placed beforehand skip-routines)
141
+//		faq_checkNewLine
142
+//		faq_checkSeqHeader
143
+//		faq_checkComment
144
+//		faq_checkN
145
+//		faq_check_nuc
146
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
147
+
148
+
149
+
150
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
151
+// Skip section (Do not reorder...)
152
+//			faq_checkSkipNewLine
153
+//			faq_skipLine
154
+//			faq_skipComment
155
+//			faq_skipN
156
+// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
157
+
158
+static inline int faq_checkSkipNewLine(fqTraverse *faq)
159
+{
160
+	// Skips LF ('\n') character and
161
+	// eventually tries to fill array
162
+
163
+	//printf("[CheckNewLine].\n");
164
+	if(*faq->das->r_iter==faq_char_lf)
165
+	{
166
+		if(faqCheckCapFill(faq,2))
167
+			return faq->state;
168
+		// Eat newline
169
+		++faq->das->r_iter;
170
+		return faq_ok;
171
+	}
172
+	return faq->state;
173
+}
174
+
175
+static inline int faq_skipLastNewLine(fqTraverse *faq)
176
+{
177
+	daStream *das=faq->das;
178
+	if((*faq->das->r_iter==faq_char_lf) & (das->dnaf->state==dfs_file_closed))
179
+	{
180
+		++das->r_iter;
181
+		return faq_ok;
182
+	}
183
+	return faq_empty;
184
+}
185
+
186
+
187
+static inline int faq_skipLine(fqTraverse *faq)
188
+{
189
+	// Traverses until first position of next line
190
+	// and eventually tries to refill array
191
+	daStream *das=faq->das;
192
+	while(*das->r_iter!=faq_char_lf)
193
+	{
194
+		++das->r_iter;
195
+		if(faqCheckFill(faq))
196
+			return faq->state;
197
+	}
198
+	++das->r_iter;
199
+
200
+	if(faqCheckFill(faq))
201
+		return faq->state;
202
+
203
+	return faq_ok;
204
+}
205
+
206
+int faq_procNuc(fqTraverse *faq)
207
+{
208
+	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
209
+	// Copies one line of continuous Nucleotide sequence
210
+	// from:	input  fastq stream			(rfc array)
211
+	// to  :	output nucleotide stream	(pos array)
212
+	//
213
+	// Copy procedure ends when:
214
+	//	A) A non nucleotide is found.
215
+	//	B) Quality header ('+') is found.
216
+	//	C) rfc array runs empty.
217
+	//	D) pos array is filled up.
218
+	//
219
+	// Tries to recover when end of rfc-array is reached.
220
+	// Copying always starts at begin of nuc array
221
+	//
222
+	// Returns fat_ok (=0) when *NO* output is created.
223
+	//
224
+	// Allows 'if(fat_procNuc) => process output'
225
+	// testing.
226
+	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
227
+
228
+	daStream *das=faq->das;
229
+	unsigned seqLen=0;
230
+
231
+	//printf("[faq pNuc] start: '%s'\n",das->r_iter);
232
+
233
+	// Empty array must be prevented here
234
+	if(faqCheckFill(faq))
235
+	{
236
+		faq->lastSeqLen=seqLen;
237
+		return seqLen;
238
+	}
239
+
240
+
241
+	if(*faq->das->r_iter==faq_char_sqDelim)
242
+	{
243
+		// Skip header line
244
+		if(faq_skipLine(faq))
245
+		{
246
+			faq->lastSeqLen=seqLen;
247
+			return seqLen;
248
+		}
249
+
250
+		// Goto begin of pos array.
251
+		das->p_iter=das->pos;
252
+		//printf("[faq pNuc] while: '%s'\n",das->r_iter);
253
+		++faq->nSeq;
254
+
255
+		while(!((ACGT[(unsigned)*das->r_iter]==zval) || *das->r_iter==faq_char_qlDelim ) )
256
+		{
257
+
258
+			*das->p_iter=*das->r_iter;
259
+			++das->r_iter;
260
+			++das->p_iter;
261
+			++seqLen;
262
+			//printf("[faq pNuc]  fill: '%s'\n",das->r_iter);
263
+			// Two recoverable conditions:
264
+			if(faqCheckFill(faq))
265
+			{
266
+				faq->lastSeqLen=seqLen;
267
+				return seqLen;
268
+			}
269
+			if(faq_checkSkipNewLine(faq))
270
+			{
271
+				faq->lastSeqLen=seqLen;
272
+				return seqLen;
273
+			}
274
+			if(dasProcEmpty(das))
275
+			{
276
+				++faq->nProcFull;
277
+				faq->lastSeqLen=seqLen;
278
+				return seqLen;
279
+			}
280
+		}
281
+		// Add string termination
282
+		if(!dasProcEmpty(das))
283
+		{
284
+			das->npPos=(das->p_iter-das->pos);
285
+			*das->p_iter=faq_char_eos;
286
+		}
287
+	}
288
+	faq->lastSeqLen=seqLen;
289
+	return seqLen;
290
+}
291
+
292
+int faq_procPhred(fqTraverse *faq)
293
+{
294
+	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
295
+	// Copies one line of Phred characters
296
+	// from:	input  fastq stream			(rfc array)
297
+	// to  :	output nucleotide stream	(pos array)
298
+	//
299
+	// Copy procedure ends when:
300
+	//	A) Sequence header ('@') is found.
301
+	//	B) rfc  array runs empty.
302
+	//	C) pos array is filled up.
303
+	//
304
+	// Tries to recover when end of rfc-array is reached.
305
+	// Copying always starts at begin of pos array.
306
+	//
307
+	// Returns fat_ok (=0) when *NO* output is created.
308
+	//
309
+	// Allows 'if(fat_procNuc) => process output'
310
+	// testing.
311
+	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
312
+
313
+	daStream *das=faq->das;
314
+	unsigned seqLen=0;
315
+
316
+	// Empty array must be prevented here
317
+	if(faqCheckFill(faq))
318
+	{
319
+		faq->lastSeqLen=seqLen;
320
+		return seqLen;
321
+	}
322
+
323
+	//printf("[faq_Phred] start: '%s'\n",das->r_iter);
324
+	if(*faq->das->r_iter==faq_char_qlDelim)
325
+	{
326
+		// Skip header line
327
+		if(faq_skipLine(faq))
328
+		{
329
+			faq->lastSeqLen=seqLen;
330
+			return seqLen;
331
+		}
332
+		if(faqCheckFill(faq))
333
+			return 0;
334
+
335
+		// Goto begin of pos array.
336
+		das->p_iter=das->pos;
337
+		// Copy until new seq header is found
338
+
339
+		//printf("[faq_Phred] while: '%s'\n",das->r_iter);
340
+		while(!(*das->r_iter==faq_char_sqDelim))
341
+		{
342
+			*das->p_iter=*das->r_iter;
343
+			++das->r_iter;
344
+			++das->p_iter;
345
+			++seqLen;
346
+
347
+			//printf("[faq Phred]  while: '%s'\tr_iter:%li\n",das->r_iter,das->r_end-das->r_iter);
348
+			// Two recoverable conditions:
349
+			if(faqCheckFill(faq))
350
+			{
351
+				//printf("[faq_Phred] checkFill: %u\n",faq->nFill);
352
+				if(seqLen!=faq->lastSeqLen)
353
+					++faq->nUneqLeqLen;
354
+				return seqLen;
355
+			}
356
+			if(faq_checkSkipNewLine(faq))
357
+			{
358
+				if(seqLen!=faq->lastSeqLen)
359
+					++faq->nUneqLeqLen;
360
+				return seqLen;
361
+			}
362
+			if(dasProcEmpty(das))
363
+			{
364
+				++faq->nProcFull;
365
+				if(seqLen!=faq->lastSeqLen)
366
+					++faq->nUneqLeqLen;
367
+				return seqLen;
368
+			}
369
+		}
370
+		// Add string termination
371
+		if(!dasProcEmpty(das))
372
+		{
373
+			das->npPos=(das->p_iter-das->pos);
374
+			*das->p_iter=faq_char_eos;
375
+		}
376
+
377
+		if(seqLen!=faq->lastSeqLen)
378
+			++faq->nUneqLeqLen;
379
+	}
380
+	return seqLen;
381
+}
382
+
383
+
384
+#endif /* FQ_TRAVERSE_H_ */