e46f15e6 |
/*
* fq_traverse.h
*
* Created on: 12.10.2013
* Author: wolfgang
*/
#ifndef FQ_TRAVERSE_H_
#define FQ_TRAVERSE_H_
#include "stat_defs.h"
#include "dna_astream.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
//
// Fastq traverse:
// Contains dna_astream which is traversed.
// The <seqname> following '+' is optional,
// but if it appears right after '+', it should be identical to the <seqname> following '@'.
// The length of <seq> is identical the length of <qual>. Each character in <qual> represents the phred quality of the corresponding nucleotide in <seq>.
//
///////////////////////////////////////////////////////////////////////////////////////////////////
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
//
// Definition of static constants
//
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
static const unsigned long faq_array_size =0xFFFFFF; // char array capacity
static const char faq_char_sqDelim ='@'; // Fastq sequence delimiter
static const char faq_char_qlDelim ='+'; // Fastq quality delimiter
static const char faq_char_lf ='\n'; // Line feed
static const char faq_char_eos ='\0'; // End of string
static const int faq_ok =0;
static const int faq_empty =1;
static const int faq_newSeq =4;
typedef struct fastq_traverse
{
daStream *das;
int state;
unsigned nSeq; // Number of sequences
unsigned nFill; // Number of fill operations.
unsigned nProcFull; // Number of times where proc array was filled up.
unsigned minSeqLen;
unsigned maxSeqLen;
unsigned lastSeqLen;
unsigned nUneqLeqLen;
} fqTraverse;
int faqEmpty(fqTraverse* faq) { return dasEmpty(faq->das); }
int faqIsOpen(fqTraverse* faq) { return dasIsOpen(faq->das); }
int faqIsEof(fqTraverse* faq) { return dasIsEof(faq->das); }
void faq_destroy(fqTraverse *faq)
{
if(faq)
{
das_destroy(faq->das);
free(faq);
}
}
fqTraverse * faq_init(const char* filename, unsigned mode)
{
fqTraverse * faq=(fqTraverse*)calloc(sizeof(fqTraverse),1);
if(!faq)
return 0;
faq->das=das_init(filename,mode,faq_array_size);
if(!faq->das)
{
//printf("[faq_init] das_init returned 0!\n");
free(faq);
return 0;
}
// initialize with large value
--faq->minSeqLen;
// Returns memory initialized object.
// File may be closed.
// File may be empty.
// Stream is not yet initialized.
return faq;
}
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Check-Fill: Checks for empty rfc (frs==0) and eventually fills das.
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
int faqCheckFill(fqTraverse *faq)
{
//printf("[faqCheckFill].\n");
if(dasEmpty(faq->das))
{
//printf("[faqCheckFill] ->das_fill\n");
if(das_fill(faq->das))
{
++(faq->nFill);
faq->state|=faq_empty;
return faq->state;
}
faq->state&=(~faq_empty);
}
return faq_ok;
}
int faqCheckCapFill(fqTraverse *faq,int capacity)
{
daStream *das=faq->das;
if(das->r_end-das->r_iter<capacity)
{
if(das_fill(das))
{
++(faq->nFill);
faq->state|=faq_empty;
return faq->state;
}
faq->state&=(~faq_empty);
}
return faq_ok;
}
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Check section (Must be placed beforehand skip-routines)
// faq_checkNewLine
// faq_checkSeqHeader
// faq_checkComment
// faq_checkN
// faq_check_nuc
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Skip section (Do not reorder...)
// faq_checkSkipNewLine
// faq_skipLine
// faq_skipComment
// faq_skipN
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
static inline int faq_checkSkipNewLine(fqTraverse *faq)
{
// Skips LF ('\n') character and
// eventually tries to fill array
//printf("[CheckNewLine].\n");
if(*faq->das->r_iter==faq_char_lf)
{
if(faqCheckCapFill(faq,2))
return faq->state;
// Eat newline
++faq->das->r_iter;
return faq_ok;
}
return faq->state;
}
static inline int faq_skipLastNewLine(fqTraverse *faq)
{
daStream *das=faq->das;
if((*faq->das->r_iter==faq_char_lf) & (das->dnaf->state==dfs_file_closed))
{
++das->r_iter;
return faq_ok;
}
return faq_empty;
}
static inline int faq_skipLine(fqTraverse *faq)
{
// Traverses until first position of next line
// and eventually tries to refill array
daStream *das=faq->das;
while(*das->r_iter!=faq_char_lf)
{
++das->r_iter;
if(faqCheckFill(faq))
return faq->state;
}
++das->r_iter;
if(faqCheckFill(faq))
return faq->state;
return faq_ok;
}
int faq_procNuc(fqTraverse *faq)
{
// + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Copies one line of continuous Nucleotide sequence
// from: input fastq stream (rfc array)
// to : output nucleotide stream (pos array)
//
// Copy procedure ends when:
// A) A non nucleotide is found.
// B) Quality header ('+') is found.
// C) rfc array runs empty.
// D) pos array is filled up.
//
// Tries to recover when end of rfc-array is reached.
// Copying always starts at begin of nuc array
//
// Returns fat_ok (=0) when *NO* output is created.
//
// Allows 'if(fat_procNuc) => process output'
// testing.
// + + + + + + + + + + + + + + + + + + + + + + + + + + //
daStream *das=faq->das;
unsigned seqLen=0;
//printf("[faq pNuc] start: '%s'\n",das->r_iter);
// Empty array must be prevented here
if(faqCheckFill(faq))
{
faq->lastSeqLen=seqLen;
return seqLen;
}
if(*faq->das->r_iter==faq_char_sqDelim)
{
// Skip header line
if(faq_skipLine(faq))
{
faq->lastSeqLen=seqLen;
return seqLen;
}
// Goto begin of pos array.
das->p_iter=das->pos;
//printf("[faq pNuc] while: '%s'\n",das->r_iter);
++faq->nSeq;
while(!((ACGT[(unsigned)*das->r_iter]==zval) || *das->r_iter==faq_char_qlDelim ) )
{
*das->p_iter=*das->r_iter;
++das->r_iter;
++das->p_iter;
++seqLen;
//printf("[faq pNuc] fill: '%s'\n",das->r_iter);
// Two recoverable conditions:
if(faqCheckFill(faq))
{
faq->lastSeqLen=seqLen;
return seqLen;
}
if(faq_checkSkipNewLine(faq))
{
faq->lastSeqLen=seqLen;
return seqLen;
}
if(dasProcEmpty(das))
{
++faq->nProcFull;
faq->lastSeqLen=seqLen;
return seqLen;
}
}
// Add string termination
if(!dasProcEmpty(das))
{
das->npPos=(das->p_iter-das->pos);
*das->p_iter=faq_char_eos;
}
}
faq->lastSeqLen=seqLen;
return seqLen;
}
int faq_procPhred(fqTraverse *faq)
{
// + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Copies one line of Phred characters
// from: input fastq stream (rfc array)
// to : output nucleotide stream (pos array)
//
// Copy procedure ends when:
// A) Sequence header ('@') is found.
// B) rfc array runs empty.
// C) pos array is filled up.
//
// Tries to recover when end of rfc-array is reached.
// Copying always starts at begin of pos array.
//
// Returns fat_ok (=0) when *NO* output is created.
//
// Allows 'if(fat_procNuc) => process output'
// testing.
// + + + + + + + + + + + + + + + + + + + + + + + + + + //
daStream *das=faq->das;
unsigned seqLen=0;
// Empty array must be prevented here
if(faqCheckFill(faq))
{
faq->lastSeqLen=seqLen;
return seqLen;
}
//printf("[faq_Phred] start: '%s'\n",das->r_iter);
if(*faq->das->r_iter==faq_char_qlDelim)
{
// Skip header line
if(faq_skipLine(faq))
{
faq->lastSeqLen=seqLen;
return seqLen;
}
if(faqCheckFill(faq))
return 0;
// Goto begin of pos array.
das->p_iter=das->pos;
// Copy until new seq header is found
//printf("[faq_Phred] while: '%s'\n",das->r_iter);
while(!(*das->r_iter==faq_char_sqDelim))
{
*das->p_iter=*das->r_iter;
++das->r_iter;
++das->p_iter;
++seqLen;
//printf("[faq Phred] while: '%s'\tr_iter:%li\n",das->r_iter,das->r_end-das->r_iter);
// Two recoverable conditions:
if(faqCheckFill(faq))
{
//printf("[faq_Phred] checkFill: %u\n",faq->nFill);
if(seqLen!=faq->lastSeqLen)
++faq->nUneqLeqLen;
return seqLen;
}
if(faq_checkSkipNewLine(faq))
{
if(seqLen!=faq->lastSeqLen)
++faq->nUneqLeqLen;
return seqLen;
}
if(dasProcEmpty(das))
{
++faq->nProcFull;
if(seqLen!=faq->lastSeqLen)
++faq->nUneqLeqLen;
return seqLen;
}
}
// Add string termination
if(!dasProcEmpty(das))
{
das->npPos=(das->p_iter-das->pos);
*das->p_iter=faq_char_eos;
}
if(seqLen!=faq->lastSeqLen)
++faq->nUneqLeqLen;
}
return seqLen;
}
#endif /* FQ_TRAVERSE_H_ */
|