/* * fq_traverse.h * * Created on: 12.10.2013 * Author: wolfgang */ #ifndef FQ_TRAVERSE_H_ #define FQ_TRAVERSE_H_ #include "stat_defs.h" #include "dna_astream.h" /////////////////////////////////////////////////////////////////////////////////////////////////// // // Fastq traverse: // Contains dna_astream which is traversed. // The following '+' is optional, // but if it appears right after '+', it should be identical to the following '@'. // The length of is identical the length of . Each character in represents the phred quality of the corresponding nucleotide in . // /////////////////////////////////////////////////////////////////////////////////////////////////// // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + // // // Definition of static constants // // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + // static const unsigned long faq_array_size =0xFFFFFF; // char array capacity static const char faq_char_sqDelim ='@'; // Fastq sequence delimiter static const char faq_char_qlDelim ='+'; // Fastq quality delimiter static const char faq_char_lf ='\n'; // Line feed static const char faq_char_eos ='\0'; // End of string static const int faq_ok =0; static const int faq_empty =1; static const int faq_newSeq =4; typedef struct fastq_traverse { daStream *das; int state; unsigned nSeq; // Number of sequences unsigned nFill; // Number of fill operations. unsigned nProcFull; // Number of times where proc array was filled up. unsigned minSeqLen; unsigned maxSeqLen; unsigned lastSeqLen; unsigned nUneqLeqLen; } fqTraverse; int faqEmpty(fqTraverse* faq) { return dasEmpty(faq->das); } int faqIsOpen(fqTraverse* faq) { return dasIsOpen(faq->das); } int faqIsEof(fqTraverse* faq) { return dasIsEof(faq->das); } void faq_destroy(fqTraverse *faq) { if(faq) { das_destroy(faq->das); free(faq); } } fqTraverse * faq_init(const char* filename, unsigned mode) { fqTraverse * faq=(fqTraverse*)calloc(sizeof(fqTraverse),1); if(!faq) return 0; faq->das=das_init(filename,mode,faq_array_size); if(!faq->das) { //printf("[faq_init] das_init returned 0!\n"); free(faq); return 0; } // initialize with large value --faq->minSeqLen; // Returns memory initialized object. // File may be closed. // File may be empty. // Stream is not yet initialized. return faq; } // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + // // Check-Fill: Checks for empty rfc (frs==0) and eventually fills das. // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + // int faqCheckFill(fqTraverse *faq) { //printf("[faqCheckFill].\n"); if(dasEmpty(faq->das)) { //printf("[faqCheckFill] ->das_fill\n"); if(das_fill(faq->das)) { ++(faq->nFill); faq->state|=faq_empty; return faq->state; } faq->state&=(~faq_empty); } return faq_ok; } int faqCheckCapFill(fqTraverse *faq,int capacity) { daStream *das=faq->das; if(das->r_end-das->r_iternFill); faq->state|=faq_empty; return faq->state; } faq->state&=(~faq_empty); } return faq_ok; } // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + // // Check section (Must be placed beforehand skip-routines) // faq_checkNewLine // faq_checkSeqHeader // faq_checkComment // faq_checkN // faq_check_nuc // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + // // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + // // Skip section (Do not reorder...) // faq_checkSkipNewLine // faq_skipLine // faq_skipComment // faq_skipN // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + // static inline int faq_checkSkipNewLine(fqTraverse *faq) { // Skips LF ('\n') character and // eventually tries to fill array //printf("[CheckNewLine].\n"); if(*faq->das->r_iter==faq_char_lf) { if(faqCheckCapFill(faq,2)) return faq->state; // Eat newline ++faq->das->r_iter; return faq_ok; } return faq->state; } static inline int faq_skipLastNewLine(fqTraverse *faq) { daStream *das=faq->das; if((*faq->das->r_iter==faq_char_lf) & (das->dnaf->state==dfs_file_closed)) { ++das->r_iter; return faq_ok; } return faq_empty; } static inline int faq_skipLine(fqTraverse *faq) { // Traverses until first position of next line // and eventually tries to refill array daStream *das=faq->das; while(*das->r_iter!=faq_char_lf) { ++das->r_iter; if(faqCheckFill(faq)) return faq->state; } ++das->r_iter; if(faqCheckFill(faq)) return faq->state; return faq_ok; } int faq_procNuc(fqTraverse *faq) { // + + + + + + + + + + + + + + + + + + + + + + + + + + // // Copies one line of continuous Nucleotide sequence // from: input fastq stream (rfc array) // to : output nucleotide stream (pos array) // // Copy procedure ends when: // A) A non nucleotide is found. // B) Quality header ('+') is found. // C) rfc array runs empty. // D) pos array is filled up. // // Tries to recover when end of rfc-array is reached. // Copying always starts at begin of nuc array // // Returns fat_ok (=0) when *NO* output is created. // // Allows 'if(fat_procNuc) => process output' // testing. // + + + + + + + + + + + + + + + + + + + + + + + + + + // daStream *das=faq->das; unsigned seqLen=0; //printf("[faq pNuc] start: '%s'\n",das->r_iter); // Empty array must be prevented here if(faqCheckFill(faq)) { faq->lastSeqLen=seqLen; return seqLen; } if(*faq->das->r_iter==faq_char_sqDelim) { // Skip header line if(faq_skipLine(faq)) { faq->lastSeqLen=seqLen; return seqLen; } // Goto begin of pos array. das->p_iter=das->pos; //printf("[faq pNuc] while: '%s'\n",das->r_iter); ++faq->nSeq; while(!((ACGT[(unsigned)*das->r_iter]==zval) || *das->r_iter==faq_char_qlDelim ) ) { *das->p_iter=*das->r_iter; ++das->r_iter; ++das->p_iter; ++seqLen; //printf("[faq pNuc] fill: '%s'\n",das->r_iter); // Two recoverable conditions: if(faqCheckFill(faq)) { faq->lastSeqLen=seqLen; return seqLen; } if(faq_checkSkipNewLine(faq)) { faq->lastSeqLen=seqLen; return seqLen; } if(dasProcEmpty(das)) { ++faq->nProcFull; faq->lastSeqLen=seqLen; return seqLen; } } // Add string termination if(!dasProcEmpty(das)) { das->npPos=(das->p_iter-das->pos); *das->p_iter=faq_char_eos; } } faq->lastSeqLen=seqLen; return seqLen; } int faq_procPhred(fqTraverse *faq) { // + + + + + + + + + + + + + + + + + + + + + + + + + + // // Copies one line of Phred characters // from: input fastq stream (rfc array) // to : output nucleotide stream (pos array) // // Copy procedure ends when: // A) Sequence header ('@') is found. // B) rfc array runs empty. // C) pos array is filled up. // // Tries to recover when end of rfc-array is reached. // Copying always starts at begin of pos array. // // Returns fat_ok (=0) when *NO* output is created. // // Allows 'if(fat_procNuc) => process output' // testing. // + + + + + + + + + + + + + + + + + + + + + + + + + + // daStream *das=faq->das; unsigned seqLen=0; // Empty array must be prevented here if(faqCheckFill(faq)) { faq->lastSeqLen=seqLen; return seqLen; } //printf("[faq_Phred] start: '%s'\n",das->r_iter); if(*faq->das->r_iter==faq_char_qlDelim) { // Skip header line if(faq_skipLine(faq)) { faq->lastSeqLen=seqLen; return seqLen; } if(faqCheckFill(faq)) return 0; // Goto begin of pos array. das->p_iter=das->pos; // Copy until new seq header is found //printf("[faq_Phred] while: '%s'\n",das->r_iter); while(!(*das->r_iter==faq_char_sqDelim)) { *das->p_iter=*das->r_iter; ++das->r_iter; ++das->p_iter; ++seqLen; //printf("[faq Phred] while: '%s'\tr_iter:%li\n",das->r_iter,das->r_end-das->r_iter); // Two recoverable conditions: if(faqCheckFill(faq)) { //printf("[faq_Phred] checkFill: %u\n",faq->nFill); if(seqLen!=faq->lastSeqLen) ++faq->nUneqLeqLen; return seqLen; } if(faq_checkSkipNewLine(faq)) { if(seqLen!=faq->lastSeqLen) ++faq->nUneqLeqLen; return seqLen; } if(dasProcEmpty(das)) { ++faq->nProcFull; if(seqLen!=faq->lastSeqLen) ++faq->nUneqLeqLen; return seqLen; } } // Add string termination if(!dasProcEmpty(das)) { das->npPos=(das->p_iter-das->pos); *das->p_iter=faq_char_eos; } if(seqLen!=faq->lastSeqLen) ++faq->nUneqLeqLen; } return seqLen; } #endif /* FQ_TRAVERSE_H_ */