src/fa_traverse.h
e46f15e6
 /*
  * fa_traverse.h
  *
  *  Created on: 08.10.2013
  *      Author: kaisers
  */
 
 #ifndef FA_TRAVERSE_H_
 #define FA_TRAVERSE_H_
 
 #include "stat_defs.h"
 #include "dna_astream.h"
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // 	Fasta traverse:
 //				Contains dna_astream which is traversed.
 //				Identified Nucleotide sequences of length >= k will be copied to
 //				das.pos array.
 //
 //				Defines subroutines for skipping header-lines, N-array and comment lines.
 //
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 //
 // 	DNA file stream:
 //				Struct layer for reading text into char * buffer
 //				from either uncompressed or compressed files.
 //
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 //
 // Definition of static constants
 //
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 static const unsigned  		fat_array_size  =0x1000;	// char array capacity
 
 static const char			fat_char_newSeq	='>';		// Fasta sequence delimiter
 static const char			fat_char_cmt	=';'; 		// Fasta comment  delimiter
 static const char			fat_char_lf		='\n';		// Line feed
 static const char			fat_char_eos	='\0';		// End of string
 
 static const int			fat_ok			=0;
 static const int			fat_empty		=1;
 static const int			fat_loc			=2;
 static const int			fat_newSeq		=4;
 static const int			fat_nucReady	=8;
 
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 //
 // 	Struct definition, basic accessors; constructor and destructors:
 //		struct faTraverse
 //		fatEmpty, fatNewSeq, fatNucReady
 //		fat_destroy, fat_init
 //
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 typedef struct fasta_traverse
 {
 	daStream *das;
 
 	int state;
 	unsigned k;
 
 	unsigned nN;		// Number of N's so far seen
 	unsigned nSeq;		// Number of sequences
 	unsigned nCom;		// Number of comment lines
 	unsigned nFill;		// Number of fill operations.
 
 } faTraverse;
 
 int fatEmpty (faTraverse* fat)  { return dasEmpty (fat->das); }
 int fatIsOpen(faTraverse* fat)	{ return dasIsOpen(fat->das); }
 int fatIsEof (faTraverse* fat)	{ return dasIsEof (fat->das); }
 int fatNewSeq(faTraverse *fat)  { return fat->state & fat_newSeq; }
 
 // Only purposed for use in top-level:
 // resets nucReady when called.
 int fatNucReady(faTraverse *fat)
 {
 	if(fat->state & fat_nucReady)
 	{
 		fat->state &= (~fat_nucReady);
 		return 1;
 	}
 	return 0;
 }
 
 
 void fat_destroy(faTraverse *fat)
 {
 	if(fat)
 	{
 		das_destroy(fat->das);
 		free(fat);
 	}
 }
 
 faTraverse * fat_init(const char* filename, int k)
 {
 	faTraverse * fat = (faTraverse*) calloc(sizeof(faTraverse), 1);
 	if(!fat)
 		return 0;
 
 	fat->das = das_init(filename, fat_array_size);
 	if(!fat->das)
 	{
 		//printf("[fat_init] das_init returned 0!\n");
 		free(fat);
 		return 0;
 	}
 	fat->k = (unsigned) k;
 
 	// Returns memory initialized object.
 	// File may be closed.
 	// File may be empty.
 	// Stream is not yet initialized.
 	return fat;
 }
 
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 // Check-Fill: Checks for empty rfc (frs==0) and eventually fills das.
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 int fatCheckFill(faTraverse *fat)
 {
 	//if(fat->das->frs==0)
 	if(dasEmpty(fat->das) )
 	{
 		//printf("[fatCheckFill] ...\n");
 		if(das_fill(fat->das))
 		{
 			++(fat->nFill);
 			fat->state |= fat_empty;
 			//printf("[fatCheckFill] return empty.\n");
 			return fat->state;
 		}
 		fat->state &= (~fat_empty);
 	}
 	return fat_ok;
 }
 
 
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 // Check section (Must be placed beforehand skip-routines)
 //		fat_checkNewLine
 //		fat_checkNewSeq
 //		fat_checkComment
 //		fat_checkN
 //		fat_check_nuc
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 static R_INLINE int fat_checkNewLine(faTraverse *fat)
 {
 	if(LFCR[(unsigned)*fat->das->r_iter] == lfcv)
 	{
 		fat->state |= fat_loc;
 		return fat->state;
 	}
 	return fat_ok;
 }
 
 static R_INLINE int fat_checkNewSeq(faTraverse *fat)
 {
 	if(*fat->das->r_iter == fat_char_newSeq)
 	{
 		fat->state |= fat_newSeq;
 		return fat->state;
 	}
 	return fat_ok;
 }
 
 static R_INLINE int fat_checkComment(faTraverse *fat)
 {
 	if(*fat->das->r_iter==fat_char_cmt)
 	{
 		//printf("[checkCmt] Comment found!\n");
 		fat->state|=fat_loc;
 		return fat->state;
 	}
 	return fat_ok;
 }
 
 static R_INLINE int fat_checkN(faTraverse *fat)
 {
 	if(ACGTN[(unsigned)*fat->das->r_iter] == nval)
 	{
 
 		fat->state |= fat_loc;
 		return fat->state;
 	}
 	return fat_ok;
 }
 
 static R_INLINE int fat_check_nuc(faTraverse *fat)
 {
 	if(ACGT[ ((unsigned) *fat->das->r_iter) ] == zval)
 	{
 		fat->state |= fat_loc;
 		return fat->state;
 	}
 	return fat_ok;
 }
 
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 // Skip section (Do not reorder...)
 //			fat_skipNewLine
 //			fat_skipLine
 //			fat_skipComment
 //			fat_skipN
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 static R_INLINE int fat_skipNewLine(faTraverse *fat)
 {
 	// Skips LF ('\n', ASCII 10)  and CR (ASCII 13) character and
 	// eventually tries to fill array
 	if(LFCR[ (unsigned)*fat->das->r_iter ] == lfcv)
 	{
 		if(fatCheckFill(fat))
 			return fat->state;
 
 		// Eat newline
 		++fat->das->r_iter;
 
 		// Unset flag
 		fat->state &= (~fat_loc);
 
 		return fat_ok;
 	}
 	return fat->state;
 }
 
 static R_INLINE int fat_skipLine(faTraverse *fat)
 {
 	// Traverses until first position of next line
 	// and eventually tries to refill array
 
 	daStream *das = fat->das;
 	while(LFCR[ (unsigned)*fat->das->r_iter ] == zval)
 	{
 		++das->r_iter;
 		if(fatCheckFill(fat))
 			return fat->state;
 	}
 	//printf("[fat_skipLine] post iter: '%s'\n",das->r_iter);
 	return fat_skipNewLine(fat);
 }
 
 static R_INLINE int fat_skipComment(faTraverse *fat)
 {
 	if(*fat->das->r_iter == fat_char_cmt)
 	{
 		++fat->nCom;
 		if(fat_skipLine(fat))
 		{
 			fat->state &= (~fat_loc);
 			return fat_ok;
 		}
 	}
 	return fat->state;
 }
 
 static R_INLINE int fat_skipN(faTraverse *fat)
 {
 	// Proceeds in array as long as 'N' nucleotide is present.
 	// Tries to skip Newline (LF) characters
 
 	// Eventually checks for new sequence and eventually
 	// sets flag (No initializing on new sequence).
 
 	// Eventually tries to refill array.
 
 	if(fatEmpty(fat))
 		return fat->state;
 
 	daStream *das=fat->das;
 	while(ACGTN[ (unsigned)*das->r_iter ] == nval)
 	{
 		++das->r_iter;
 		++fat->nN;
 
 		if(fatCheckFill(fat))
 			return fat->state;
 
 		//printf("[skip  N] Post fill '%s'\n",das->r_iter);
 
 		// Function does not return
 		// with r_iter on newline
 		if(fat_checkNewLine(fat))
 			if(fat_skipNewLine(fat))
 				return fat->state;
 	}
 
 	// Unset N-flag
 	fat->state &= (~fat_loc);
 	return fat_ok;
 }
 
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 //
 // Routines:
 // 			fat_getSeqName
 //			fat_findNextKmer
 //			fat_initSeq
 //
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 char * fat_getSeqName(faTraverse *fat)
 {
 	// From actual sequence delimiter:
 	// Copies text until empty space (or newline)
 	// into new char array.
 
 	// Char array will be returned und must be free'd from outside.
 
 	// Eventually tries to refill array
 	daStream *das = fat->das;
 	unsigned nchar = 0;
 
 	//printf("[fat_getSeqName] '%s'\n",fat->das->r_iter);
 
 	if(*das->r_iter == fat_char_newSeq)
 	{
 		char *seq_name = das->r_iter;
 		while( !( (*seq_name==' ') || (*seq_name==fat_char_lf) ) )
 		{
 			++seq_name;
 			++nchar;
 
 			// End of array reached
 			//printf("[fat_getSeqName]  seq_name: '%s'\n",seq_name);
 			if(seq_name == das->r_end)
 			{
 				if(das_fill(das))
 				{
 					//printf("[fat_getSeqName] empty seq name.\n");
 					return 0;
 				}
 				// Restart search
 				seq_name = fat->das->r_iter;
 				nchar = 0;
 			}
 		}
 
 		if(nchar == 0)
 		{
 			//printf("[fat_getSeqName] empty seq name.\n");
 			return 0;
 		}
 
 		// Skip first char='>'
 		--nchar;
 		seq_name=fat->das->r_iter;
 		++seq_name;
 
 		//printf("[fat_getSeqName] copy: '%s'\tnchar=%u\n",das->r_iter,nchar);
 
 		// Must be free'd from outside
 		char *ret = calloc(sizeof(char), nchar + 1);
 		unsigned i;
 
 		for(i=0; i<nchar; ++i)
 		{
 			ret[i] = *seq_name;
 			++seq_name;
 		}
 		//printf("[seq name] success.\t'%s'\n",fat->das->r_iter);
 		return ret;
 	}
 	return 0;
 }
 
 
 // ToDo: Change into loop entry point from top_level
 int fat_skipSeqHeader(faTraverse *fat)
 {
 	if(fat->state & fat_newSeq)
 	{
 		++fat->nSeq;
 		fat->state &= (~fat_newSeq);
 		// Skip header line
 		if(fat_skipLine(fat))
 			return fat->state;
 	}
 	return fat_ok;
 }
 
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 //
 // Routines:
 // 			fat_returnFromTranfer
 //			fat_procNuc
 //
 // + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 static R_INLINE int fat_retProcNuc(faTraverse *fat)
 {
 	// Joint terminating routine for
 	// procNuc function.
 	daStream *das=fat->das;
 	//printf("[ret proc] plen: %lu\t'%s'\n",(das->p_iter-das->pos),das->pos);
 
 	if( (das->p_iter-das->pos) >= (fat->k) )
 	{
 		// A)	At least k nucleotides copied
 		// 		-> Success:
 		//		Add string terminator.
 		if((das->p_iter-das->pos) > 0)
 		{
 			das->npPos = (int)(das->p_iter - das->pos);
 			*das->p_iter = fat_char_eos;
 		}
 
 		//		Set nuc_ready state.
 		fat->state |= fat_nucReady;
 		fat->state &= (~fat_loc);
 
 		//printf("[proc nuc] NUC READY: '%s'\n",das->pos);
 		return fat->state;
 	}
 	else
 	{
 		// B)	Not enough nucleotides
 		//		four counting
 		//		-> Failure:
 		//		Reset pos array.
 		das->p_iter = das->pos;
 		*das->p_iter = '\0';
 		//		Set pos empty state
 		fat->state &= (~fat_nucReady);
 		fat->state |= fat_loc;
 		//printf("[proc nuc] Nuc_empty\n");
 		return fat_ok;
 	}
 }
 
 
 int fat_procNuc(faTraverse *fat)
 {
 	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
 	// Copies continuous Nucleotide sequence from
 	// input  fasta stream		(rfc array) to
 	// output nucleotide stream	(pos array).
 	//
 	// Tries to recover when newline character is found or
 	// end of rfc-array is reached.
 	//
 	// Copying always starts at begin of nuc array
 	//
 	// Returns fat_ok (=0) when *NO* output is created.
 	//
 	// Allows 'if(fat_procNuc) => process output'
 	// testing.
 	// + + + + + + + + + + + + + + + + + + + + + + + + + + //
 
 	daStream *das = fat->das;
 	fat->state &= (~fat_nucReady);
 
 	//printf("[proc Nuc] start: '%s'\n",das->r_iter);
 	if((ACGT[ (unsigned)*das->r_iter ] != zval))
 	{
 		// Goto begin of proc array.
 		das->p_iter = das->pos;
 
 		//unsigned i=0,j;
 		while(!dasEmpty(das))
 		{
 			// Copy nucleotides
 			while( !( (ACGT[ (unsigned)*das->r_iter ] == zval) || dasEmpty(das) || dasProcEmpty(das) ) )
 			{
 				*das->p_iter = *das->r_iter;
 				++das->r_iter;
 				++das->p_iter;
 			}
 
 			// Two recoverable conditions:
 			if(fat_checkNewLine(fat))
 			{
 				//printf("[proc Nuc] New Line.\n");
 				if(fat_skipNewLine(fat))
 					return fat_retProcNuc(fat);
 			}
 			else if(fatEmpty(fat)) // Can't be checked with fatCheckFill
 			{
 				//printf("[proc Nuc] fatEmpty.\n");
 				if(das_fill(das))
 					return fat_retProcNuc(fat);
 			}
 			else // Anything else -> break
 			{
 				//printf("[proc Nuc] else ...\n");
 				return fat_retProcNuc(fat);
 			}
 			//printf("[proc Nuc] End while pos: '%s'\n",das->pos);
 		}
 
 		// Pos array is full
 		fat->state |= fat_nucReady;
 		fat->state &= (~fat_loc);
 		//printf("[proc Nuc] return fat_ok\n");
 		return fat_retProcNuc(fat);
 		//return fat->state;
 	}
 	else //if(fat_checkNewLine(fat))
 	{
 		//printf("[proc Nuc] AGCT->zval\n");
 		if(fat_checkNewLine(fat))
 			fat_skipNewLine(fat);
 
 			/*
 			if(fat_skipNewLine(fat))
 			{
 				Rprintf("[proc Nuc] Newline skipped: '%s'\n", das->r_iter);
 			}
 			else
 				Rprintf("[proc Nuc] Found ASCII %u\n", (unsigned char)(*fat->das->r_iter));
 		*/
 	}
 
 	fat->state |= fat_loc;
 	return fat->state;
 }
 
 
 
 int fat_findKarray(faTraverse *fat)
 {
 	//fat->state&=(~fat_nucReady);
 	//printf("[fat_findKarray] Start: '%s'\t%u\tstrlen: %lu\n",fat->das->r_iter,fatEmpty(fat),strlen(fat->das->r_iter));
 	if(!fatEmpty(fat))
 	{
 		if(fat_checkN(fat))
 		{
 			//printf("[fat_findKarray] skipN.\n");
 			fat_skipN(fat);
 		}
 		if(fat_checkNewSeq(fat))
 		{
 			//printf("[fat_findKarray] New seq found.\n");
 			return fat->state;
 		}
 		if(fat_checkComment(fat))
 		{
 			fat_skipComment(fat);
 			//printf("[fat_findKarray] post cmt: '%s'\n",fat->das->r_iter);
 		}
 		if(fat_procNuc(fat))
 		{
 			//printf("[fat_findKarray] procNuc return state: %i\n",fat->state);
 			return fat->state;
 		}
 		/*
 		if(fat_checkNewLine(fat))
 		{
 			printf("[fat_findKarray] CheckNewLine!\n");
 		}
 		*/
 	}
 	//printf("[find Kar] Something else: '%s'\n",fat->das->r_iter);
 	return fat_ok;
 }
 
 
 
 #endif /* FA_TRAVERSE_H_ */