e46f15e6 |
/*
* fa_traverse.h
*
* Created on: 08.10.2013
* Author: kaisers
*/
#ifndef FA_TRAVERSE_H_
#define FA_TRAVERSE_H_
#include "stat_defs.h"
#include "dna_astream.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
//
// Fasta traverse:
// Contains dna_astream which is traversed.
// Identified Nucleotide sequences of length >= k will be copied to
// das.pos array.
//
// Defines subroutines for skipping header-lines, N-array and comment lines.
//
///////////////////////////////////////////////////////////////////////////////////////////////////
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
//
// DNA file stream:
// Struct layer for reading text into char * buffer
// from either uncompressed or compressed files.
//
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
//
// Definition of static constants
//
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
static const unsigned fat_array_size =0x1000; // char array capacity
static const char fat_char_newSeq ='>'; // Fasta sequence delimiter
static const char fat_char_cmt =';'; // Fasta comment delimiter
static const char fat_char_lf ='\n'; // Line feed
static const char fat_char_eos ='\0'; // End of string
static const int fat_ok =0;
static const int fat_empty =1;
static const int fat_loc =2;
static const int fat_newSeq =4;
static const int fat_nucReady =8;
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
//
// Struct definition, basic accessors; constructor and destructors:
// struct faTraverse
// fatEmpty, fatNewSeq, fatNucReady
// fat_destroy, fat_init
//
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
typedef struct fasta_traverse
{
daStream *das;
int state;
unsigned k;
unsigned nN; // Number of N's so far seen
unsigned nSeq; // Number of sequences
unsigned nCom; // Number of comment lines
unsigned nFill; // Number of fill operations.
} faTraverse;
int fatEmpty (faTraverse* fat) { return dasEmpty (fat->das); }
int fatIsOpen(faTraverse* fat) { return dasIsOpen(fat->das); }
int fatIsEof (faTraverse* fat) { return dasIsEof (fat->das); }
int fatNewSeq(faTraverse *fat) { return fat->state & fat_newSeq; }
// Only purposed for use in top-level:
// resets nucReady when called.
int fatNucReady(faTraverse *fat)
{
if(fat->state & fat_nucReady)
{
fat->state &= (~fat_nucReady);
return 1;
}
return 0;
}
void fat_destroy(faTraverse *fat)
{
if(fat)
{
das_destroy(fat->das);
free(fat);
}
}
faTraverse * fat_init(const char* filename, int k)
{
faTraverse * fat = (faTraverse*) calloc(sizeof(faTraverse), 1);
if(!fat)
return 0;
fat->das = das_init(filename, fat_array_size);
if(!fat->das)
{
//printf("[fat_init] das_init returned 0!\n");
free(fat);
return 0;
}
fat->k = (unsigned) k;
// Returns memory initialized object.
// File may be closed.
// File may be empty.
// Stream is not yet initialized.
return fat;
}
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Check-Fill: Checks for empty rfc (frs==0) and eventually fills das.
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
int fatCheckFill(faTraverse *fat)
{
//if(fat->das->frs==0)
if(dasEmpty(fat->das) )
{
//printf("[fatCheckFill] ...\n");
if(das_fill(fat->das))
{
++(fat->nFill);
fat->state |= fat_empty;
//printf("[fatCheckFill] return empty.\n");
return fat->state;
}
fat->state &= (~fat_empty);
}
return fat_ok;
}
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Check section (Must be placed beforehand skip-routines)
// fat_checkNewLine
// fat_checkNewSeq
// fat_checkComment
// fat_checkN
// fat_check_nuc
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
static R_INLINE int fat_checkNewLine(faTraverse *fat)
{
if(LFCR[(unsigned)*fat->das->r_iter] == lfcv)
{
fat->state |= fat_loc;
return fat->state;
}
return fat_ok;
}
static R_INLINE int fat_checkNewSeq(faTraverse *fat)
{
if(*fat->das->r_iter == fat_char_newSeq)
{
fat->state |= fat_newSeq;
return fat->state;
}
return fat_ok;
}
static R_INLINE int fat_checkComment(faTraverse *fat)
{
if(*fat->das->r_iter==fat_char_cmt)
{
//printf("[checkCmt] Comment found!\n");
fat->state|=fat_loc;
return fat->state;
}
return fat_ok;
}
static R_INLINE int fat_checkN(faTraverse *fat)
{
if(ACGTN[(unsigned)*fat->das->r_iter] == nval)
{
fat->state |= fat_loc;
return fat->state;
}
return fat_ok;
}
static R_INLINE int fat_check_nuc(faTraverse *fat)
{
if(ACGT[ ((unsigned) *fat->das->r_iter) ] == zval)
{
fat->state |= fat_loc;
return fat->state;
}
return fat_ok;
}
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Skip section (Do not reorder...)
// fat_skipNewLine
// fat_skipLine
// fat_skipComment
// fat_skipN
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
static R_INLINE int fat_skipNewLine(faTraverse *fat)
{
// Skips LF ('\n', ASCII 10) and CR (ASCII 13) character and
// eventually tries to fill array
if(LFCR[ (unsigned)*fat->das->r_iter ] == lfcv)
{
if(fatCheckFill(fat))
return fat->state;
// Eat newline
++fat->das->r_iter;
// Unset flag
fat->state &= (~fat_loc);
return fat_ok;
}
return fat->state;
}
static R_INLINE int fat_skipLine(faTraverse *fat)
{
// Traverses until first position of next line
// and eventually tries to refill array
daStream *das = fat->das;
while(LFCR[ (unsigned)*fat->das->r_iter ] == zval)
{
++das->r_iter;
if(fatCheckFill(fat))
return fat->state;
}
//printf("[fat_skipLine] post iter: '%s'\n",das->r_iter);
return fat_skipNewLine(fat);
}
static R_INLINE int fat_skipComment(faTraverse *fat)
{
if(*fat->das->r_iter == fat_char_cmt)
{
++fat->nCom;
if(fat_skipLine(fat))
{
fat->state &= (~fat_loc);
return fat_ok;
}
}
return fat->state;
}
static R_INLINE int fat_skipN(faTraverse *fat)
{
// Proceeds in array as long as 'N' nucleotide is present.
// Tries to skip Newline (LF) characters
// Eventually checks for new sequence and eventually
// sets flag (No initializing on new sequence).
// Eventually tries to refill array.
if(fatEmpty(fat))
return fat->state;
daStream *das=fat->das;
while(ACGTN[ (unsigned)*das->r_iter ] == nval)
{
++das->r_iter;
++fat->nN;
if(fatCheckFill(fat))
return fat->state;
//printf("[skip N] Post fill '%s'\n",das->r_iter);
// Function does not return
// with r_iter on newline
if(fat_checkNewLine(fat))
if(fat_skipNewLine(fat))
return fat->state;
}
// Unset N-flag
fat->state &= (~fat_loc);
return fat_ok;
}
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
//
// Routines:
// fat_getSeqName
// fat_findNextKmer
// fat_initSeq
//
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
char * fat_getSeqName(faTraverse *fat)
{
// From actual sequence delimiter:
// Copies text until empty space (or newline)
// into new char array.
// Char array will be returned und must be free'd from outside.
// Eventually tries to refill array
daStream *das = fat->das;
unsigned nchar = 0;
//printf("[fat_getSeqName] '%s'\n",fat->das->r_iter);
if(*das->r_iter == fat_char_newSeq)
{
char *seq_name = das->r_iter;
while( !( (*seq_name==' ') || (*seq_name==fat_char_lf) ) )
{
++seq_name;
++nchar;
// End of array reached
//printf("[fat_getSeqName] seq_name: '%s'\n",seq_name);
if(seq_name == das->r_end)
{
if(das_fill(das))
{
//printf("[fat_getSeqName] empty seq name.\n");
return 0;
}
// Restart search
seq_name = fat->das->r_iter;
nchar = 0;
}
}
if(nchar == 0)
{
//printf("[fat_getSeqName] empty seq name.\n");
return 0;
}
// Skip first char='>'
--nchar;
seq_name=fat->das->r_iter;
++seq_name;
//printf("[fat_getSeqName] copy: '%s'\tnchar=%u\n",das->r_iter,nchar);
// Must be free'd from outside
char *ret = calloc(sizeof(char), nchar + 1);
unsigned i;
for(i=0; i<nchar; ++i)
{
ret[i] = *seq_name;
++seq_name;
}
//printf("[seq name] success.\t'%s'\n",fat->das->r_iter);
return ret;
}
return 0;
}
// ToDo: Change into loop entry point from top_level
int fat_skipSeqHeader(faTraverse *fat)
{
if(fat->state & fat_newSeq)
{
++fat->nSeq;
fat->state &= (~fat_newSeq);
// Skip header line
if(fat_skipLine(fat))
return fat->state;
}
return fat_ok;
}
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
//
// Routines:
// fat_returnFromTranfer
// fat_procNuc
//
// + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + //
static R_INLINE int fat_retProcNuc(faTraverse *fat)
{
// Joint terminating routine for
// procNuc function.
daStream *das=fat->das;
//printf("[ret proc] plen: %lu\t'%s'\n",(das->p_iter-das->pos),das->pos);
if( (das->p_iter-das->pos) >= (fat->k) )
{
// A) At least k nucleotides copied
// -> Success:
// Add string terminator.
if((das->p_iter-das->pos) > 0)
{
das->npPos = (int)(das->p_iter - das->pos);
*das->p_iter = fat_char_eos;
}
// Set nuc_ready state.
fat->state |= fat_nucReady;
fat->state &= (~fat_loc);
//printf("[proc nuc] NUC READY: '%s'\n",das->pos);
return fat->state;
}
else
{
// B) Not enough nucleotides
// four counting
// -> Failure:
// Reset pos array.
das->p_iter = das->pos;
*das->p_iter = '\0';
// Set pos empty state
fat->state &= (~fat_nucReady);
fat->state |= fat_loc;
//printf("[proc nuc] Nuc_empty\n");
return fat_ok;
}
}
int fat_procNuc(faTraverse *fat)
{
// + + + + + + + + + + + + + + + + + + + + + + + + + + //
// Copies continuous Nucleotide sequence from
// input fasta stream (rfc array) to
// output nucleotide stream (pos array).
//
// Tries to recover when newline character is found or
// end of rfc-array is reached.
//
// Copying always starts at begin of nuc array
//
// Returns fat_ok (=0) when *NO* output is created.
//
// Allows 'if(fat_procNuc) => process output'
// testing.
// + + + + + + + + + + + + + + + + + + + + + + + + + + //
daStream *das = fat->das;
fat->state &= (~fat_nucReady);
//printf("[proc Nuc] start: '%s'\n",das->r_iter);
if((ACGT[ (unsigned)*das->r_iter ] != zval))
{
// Goto begin of proc array.
das->p_iter = das->pos;
//unsigned i=0,j;
while(!dasEmpty(das))
{
// Copy nucleotides
while( !( (ACGT[ (unsigned)*das->r_iter ] == zval) || dasEmpty(das) || dasProcEmpty(das) ) )
{
*das->p_iter = *das->r_iter;
++das->r_iter;
++das->p_iter;
}
// Two recoverable conditions:
if(fat_checkNewLine(fat))
{
//printf("[proc Nuc] New Line.\n");
if(fat_skipNewLine(fat))
return fat_retProcNuc(fat);
}
else if(fatEmpty(fat)) // Can't be checked with fatCheckFill
{
//printf("[proc Nuc] fatEmpty.\n");
if(das_fill(das))
return fat_retProcNuc(fat);
}
else // Anything else -> break
{
//printf("[proc Nuc] else ...\n");
return fat_retProcNuc(fat);
}
//printf("[proc Nuc] End while pos: '%s'\n",das->pos);
}
// Pos array is full
fat->state |= fat_nucReady;
fat->state &= (~fat_loc);
//printf("[proc Nuc] return fat_ok\n");
return fat_retProcNuc(fat);
//return fat->state;
}
else //if(fat_checkNewLine(fat))
{
//printf("[proc Nuc] AGCT->zval\n");
if(fat_checkNewLine(fat))
fat_skipNewLine(fat);
/*
if(fat_skipNewLine(fat))
{
Rprintf("[proc Nuc] Newline skipped: '%s'\n", das->r_iter);
}
else
Rprintf("[proc Nuc] Found ASCII %u\n", (unsigned char)(*fat->das->r_iter));
*/
}
fat->state |= fat_loc;
return fat->state;
}
int fat_findKarray(faTraverse *fat)
{
//fat->state&=(~fat_nucReady);
//printf("[fat_findKarray] Start: '%s'\t%u\tstrlen: %lu\n",fat->das->r_iter,fatEmpty(fat),strlen(fat->das->r_iter));
if(!fatEmpty(fat))
{
if(fat_checkN(fat))
{
//printf("[fat_findKarray] skipN.\n");
fat_skipN(fat);
}
if(fat_checkNewSeq(fat))
{
//printf("[fat_findKarray] New seq found.\n");
return fat->state;
}
if(fat_checkComment(fat))
{
fat_skipComment(fat);
//printf("[fat_findKarray] post cmt: '%s'\n",fat->das->r_iter);
}
if(fat_procNuc(fat))
{
//printf("[fat_findKarray] procNuc return state: %i\n",fat->state);
return fat->state;
}
/*
if(fat_checkNewLine(fat))
{
printf("[fat_findKarray] CheckNewLine!\n");
}
*/
}
//printf("[find Kar] Something else: '%s'\n",fat->das->r_iter);
return fat_ok;
}
#endif /* FA_TRAVERSE_H_ */
|