/* AXT - A simple alignment format with four lines per
 * alignment.  The first specifies the names of the
 * two sequences that align and the position of the
 * alignment, as well as the strand.  The next two
 * lines contain the alignment itself including dashes
 * for inserts.  The alignment is separated from the
 * next alignment with a blank line. 
 *
 * This file contains routines to read such alignments.
 * Note that though the coordinates are one based and
 * closed on disk, they get converted to our usual half
 * open zero based in memory. 
 *
 * This also contains code for simple DNA alignment scoring
 * schemes. 
 *
 * This file is copyright 2002 Jim Kent, but license is hereby
 * granted for all use - public, private or commercial. */

#ifndef AXT_H
#define AXT_H

#ifndef LINEFILE_H
#include "linefile.h"
#endif 

#ifndef DNASEQ_H
#include "dnaseq.h"
#endif

#ifndef CHAIN_H
#include "chain.h"
#endif

struct axt
/* This contains information about one xeno alignment. */
    {
    struct axt *next;
    char *qName;		/* Name of query sequence. */
    int qStart, qEnd;		/* Half open zero=based coordinates. */
    char qStrand;		/* Is this forward or reverse strand + or - */
    char *tName;		/* Name of target. */
    int tStart, tEnd;		/* Target coordinates. */
    char tStrand;               /* Target strand - currently always +. */
    int score;	                /* Score.  Zero for unknown.  Units arbitrary. */
    int symCount;               /* Size of alignments. */
    char *qSym, *tSym;          /* Alignments. */
    int frame;			/* If non-zero then translation frame. */
    };

void axtFree(struct axt **pEl);
/* Free an axt. */

void axtFreeList(struct axt **pList);
/* Free a list of dynamically allocated axt's */

struct axt *axtRead(struct lineFile *lf);
/* Read in next record from .axt file and return it.
 * Returns NULL at EOF. */

struct axt *axtReadWithPos(struct lineFile *lf, off_t *retOffset);
/* Read next axt, and if retOffset is not-NULL, fill it with
 * offset of start of axt. */

boolean axtCheck(struct axt *axt, struct lineFile *lf);
/* Return FALSE if there's a problem with axt. */

void axtWrite(struct axt *axt, FILE *f);
/* Output axt to axt file. */

int axtCmpQuery(const void *va, const void *vb);
/* Compare to sort based on query position. */

int axtCmpTarget(const void *va, const void *vb);
/* Compare to sort based on target position. */

int axtCmpScore(const void *va, const void *vb);
/* Compare to sort based on score. */

int axtCmpTargetScoreDesc(const void *va, const void *vb);
/* Compare to sort based on target name and score descending. */

struct axtScoreScheme
/* A scoring scheme or DNA alignment. */
    {
    struct scoreMatrix *next;
    int matrix[256][256];   /* Look up with letters. */
    int gapOpen;	/* Gap open cost. */
    int gapExtend;	/* Gap extension. */
    char *extra;        /* extra parameters */
    };

void axtScoreSchemeFree(struct axtScoreScheme **pObj);
/* Free up score scheme. */

struct axtScoreScheme *axtScoreSchemeDefault();
/* Return default scoring scheme (after blastz).  Do NOT axtScoreSchemeFree
 * this. */

struct axtScoreScheme *axtScoreSchemeSimpleDna(int match, int misMatch, int gapOpen, int gapExtend);
/* Return a relatively simple scoring scheme for DNA. */

struct axtScoreScheme *axtScoreSchemeRnaDefault();
/* Return default scoring scheme for RNA/DNA alignments
 * within the same species.  Do NOT axtScoreSchemeFree
 * this. */

struct axtScoreScheme *axtScoreSchemeFromBlastzMatrix(char *text, int gapOpen, int gapExtend);
/* return scoring schema from a string in the following format */
/* 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 */

struct axtScoreScheme *axtScoreSchemeRnaFill();
/* Return scoreing scheme a little more relaxed than 
 * RNA/DNA defaults for filling in gaps. */

struct axtScoreScheme *axtScoreSchemeProteinDefault();
/* Returns default protein scoring scheme.  This is
 * scaled to be compatible with the blastz one.  Don't
 * axtScoreSchemeFree this. */

struct axtScoreScheme *axtScoreSchemeProteinRead(char *fileName);
/* read in blosum-like matrix */

struct axtScoreScheme *axtScoreSchemeRead(char *fileName);
/* Read in scoring scheme from file. Looks like
    A    C    G    T
    91 -114  -31 -123
    -114  100 -125  -31
    -31 -125  100 -114
    -123  -31 -114   91
    O = 400, E = 30
 * axtScoreSchemeFree this when done. */

struct axtScoreScheme *axtScoreSchemeReadLf(struct lineFile *lf );
/* Read in scoring scheme from file. Looks like
    A    C    G    T
    91 -114  -31 -123
    -114  100 -125  -31
    -31 -125  100 -114
    -123  -31 -114   91
    O = 400, E = 30
 * axtScoreSchemeFree this when done. */

void axtScoreSchemeDnaWrite(struct axtScoreScheme *ss, FILE *f, char *name);
/* output the score dna based score matrix in meta Data format to File f,
name should be set to the name of the program that is using the matrix */

int axtScoreSym(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym);
/* Return score without setting up an axt structure. */

int axtScore(struct axt *axt, struct axtScoreScheme *ss);
/* Return calculated score of axt. */

int axtScoreFilterRepeats(struct axt *axt, struct axtScoreScheme *ss);
/* Return calculated score of axt. do not score gaps if they are repeat masked*/

int axtScoreUngapped(struct axtScoreScheme *ss, char *q, char *t, int size);
/* Score ungapped alignment. */

int axtScoreDnaDefault(struct axt *axt);
/* Score DNA-based axt using default scheme. */

int axtScoreProteinDefault(struct axt *axt);
/* Score protein-based axt using default scheme. */

boolean axtGetSubsetOnT(struct axt *axt, struct axt *axtOut,
			int newStart, int newEnd, struct axtScoreScheme *ss,
			boolean includeEdgeGaps);
/* Return FALSE if axt is not in the new range.  Otherwise, set axtOut to
 * a subset that goes from newStart to newEnd in target coordinates. 
 * If includeEdgeGaps, don't trim target gaps before or after the range. */

void axtSubsetOnT(struct axt *axt, int newStart, int newEnd, 
	struct axtScoreScheme *ss, FILE *f);
/* Write out subset of axt that goes from newStart to newEnd
 * in target coordinates. */

int axtTransPosToQ(struct axt *axt, int tPos);
/* Convert from t to q coordinates */

void axtSwap(struct axt *axt, int tSize, int qSize);
/* Flip target and query on one axt. */

struct axtBundle
/* A bunch of axt's on the same query/target sequence. */
    {
    struct axtBundle *next;
    struct axt *axtList;	/* List of alignments. */
    int tSize;			/* Size of target. */
    int qSize;			/* Size of query. */
    };

void axtBundleFree(struct axtBundle **pObj);
/* Free a axtBundle. */

void axtBundleFreeList(struct axtBundle **pList);
/* Free a list of gfAxtBundles. */

void axtBlastOut(struct axtBundle *abList, 
	int queryIx, boolean isProt, FILE *f, 
	char *databaseName, int databaseSeqCount, double databaseLetterCount, 
	char *blastType, char *ourId, double minIdentity);
/* Output a bundle of axt's on the same query sequence in blast format.
 * The parameters in detail are:
 *   ab - the list of bundles of axt's. 
 *   f  - output file handle
 *   databaseSeqCount - number of sequences in database
 *   databaseLetterCount - number of bases or aa's in database
 *   blastType - blast/wublast/blast8/blast9/xml
 *   ourId - optional (may be NULL) thing to put in header
 */

struct axt *axtAffine(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss);
/* Return alignment if any of query and target using scoring scheme.  This does
 * dynamic program affine-gap based alignment.  It's not suitable for very large
 * sequences. */

boolean axtAffineSmallEnough(double querySize, double targetSize);
/* Return TRUE if it is reasonable to align sequences of given sizes
 * with axtAffine. */


struct axt *axtAffine2Level(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss);
/* 

   Return alignment if any of query and target using scoring scheme. 
   
   2Level uses an economical amount of ram and should work for large target sequences.
   
   If Q is query size and T is target size and M is memory size, then
   Total memory used M = 30*Q*sqrt(T).  When the target is much larger than the query
   this method saves ram, and average runtime is only 50% greater, or 1.5 QT.  
   If Q=5000 and T=245,522,847 for hg17 chr1, then M = 2.2 GB ram.  
   axtAffine would need M=3QT = 3.4 TB.
   Of course massive alignments will be painfully slow anyway.

   Works for protein as well as DNA given the correct scoreScheme.
  
   NOTES:
   Double-gap cost is equal to gap-extend cost, but gap-open would also work.
   On very large target, score integer may overflow.
   Input sequences not checked for invalid chars.
   Input not checked but query should be shorter than target.
   
*/

void axtAddBlocksToBoxInList(struct cBlock **pList, struct axt *axt);
/* Add blocks (gapless subalignments) from (non-NULL!) axt to block list. 
 * Note: list will be in reverse order of axt blocks. */

void axtPrintTraditional(struct axt *axt, int maxLine, struct axtScoreScheme *ss, 
	FILE *f);
/* Print out an alignment with line-breaks. */

void axtPrintTraditionalExtra(struct axt *axt, int maxLine,
			      struct axtScoreScheme *ss, FILE *f,
			      boolean reverseTPos, boolean reverseQPos);
/* Print out an alignment with line-breaks.  If reverseTPos is true, then
 * the sequence has been reverse complemented, so show the coords starting
 * at tEnd and decrementing down to tStart; likewise for reverseQPos. */

double axtIdWithGaps(struct axt *axt);
/* Return ratio of matching bases to total symbols in alignment. */

double axtCoverage(struct axt *axt, int qSize, int tSize);
/* Return % of q and t covered. */

void axtOutPretty(struct axt *axt, int lineSize, FILE *f);
/* Output axt in pretty format. */

#endif /* AXT_H */