src/ucsc/axt.h
3387c13c
 /* AXT - A simple alignment format with four lines per
  * alignment.  The first specifies the names of the
  * two sequences that align and the position of the
  * alignment, as well as the strand.  The next two
  * lines contain the alignment itself including dashes
  * for inserts.  The alignment is separated from the
  * next alignment with a blank line. 
  *
  * This file contains routines to read such alignments.
  * Note that though the coordinates are one based and
  * closed on disk, they get converted to our usual half
  * open zero based in memory. 
  *
  * This also contains code for simple DNA alignment scoring
  * schemes. 
  *
  * This file is copyright 2002 Jim Kent, but license is hereby
  * granted for all use - public, private or commercial. */
 
 #ifndef AXT_H
 #define AXT_H
 
 #ifndef LINEFILE_H
 #include "linefile.h"
 #endif 
 
 #ifndef DNASEQ_H
 #include "dnaseq.h"
 #endif
 
 #ifndef CHAIN_H
 #include "chain.h"
 #endif
 
 struct axt
 /* This contains information about one xeno alignment. */
     {
     struct axt *next;
     char *qName;		/* Name of query sequence. */
     int qStart, qEnd;		/* Half open zero=based coordinates. */
     char qStrand;		/* Is this forward or reverse strand + or - */
     char *tName;		/* Name of target. */
     int tStart, tEnd;		/* Target coordinates. */
     char tStrand;               /* Target strand - currently always +. */
     int score;	                /* Score.  Zero for unknown.  Units arbitrary. */
     int symCount;               /* Size of alignments. */
     char *qSym, *tSym;          /* Alignments. */
     int frame;			/* If non-zero then translation frame. */
     };
 
 void axtFree(struct axt **pEl);
 /* Free an axt. */
 
 void axtFreeList(struct axt **pList);
 /* Free a list of dynamically allocated axt's */
 
 struct axt *axtRead(struct lineFile *lf);
 /* Read in next record from .axt file and return it.
  * Returns NULL at EOF. */
 
 struct axt *axtReadWithPos(struct lineFile *lf, off_t *retOffset);
 /* Read next axt, and if retOffset is not-NULL, fill it with
  * offset of start of axt. */
 
 boolean axtCheck(struct axt *axt, struct lineFile *lf);
 /* Return FALSE if there's a problem with axt. */
 
 void axtWrite(struct axt *axt, FILE *f);
 /* Output axt to axt file. */
 
 int axtCmpQuery(const void *va, const void *vb);
 /* Compare to sort based on query position. */
 
 int axtCmpTarget(const void *va, const void *vb);
 /* Compare to sort based on target position. */
 
 int axtCmpScore(const void *va, const void *vb);
 /* Compare to sort based on score. */
 
 int axtCmpTargetScoreDesc(const void *va, const void *vb);
 /* Compare to sort based on target name and score descending. */
 
 struct axtScoreScheme
 /* A scoring scheme or DNA alignment. */
     {
     struct scoreMatrix *next;
     int matrix[256][256];   /* Look up with letters. */
     int gapOpen;	/* Gap open cost. */
     int gapExtend;	/* Gap extension. */
     char *extra;        /* extra parameters */
     };
 
 void axtScoreSchemeFree(struct axtScoreScheme **pObj);
 /* Free up score scheme. */
 
 struct axtScoreScheme *axtScoreSchemeDefault();
 /* Return default scoring scheme (after blastz).  Do NOT axtScoreSchemeFree
  * this. */
 
 struct axtScoreScheme *axtScoreSchemeSimpleDna(int match, int misMatch, int gapOpen, int gapExtend);
 /* Return a relatively simple scoring scheme for DNA. */
 
 struct axtScoreScheme *axtScoreSchemeRnaDefault();
 /* Return default scoring scheme for RNA/DNA alignments
  * within the same species.  Do NOT axtScoreSchemeFree
  * this. */
 
 struct axtScoreScheme *axtScoreSchemeFromBlastzMatrix(char *text, int gapOpen, int gapExtend);
 /* return scoring schema from a string in the following format */
 /* 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 */
 
 struct axtScoreScheme *axtScoreSchemeRnaFill();
 /* Return scoreing scheme a little more relaxed than 
  * RNA/DNA defaults for filling in gaps. */
 
 struct axtScoreScheme *axtScoreSchemeProteinDefault();
 /* Returns default protein scoring scheme.  This is
  * scaled to be compatible with the blastz one.  Don't
  * axtScoreSchemeFree this. */
 
 struct axtScoreScheme *axtScoreSchemeProteinRead(char *fileName);
 /* read in blosum-like matrix */
 
 struct axtScoreScheme *axtScoreSchemeRead(char *fileName);
 /* Read in scoring scheme from file. Looks like
     A    C    G    T
     91 -114  -31 -123
     -114  100 -125  -31
     -31 -125  100 -114
     -123  -31 -114   91
     O = 400, E = 30
  * axtScoreSchemeFree this when done. */
 
 struct axtScoreScheme *axtScoreSchemeReadLf(struct lineFile *lf );
 /* Read in scoring scheme from file. Looks like
     A    C    G    T
     91 -114  -31 -123
     -114  100 -125  -31
     -31 -125  100 -114
     -123  -31 -114   91
     O = 400, E = 30
  * axtScoreSchemeFree this when done. */
 
 void axtScoreSchemeDnaWrite(struct axtScoreScheme *ss, FILE *f, char *name);
 /* output the score dna based score matrix in meta Data format to File f,
 name should be set to the name of the program that is using the matrix */
 
 int axtScoreSym(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym);
 /* Return score without setting up an axt structure. */
 
 int axtScore(struct axt *axt, struct axtScoreScheme *ss);
 /* Return calculated score of axt. */
 
 int axtScoreFilterRepeats(struct axt *axt, struct axtScoreScheme *ss);
 /* Return calculated score of axt. do not score gaps if they are repeat masked*/
 
 int axtScoreUngapped(struct axtScoreScheme *ss, char *q, char *t, int size);
 /* Score ungapped alignment. */
 
 int axtScoreDnaDefault(struct axt *axt);
 /* Score DNA-based axt using default scheme. */
 
 int axtScoreProteinDefault(struct axt *axt);
 /* Score protein-based axt using default scheme. */
 
 boolean axtGetSubsetOnT(struct axt *axt, struct axt *axtOut,
 			int newStart, int newEnd, struct axtScoreScheme *ss,
 			boolean includeEdgeGaps);
 /* Return FALSE if axt is not in the new range.  Otherwise, set axtOut to
  * a subset that goes from newStart to newEnd in target coordinates. 
  * If includeEdgeGaps, don't trim target gaps before or after the range. */
 
 void axtSubsetOnT(struct axt *axt, int newStart, int newEnd, 
 	struct axtScoreScheme *ss, FILE *f);
 /* Write out subset of axt that goes from newStart to newEnd
  * in target coordinates. */
 
 int axtTransPosToQ(struct axt *axt, int tPos);
 /* Convert from t to q coordinates */
 
 void axtSwap(struct axt *axt, int tSize, int qSize);
 /* Flip target and query on one axt. */
 
 struct axtBundle
 /* A bunch of axt's on the same query/target sequence. */
     {
     struct axtBundle *next;
     struct axt *axtList;	/* List of alignments. */
     int tSize;			/* Size of target. */
     int qSize;			/* Size of query. */
     };
 
 void axtBundleFree(struct axtBundle **pObj);
 /* Free a axtBundle. */
 
 void axtBundleFreeList(struct axtBundle **pList);
 /* Free a list of gfAxtBundles. */
 
 void axtBlastOut(struct axtBundle *abList, 
 	int queryIx, boolean isProt, FILE *f, 
 	char *databaseName, int databaseSeqCount, double databaseLetterCount, 
 	char *blastType, char *ourId, double minIdentity);
 /* Output a bundle of axt's on the same query sequence in blast format.
  * The parameters in detail are:
  *   ab - the list of bundles of axt's. 
  *   f  - output file handle
  *   databaseSeqCount - number of sequences in database
  *   databaseLetterCount - number of bases or aa's in database
  *   blastType - blast/wublast/blast8/blast9/xml
  *   ourId - optional (may be NULL) thing to put in header
  */
 
 struct axt *axtAffine(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss);
 /* Return alignment if any of query and target using scoring scheme.  This does
  * dynamic program affine-gap based alignment.  It's not suitable for very large
  * sequences. */
 
 boolean axtAffineSmallEnough(double querySize, double targetSize);
 /* Return TRUE if it is reasonable to align sequences of given sizes
  * with axtAffine. */
 
 
 struct axt *axtAffine2Level(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss);
 /* 
 
    Return alignment if any of query and target using scoring scheme. 
    
    2Level uses an economical amount of ram and should work for large target sequences.
    
    If Q is query size and T is target size and M is memory size, then
    Total memory used M = 30*Q*sqrt(T).  When the target is much larger than the query
    this method saves ram, and average runtime is only 50% greater, or 1.5 QT.  
    If Q=5000 and T=245,522,847 for hg17 chr1, then M = 2.2 GB ram.  
    axtAffine would need M=3QT = 3.4 TB.
    Of course massive alignments will be painfully slow anyway.
 
    Works for protein as well as DNA given the correct scoreScheme.
   
    NOTES:
    Double-gap cost is equal to gap-extend cost, but gap-open would also work.
    On very large target, score integer may overflow.
    Input sequences not checked for invalid chars.
    Input not checked but query should be shorter than target.
    
 */
 
 void axtAddBlocksToBoxInList(struct cBlock **pList, struct axt *axt);
 /* Add blocks (gapless subalignments) from (non-NULL!) axt to block list. 
  * Note: list will be in reverse order of axt blocks. */
 
 void axtPrintTraditional(struct axt *axt, int maxLine, struct axtScoreScheme *ss, 
 	FILE *f);
 /* Print out an alignment with line-breaks. */
 
 void axtPrintTraditionalExtra(struct axt *axt, int maxLine,
 			      struct axtScoreScheme *ss, FILE *f,
 			      boolean reverseTPos, boolean reverseQPos);
 /* Print out an alignment with line-breaks.  If reverseTPos is true, then
  * the sequence has been reverse complemented, so show the coords starting
  * at tEnd and decrementing down to tStart; likewise for reverseQPos. */
 
 double axtIdWithGaps(struct axt *axt);
 /* Return ratio of matching bases to total symbols in alignment. */
 
 double axtCoverage(struct axt *axt, int qSize, int tSize);
 /* Return % of q and t covered. */
 
 void axtOutPretty(struct axt *axt, int lineSize, FILE *f);
 /* Output axt in pretty format. */
 
 #endif /* AXT_H */