Browse code

initial version from github

Ge Tan authored on 25/02/2014 21:11:54
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,272 @@
1
+/* AXT - A simple alignment format with four lines per
2
+ * alignment.  The first specifies the names of the
3
+ * two sequences that align and the position of the
4
+ * alignment, as well as the strand.  The next two
5
+ * lines contain the alignment itself including dashes
6
+ * for inserts.  The alignment is separated from the
7
+ * next alignment with a blank line. 
8
+ *
9
+ * This file contains routines to read such alignments.
10
+ * Note that though the coordinates are one based and
11
+ * closed on disk, they get converted to our usual half
12
+ * open zero based in memory. 
13
+ *
14
+ * This also contains code for simple DNA alignment scoring
15
+ * schemes. 
16
+ *
17
+ * This file is copyright 2002 Jim Kent, but license is hereby
18
+ * granted for all use - public, private or commercial. */
19
+
20
+#ifndef AXT_H
21
+#define AXT_H
22
+
23
+#ifndef LINEFILE_H
24
+#include "linefile.h"
25
+#endif 
26
+
27
+#ifndef DNASEQ_H
28
+#include "dnaseq.h"
29
+#endif
30
+
31
+#ifndef CHAIN_H
32
+#include "chain.h"
33
+#endif
34
+
35
+struct axt
36
+/* This contains information about one xeno alignment. */
37
+    {
38
+    struct axt *next;
39
+    char *qName;		/* Name of query sequence. */
40
+    int qStart, qEnd;		/* Half open zero=based coordinates. */
41
+    char qStrand;		/* Is this forward or reverse strand + or - */
42
+    char *tName;		/* Name of target. */
43
+    int tStart, tEnd;		/* Target coordinates. */
44
+    char tStrand;               /* Target strand - currently always +. */
45
+    int score;	                /* Score.  Zero for unknown.  Units arbitrary. */
46
+    int symCount;               /* Size of alignments. */
47
+    char *qSym, *tSym;          /* Alignments. */
48
+    int frame;			/* If non-zero then translation frame. */
49
+    };
50
+
51
+void axtFree(struct axt **pEl);
52
+/* Free an axt. */
53
+
54
+void axtFreeList(struct axt **pList);
55
+/* Free a list of dynamically allocated axt's */
56
+
57
+struct axt *axtRead(struct lineFile *lf);
58
+/* Read in next record from .axt file and return it.
59
+ * Returns NULL at EOF. */
60
+
61
+struct axt *axtReadWithPos(struct lineFile *lf, off_t *retOffset);
62
+/* Read next axt, and if retOffset is not-NULL, fill it with
63
+ * offset of start of axt. */
64
+
65
+boolean axtCheck(struct axt *axt, struct lineFile *lf);
66
+/* Return FALSE if there's a problem with axt. */
67
+
68
+void axtWrite(struct axt *axt, FILE *f);
69
+/* Output axt to axt file. */
70
+
71
+int axtCmpQuery(const void *va, const void *vb);
72
+/* Compare to sort based on query position. */
73
+
74
+int axtCmpTarget(const void *va, const void *vb);
75
+/* Compare to sort based on target position. */
76
+
77
+int axtCmpScore(const void *va, const void *vb);
78
+/* Compare to sort based on score. */
79
+
80
+int axtCmpTargetScoreDesc(const void *va, const void *vb);
81
+/* Compare to sort based on target name and score descending. */
82
+
83
+struct axtScoreScheme
84
+/* A scoring scheme or DNA alignment. */
85
+    {
86
+    struct scoreMatrix *next;
87
+    int matrix[256][256];   /* Look up with letters. */
88
+    int gapOpen;	/* Gap open cost. */
89
+    int gapExtend;	/* Gap extension. */
90
+    char *extra;        /* extra parameters */
91
+    };
92
+
93
+void axtScoreSchemeFree(struct axtScoreScheme **pObj);
94
+/* Free up score scheme. */
95
+
96
+struct axtScoreScheme *axtScoreSchemeDefault();
97
+/* Return default scoring scheme (after blastz).  Do NOT axtScoreSchemeFree
98
+ * this. */
99
+
100
+struct axtScoreScheme *axtScoreSchemeSimpleDna(int match, int misMatch, int gapOpen, int gapExtend);
101
+/* Return a relatively simple scoring scheme for DNA. */
102
+
103
+struct axtScoreScheme *axtScoreSchemeRnaDefault();
104
+/* Return default scoring scheme for RNA/DNA alignments
105
+ * within the same species.  Do NOT axtScoreSchemeFree
106
+ * this. */
107
+
108
+struct axtScoreScheme *axtScoreSchemeFromBlastzMatrix(char *text, int gapOpen, int gapExtend);
109
+/* return scoring schema from a string in the following format */
110
+/* 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91 */
111
+
112
+struct axtScoreScheme *axtScoreSchemeRnaFill();
113
+/* Return scoreing scheme a little more relaxed than 
114
+ * RNA/DNA defaults for filling in gaps. */
115
+
116
+struct axtScoreScheme *axtScoreSchemeProteinDefault();
117
+/* Returns default protein scoring scheme.  This is
118
+ * scaled to be compatible with the blastz one.  Don't
119
+ * axtScoreSchemeFree this. */
120
+
121
+struct axtScoreScheme *axtScoreSchemeProteinRead(char *fileName);
122
+/* read in blosum-like matrix */
123
+
124
+struct axtScoreScheme *axtScoreSchemeRead(char *fileName);
125
+/* Read in scoring scheme from file. Looks like
126
+    A    C    G    T
127
+    91 -114  -31 -123
128
+    -114  100 -125  -31
129
+    -31 -125  100 -114
130
+    -123  -31 -114   91
131
+    O = 400, E = 30
132
+ * axtScoreSchemeFree this when done. */
133
+
134
+struct axtScoreScheme *axtScoreSchemeReadLf(struct lineFile *lf );
135
+/* Read in scoring scheme from file. Looks like
136
+    A    C    G    T
137
+    91 -114  -31 -123
138
+    -114  100 -125  -31
139
+    -31 -125  100 -114
140
+    -123  -31 -114   91
141
+    O = 400, E = 30
142
+ * axtScoreSchemeFree this when done. */
143
+
144
+void axtScoreSchemeDnaWrite(struct axtScoreScheme *ss, FILE *f, char *name);
145
+/* output the score dna based score matrix in meta Data format to File f,
146
+name should be set to the name of the program that is using the matrix */
147
+
148
+int axtScoreSym(struct axtScoreScheme *ss, int symCount, char *qSym, char *tSym);
149
+/* Return score without setting up an axt structure. */
150
+
151
+int axtScore(struct axt *axt, struct axtScoreScheme *ss);
152
+/* Return calculated score of axt. */
153
+
154
+int axtScoreFilterRepeats(struct axt *axt, struct axtScoreScheme *ss);
155
+/* Return calculated score of axt. do not score gaps if they are repeat masked*/
156
+
157
+int axtScoreUngapped(struct axtScoreScheme *ss, char *q, char *t, int size);
158
+/* Score ungapped alignment. */
159
+
160
+int axtScoreDnaDefault(struct axt *axt);
161
+/* Score DNA-based axt using default scheme. */
162
+
163
+int axtScoreProteinDefault(struct axt *axt);
164
+/* Score protein-based axt using default scheme. */
165
+
166
+boolean axtGetSubsetOnT(struct axt *axt, struct axt *axtOut,
167
+			int newStart, int newEnd, struct axtScoreScheme *ss,
168
+			boolean includeEdgeGaps);
169
+/* Return FALSE if axt is not in the new range.  Otherwise, set axtOut to
170
+ * a subset that goes from newStart to newEnd in target coordinates. 
171
+ * If includeEdgeGaps, don't trim target gaps before or after the range. */
172
+
173
+void axtSubsetOnT(struct axt *axt, int newStart, int newEnd, 
174
+	struct axtScoreScheme *ss, FILE *f);
175
+/* Write out subset of axt that goes from newStart to newEnd
176
+ * in target coordinates. */
177
+
178
+int axtTransPosToQ(struct axt *axt, int tPos);
179
+/* Convert from t to q coordinates */
180
+
181
+void axtSwap(struct axt *axt, int tSize, int qSize);
182
+/* Flip target and query on one axt. */
183
+
184
+struct axtBundle
185
+/* A bunch of axt's on the same query/target sequence. */
186
+    {
187
+    struct axtBundle *next;
188
+    struct axt *axtList;	/* List of alignments. */
189
+    int tSize;			/* Size of target. */
190
+    int qSize;			/* Size of query. */
191
+    };
192
+
193
+void axtBundleFree(struct axtBundle **pObj);
194
+/* Free a axtBundle. */
195
+
196
+void axtBundleFreeList(struct axtBundle **pList);
197
+/* Free a list of gfAxtBundles. */
198
+
199
+void axtBlastOut(struct axtBundle *abList, 
200
+	int queryIx, boolean isProt, FILE *f, 
201
+	char *databaseName, int databaseSeqCount, double databaseLetterCount, 
202
+	char *blastType, char *ourId, double minIdentity);
203
+/* Output a bundle of axt's on the same query sequence in blast format.
204
+ * The parameters in detail are:
205
+ *   ab - the list of bundles of axt's. 
206
+ *   f  - output file handle
207
+ *   databaseSeqCount - number of sequences in database
208
+ *   databaseLetterCount - number of bases or aa's in database
209
+ *   blastType - blast/wublast/blast8/blast9/xml
210
+ *   ourId - optional (may be NULL) thing to put in header
211
+ */
212
+
213
+struct axt *axtAffine(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss);
214
+/* Return alignment if any of query and target using scoring scheme.  This does
215
+ * dynamic program affine-gap based alignment.  It's not suitable for very large
216
+ * sequences. */
217
+
218
+boolean axtAffineSmallEnough(double querySize, double targetSize);
219
+/* Return TRUE if it is reasonable to align sequences of given sizes
220
+ * with axtAffine. */
221
+
222
+
223
+struct axt *axtAffine2Level(bioSeq *query, bioSeq *target, struct axtScoreScheme *ss);
224
+/* 
225
+
226
+   Return alignment if any of query and target using scoring scheme. 
227
+   
228
+   2Level uses an economical amount of ram and should work for large target sequences.
229
+   
230
+   If Q is query size and T is target size and M is memory size, then
231
+   Total memory used M = 30*Q*sqrt(T).  When the target is much larger than the query
232
+   this method saves ram, and average runtime is only 50% greater, or 1.5 QT.  
233
+   If Q=5000 and T=245,522,847 for hg17 chr1, then M = 2.2 GB ram.  
234
+   axtAffine would need M=3QT = 3.4 TB.
235
+   Of course massive alignments will be painfully slow anyway.
236
+
237
+   Works for protein as well as DNA given the correct scoreScheme.
238
+  
239
+   NOTES:
240
+   Double-gap cost is equal to gap-extend cost, but gap-open would also work.
241
+   On very large target, score integer may overflow.
242
+   Input sequences not checked for invalid chars.
243
+   Input not checked but query should be shorter than target.
244
+   
245
+*/
246
+
247
+void axtAddBlocksToBoxInList(struct cBlock **pList, struct axt *axt);
248
+/* Add blocks (gapless subalignments) from (non-NULL!) axt to block list. 
249
+ * Note: list will be in reverse order of axt blocks. */
250
+
251
+void axtPrintTraditional(struct axt *axt, int maxLine, struct axtScoreScheme *ss, 
252
+	FILE *f);
253
+/* Print out an alignment with line-breaks. */
254
+
255
+void axtPrintTraditionalExtra(struct axt *axt, int maxLine,
256
+			      struct axtScoreScheme *ss, FILE *f,
257
+			      boolean reverseTPos, boolean reverseQPos);
258
+/* Print out an alignment with line-breaks.  If reverseTPos is true, then
259
+ * the sequence has been reverse complemented, so show the coords starting
260
+ * at tEnd and decrementing down to tStart; likewise for reverseQPos. */
261
+
262
+double axtIdWithGaps(struct axt *axt);
263
+/* Return ratio of matching bases to total symbols in alignment. */
264
+
265
+double axtCoverage(struct axt *axt, int qSize, int tSize);
266
+/* Return % of q and t covered. */
267
+
268
+void axtOutPretty(struct axt *axt, int lineSize, FILE *f);
269
+/* Output axt in pretty format. */
270
+
271
+#endif /* AXT_H */
272
+