msa
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,313 @@ |
1 |
+/***************************************************************** |
|
2 |
+ * SQUID - a library of functions for biological sequence analysis |
|
3 |
+ * Copyright (C) 1992-2002 Washington University School of Medicine |
|
4 |
+ * |
|
5 |
+ * This source code is freely distributed under the terms of the |
|
6 |
+ * GNU General Public License. See the files COPYRIGHT and LICENSE |
|
7 |
+ * for details. |
|
8 |
+ *****************************************************************/ |
|
9 |
+ |
|
10 |
+#ifndef SQUID_MSA_INCLUDED |
|
11 |
+#define SQUID_MSA_INCLUDED |
|
12 |
+ |
|
13 |
+/* msa.h |
|
14 |
+ * SRE, Mon May 17 10:24:30 1999 |
|
15 |
+ * |
|
16 |
+ * Header file for SQUID's multiple sequence alignment |
|
17 |
+ * manipulation code. |
|
18 |
+ * |
|
19 |
+ * RCS $Id: msa.h 277 2013-05-16 15:42:49Z fabian $ (Original squid RCS Id: msa.h,v 1.12 2002/10/12 04:40:35 eddy Exp) |
|
20 |
+ */ |
|
21 |
+ |
|
22 |
+#include <stdio.h> /* FILE support */ |
|
23 |
+#include "gki.h" /* hash table support */ |
|
24 |
+#include "ssi.h" /* sequence file index support */ |
|
25 |
+#include "squid.h" /* need SQINFO */ |
|
26 |
+ |
|
27 |
+/**************************************************** |
|
28 |
+ * Obsolete alignment information, AINFO |
|
29 |
+ * Superceded by MSA structure further below; but we |
|
30 |
+ * need AINFO for the near future for backwards |
|
31 |
+ * compatibility. |
|
32 |
+ ****************************************************/ |
|
33 |
+/* Structure: aliinfo_s |
|
34 |
+ * |
|
35 |
+ * Purpose: Optional information returned from an alignment file. |
|
36 |
+ * |
|
37 |
+ * flags: always used. Flags for which info is valid/alloced. |
|
38 |
+ * |
|
39 |
+ * alen: mandatory. Alignments are always flushed right |
|
40 |
+ * with gaps so that all aseqs are the same length, alen. |
|
41 |
+ * Available for all alignment formats. |
|
42 |
+ * |
|
43 |
+ * nseq: mandatory. Aligned seqs are indexed 0..nseq-1. |
|
44 |
+ * |
|
45 |
+ * wgt: 0..nseq-1 vector of sequence weights. Mandatory. |
|
46 |
+ * If not explicitly set, weights are initialized to 1.0. |
|
47 |
+ * |
|
48 |
+ * cs: 0..alen-1, just like the alignment. Contains single-letter |
|
49 |
+ * secondary structure codes for consensus structure; "<>^+" |
|
50 |
+ * for RNA, "EHL." for protein. May be NULL if unavailable |
|
51 |
+ * from seqfile. Only available for SELEX format files. |
|
52 |
+ * |
|
53 |
+ * rf: 0..alen-1, just like the alignment. rf is an arbitrary string |
|
54 |
+ * of characters, used for annotating columns. Blanks are |
|
55 |
+ * interpreted as non-canonical columns and anything else is |
|
56 |
+ * considered canonical. Only available from SELEX files. |
|
57 |
+ * |
|
58 |
+ * sqinfo: mandatory. Array of 0..nseq-1 |
|
59 |
+ * per-sequence information structures, carrying |
|
60 |
+ * name, id, accession, coords. |
|
61 |
+ * |
|
62 |
+ */ |
|
63 |
+struct aliinfo_s { |
|
64 |
+ int flags; /* flags for what info is valid */ |
|
65 |
+ int alen; /* length of alignment (columns) */ |
|
66 |
+ int nseq; /* number of seqs in alignment */ |
|
67 |
+ float *wgt; /* sequence weights [0..nseq-1] */ |
|
68 |
+ char *cs; /* consensus secondary structure string */ |
|
69 |
+ char *rf; /* reference coordinate system */ |
|
70 |
+ struct seqinfo_s *sqinfo; /* name, id, coord info for each sequence */ |
|
71 |
+ |
|
72 |
+ /* Pfam/HMMER pick-ups */ |
|
73 |
+ char *name; /* name of alignment */ |
|
74 |
+ char *desc; /* description of alignment */ |
|
75 |
+ char *acc; /* accession of alignment */ |
|
76 |
+ char *au; /* "author" information */ |
|
77 |
+ float tc1, tc2; /* trusted score cutoffs (per-seq, per-domain) */ |
|
78 |
+ float nc1, nc2; /* noise score cutoffs (per-seq, per-domain) */ |
|
79 |
+ float ga1, ga2; /* gathering cutoffs */ |
|
80 |
+}; |
|
81 |
+typedef struct aliinfo_s AINFO; |
|
82 |
+#define AINFO_TC (1 << 0) |
|
83 |
+#define AINFO_NC (1 << 1) |
|
84 |
+#define AINFO_GA (1 << 2) |
|
85 |
+ |
|
86 |
+/***************************************************************** |
|
87 |
+ * MSA |
|
88 |
+ * SRE, Sun Jun 27 15:03:35 1999 [TW 723 over Greenland] |
|
89 |
+ * |
|
90 |
+ * Defines the new data structure and API for multiple |
|
91 |
+ * sequence alignment i/o. |
|
92 |
+ *****************************************************************/ |
|
93 |
+ |
|
94 |
+/* The following constants define the Pfam/Rfam cutoff set we'll propagate |
|
95 |
+ * from msa's into HMMER and Infernal models. |
|
96 |
+ */ |
|
97 |
+#define MSA_CUTOFF_TC1 0 |
|
98 |
+#define MSA_CUTOFF_TC2 1 |
|
99 |
+#define MSA_CUTOFF_GA1 2 |
|
100 |
+#define MSA_CUTOFF_GA2 3 |
|
101 |
+#define MSA_CUTOFF_NC1 4 |
|
102 |
+#define MSA_CUTOFF_NC2 5 |
|
103 |
+#define MSA_MAXCUTOFFS 6 |
|
104 |
+ |
|
105 |
+/* Structure: MSA |
|
106 |
+ * SRE, Tue May 18 11:33:08 1999 |
|
107 |
+ * |
|
108 |
+ * Our object for a multiple sequence alignment. |
|
109 |
+ */ |
|
110 |
+typedef struct msa_struct { |
|
111 |
+ /* Mandatory information associated with the alignment. |
|
112 |
+ */ |
|
113 |
+ char **aseq; /* the alignment itself, [0..nseq-1][0..alen-1] */ |
|
114 |
+ char **sqname; /* names of sequences, [0..nseq-1][0..alen-1] */ |
|
115 |
+ float *wgt; /* sequence weights [0..nseq-1] */ |
|
116 |
+ int alen; /* length of alignment (columns) */ |
|
117 |
+ int nseq; /* number of seqs in alignment */ |
|
118 |
+ |
|
119 |
+ /* Optional information that we understand, and might have. |
|
120 |
+ */ |
|
121 |
+ int flags; /* flags for what optional info is valid */ |
|
122 |
+ int type; /* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */ |
|
123 |
+ char *name; /* name of alignment, or NULL */ |
|
124 |
+ char *desc; /* description of alignment, or NULL */ |
|
125 |
+ char *acc; /* accession of alignment, or NULL */ |
|
126 |
+ char *au; /* "author" information, or NULL */ |
|
127 |
+ char *ss_cons; /* consensus secondary structure string, or NULL */ |
|
128 |
+ char *sa_cons; /* consensus surface accessibility string, or NULL */ |
|
129 |
+ char *rf; /* reference coordinate system, or NULL */ |
|
130 |
+ char **sqacc; /* accession numbers for individual sequences */ |
|
131 |
+ char **sqdesc; /* description lines for individual sequences */ |
|
132 |
+ char **ss; /* per-seq secondary structure annotation, or NULL */ |
|
133 |
+ char **sa; /* per-seq surface accessibility annotation, or NULL */ |
|
134 |
+ float cutoff[MSA_MAXCUTOFFS]; /* NC, TC, GA cutoffs propagated to Pfam/Rfam */ |
|
135 |
+ int cutoff_is_set[MSA_MAXCUTOFFS];/* TRUE if a cutoff is set; else FALSE */ |
|
136 |
+ |
|
137 |
+ /* Optional information that we don't understand. |
|
138 |
+ * That is, we know what type of information it is, but it's |
|
139 |
+ * either (interpreted as) free-text comment, or it's Stockholm |
|
140 |
+ * markup with unfamiliar tags. |
|
141 |
+ */ |
|
142 |
+ char **comment; /* free text comments, or NULL */ |
|
143 |
+ int ncomment; /* number of comment lines */ |
|
144 |
+ int alloc_ncomment; /* number of comment lines alloc'ed */ |
|
145 |
+ |
|
146 |
+ char **gf_tag; /* markup tags for unparsed #=GF lines */ |
|
147 |
+ char **gf; /* annotations for unparsed #=GF lines */ |
|
148 |
+ int ngf; /* number of unparsed #=GF lines */ |
|
149 |
+ int alloc_ngf; /* number of gf lines alloc'ed */ |
|
150 |
+ |
|
151 |
+ char **gs_tag; /* markup tags for unparsed #=GS lines */ |
|
152 |
+ char ***gs; /* [0..ngs-1][0..nseq-1][free text] markup */ |
|
153 |
+ GKI *gs_idx; /* hash of #=GS tag types */ |
|
154 |
+ int ngs; /* number of #=GS tag types */ |
|
155 |
+ |
|
156 |
+ char **gc_tag; /* markup tags for unparsed #=GC lines */ |
|
157 |
+ char **gc; /* [0..ngc-1][0..alen-1] markup */ |
|
158 |
+ GKI *gc_idx; /* hash of #=GC tag types */ |
|
159 |
+ int ngc; /* number of #=GC tag types */ |
|
160 |
+ |
|
161 |
+ char **gr_tag; /* markup tags for unparsed #=GR lines */ |
|
162 |
+ char ***gr; /* [0..ngr][0..nseq-1][0..alen-1] markup */ |
|
163 |
+ GKI *gr_idx; /* hash of #=GR tag types */ |
|
164 |
+ int ngr; /* number of #=GR tag types */ |
|
165 |
+ |
|
166 |
+ /* Stuff we need for our own maintenance of the data structure |
|
167 |
+ */ |
|
168 |
+ GKI *index; /* name ->seqidx hash table */ |
|
169 |
+ int nseqalloc; /* number of seqs currently allocated for */ |
|
170 |
+ int nseqlump; /* lump size for dynamic expansions of nseq */ |
|
171 |
+ int *sqlen; /* individual sequence lengths during parsing */ |
|
172 |
+ int *sslen; /* individual ss lengths during parsing */ |
|
173 |
+ int *salen; /* individual sa lengths during parsing */ |
|
174 |
+ int lastidx; /* last index we saw; use for guessing next */ |
|
175 |
+} MSA; |
|
176 |
+#define MSA_SET_WGT (1 << 0) /* track whether wgts were set, or left at default 1.0 */ |
|
177 |
+ |
|
178 |
+ |
|
179 |
+/* Structure: MSAFILE |
|
180 |
+ * SRE, Tue May 18 11:36:54 1999 |
|
181 |
+ * |
|
182 |
+ * Defines an alignment file that's open for reading. |
|
183 |
+ */ |
|
184 |
+typedef struct msafile_struct { |
|
185 |
+ FILE *f; /* open file pointer */ |
|
186 |
+ char *fname; /* name of file. used for diagnostic output */ |
|
187 |
+ int linenumber; /* what line are we on in the file */ |
|
188 |
+ |
|
189 |
+ char *buf; /* buffer for line input w/ sre_fgets() */ |
|
190 |
+ int buflen; /* current allocated length for buf */ |
|
191 |
+ |
|
192 |
+ SSIFILE *ssi; /* open SSI index file; or NULL, if none. */ |
|
193 |
+ |
|
194 |
+ int do_gzip; /* TRUE if f is a pipe from gzip -dc (need pclose(f)) */ |
|
195 |
+ int do_stdin; /* TRUE if f is stdin (don't close f, not our problem) */ |
|
196 |
+ int format; /* format of alignment file we're reading */ |
|
197 |
+} MSAFILE; |
|
198 |
+ |
|
199 |
+ |
|
200 |
+/* Alignment file formats. |
|
201 |
+ * Must coexist with sqio.c/squid.h unaligned file format codes. |
|
202 |
+ * Rules: |
|
203 |
+ * - 0 is an unknown/unassigned format |
|
204 |
+ * - <100 reserved for unaligned formats |
|
205 |
+ * - >100 reserved for aligned formats |
|
206 |
+ */ |
|
207 |
+#define MSAFILE_UNKNOWN 0 /* unknown format */ |
|
208 |
+#define MSAFILE_STOCKHOLM 101 /* Pfam/HMMER's Stockholm format */ |
|
209 |
+#define MSAFILE_SELEX 102 /* Obsolete(!): old HMMER/SELEX format */ |
|
210 |
+#define MSAFILE_MSF 103 /* GCG MSF format */ |
|
211 |
+#define MSAFILE_CLUSTAL 104 /* Clustal V/W format */ |
|
212 |
+#define MSAFILE_A2M 105 /* aligned FASTA (A2M is UCSC terminology) */ |
|
213 |
+#define MSAFILE_PHYLIP 106 /* Felsenstein's PHYLIP format */ |
|
214 |
+#define MSAFILE_EPS 107 /* Encapsulated PostScript (output only) */ |
|
215 |
+#ifdef CLUSTALO |
|
216 |
+#define MSAFILE_VIENNA 108 /* Vienna: concatenated fasta */ |
|
217 |
+#endif |
|
218 |
+ |
|
219 |
+#define IsAlignmentFormat(fmt) ((fmt) > 100) |
|
220 |
+ |
|
221 |
+ |
|
222 |
+/* from msa.c |
|
223 |
+ */ |
|
224 |
+extern MSAFILE *MSAFileOpen(char *filename, int format, char *env); |
|
225 |
+extern MSA *MSAFileRead(MSAFILE *afp); |
|
226 |
+extern void MSAFileClose(MSAFILE *afp); |
|
227 |
+extern void MSAFree(MSA *msa); |
|
228 |
+#ifdef CLUSTALO |
|
229 |
+extern void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline, int iWrap, int bResno); |
|
230 |
+#else |
|
231 |
+extern void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline); |
|
232 |
+#endif |
|
233 |
+ |
|
234 |
+extern int MSAFileRewind(MSAFILE *afp); |
|
235 |
+extern int MSAFilePositionByKey(MSAFILE *afp, char *key); |
|
236 |
+extern int MSAFilePositionByIndex(MSAFILE *afp, int idx); |
|
237 |
+ |
|
238 |
+extern int MSAFileFormat(MSAFILE *afp); |
|
239 |
+extern MSA *MSAAlloc(int nseq, int alen); |
|
240 |
+extern void MSAExpand(MSA *msa); |
|
241 |
+extern char *MSAFileGetLine(MSAFILE *afp); |
|
242 |
+extern void MSASetSeqAccession(MSA *msa, int seqidx, char *acc); |
|
243 |
+extern void MSASetSeqDescription(MSA *msa, int seqidx, char *desc); |
|
244 |
+extern void MSAAddComment(MSA *msa, char *s); |
|
245 |
+extern void MSAAddGF(MSA *msa, char *tag, char *value); |
|
246 |
+extern void MSAAddGS(MSA *msa, char *tag, int seqidx, char *value); |
|
247 |
+extern void MSAAppendGC(MSA *msa, char *tag, char *value); |
|
248 |
+extern char *MSAGetGC(MSA *msa, char *tag); |
|
249 |
+extern void MSAAppendGR(MSA *msa, char *tag, int seqidx, char *value); |
|
250 |
+extern void MSAVerifyParse(MSA *msa); |
|
251 |
+extern int MSAGetSeqidx(MSA *msa, char *name, int guess); |
|
252 |
+ |
|
253 |
+extern MSA *MSAFromAINFO(char **aseq, AINFO *ainfo); |
|
254 |
+ |
|
255 |
+extern void MSAMingap(MSA *msa); |
|
256 |
+extern void MSANogap(MSA *msa); |
|
257 |
+extern void MSAShorterAlignment(MSA *msa, int *useme); |
|
258 |
+extern void MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new); |
|
259 |
+ |
|
260 |
+extern char *MSAGetSeqAccession(MSA *msa, int idx); |
|
261 |
+extern char *MSAGetSeqDescription(MSA *msa, int idx); |
|
262 |
+extern char *MSAGetSeqSS(MSA *msa, int idx); |
|
263 |
+extern char *MSAGetSeqSA(MSA *msa, int idx); |
|
264 |
+ |
|
265 |
+extern float MSAAverageSequenceLength(MSA *msa); |
|
266 |
+ |
|
267 |
+/* from a2m.c |
|
268 |
+ */ |
|
269 |
+extern MSA *ReadA2M(MSAFILE *afp); |
|
270 |
+#ifdef CLUSTALO |
|
271 |
+//extern void WriteA2M(FILE *fp, MSA *msa, int vienna); |
|
272 |
+extern void WriteA2M(FILE *fp, MSA *msa, int iWrap); |
|
273 |
+#else |
|
274 |
+extern void WriteA2M(FILE *fp, MSA *msa); |
|
275 |
+#endif |
|
276 |
+/* from clustal.c |
|
277 |
+ */ |
|
278 |
+extern MSA *ReadClustal(MSAFILE *afp); |
|
279 |
+#ifdef CLUSTALO |
|
280 |
+extern void WriteClustal(FILE *fp, MSA *msa, int iWrap, int bResno); |
|
281 |
+extern void WriteClustalForR(MSA *msa, int iWrap, int bResno, int *result_c, char ***result_v); |
|
282 |
+#else |
|
283 |
+extern void WriteClustal(FILE *fp, MSA *msa); |
|
284 |
+extern void WriteClustalForR(MSA *msa, int *result_c, char ***result_v); |
|
285 |
+#endif |
|
286 |
+ |
|
287 |
+/* from eps.c |
|
288 |
+ */ |
|
289 |
+extern void EPSWriteSmallMSA(FILE *fp, MSA *msa); |
|
290 |
+ |
|
291 |
+/* from msf.c |
|
292 |
+ */ |
|
293 |
+extern MSA *ReadMSF(MSAFILE *afp); |
|
294 |
+extern void WriteMSF(FILE *fp, MSA *msa); |
|
295 |
+ |
|
296 |
+/* from phylip.c |
|
297 |
+ */ |
|
298 |
+extern MSA *ReadPhylip(MSAFILE *afp); |
|
299 |
+extern void WritePhylip(FILE *fp, MSA *msa); |
|
300 |
+ |
|
301 |
+/* from selex.c |
|
302 |
+ */ |
|
303 |
+extern MSA *ReadSELEX(MSAFILE *afp); |
|
304 |
+extern void WriteSELEX(FILE *fp, MSA *msa); |
|
305 |
+extern void WriteSELEXOneBlock(FILE *fp, MSA *msa); |
|
306 |
+ |
|
307 |
+/* from stockholm.c |
|
308 |
+ */ |
|
309 |
+extern MSA *ReadStockholm(MSAFILE *afp); |
|
310 |
+extern void WriteStockholm(FILE *fp, MSA *msa); |
|
311 |
+extern void WriteStockholmOneBlock(FILE *fp, MSA *msa); |
|
312 |
+ |
|
313 |
+#endif /*SQUID_MSA_INCLUDED*/ |