Browse code

add package to the repository

msa


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358

Sonali Arora authored on 10/04/2015 00:12:33
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,313 @@
1
+/*****************************************************************
2
+ * SQUID - a library of functions for biological sequence analysis
3
+ * Copyright (C) 1992-2002 Washington University School of Medicine
4
+ * 
5
+ *     This source code is freely distributed under the terms of the
6
+ *     GNU General Public License. See the files COPYRIGHT and LICENSE
7
+ *     for details.
8
+ *****************************************************************/
9
+
10
+#ifndef SQUID_MSA_INCLUDED
11
+#define SQUID_MSA_INCLUDED
12
+
13
+/* msa.h
14
+ * SRE, Mon May 17 10:24:30 1999
15
+ * 
16
+ * Header file for SQUID's multiple sequence alignment 
17
+ * manipulation code.
18
+ * 
19
+ * RCS $Id: msa.h 277 2013-05-16 15:42:49Z fabian $ (Original squid RCS Id: msa.h,v 1.12 2002/10/12 04:40:35 eddy Exp)
20
+ */
21
+
22
+#include <stdio.h>		/* FILE support */
23
+#include "gki.h"		/* hash table support */
24
+#include "ssi.h"		/* sequence file index support */
25
+#include "squid.h"		/* need SQINFO */
26
+
27
+/****************************************************
28
+ * Obsolete alignment information, AINFO
29
+ * Superceded by MSA structure further below; but we
30
+ * need AINFO for the near future for backwards
31
+ * compatibility.
32
+ ****************************************************/
33
+/* Structure: aliinfo_s
34
+ * 
35
+ * Purpose:   Optional information returned from an alignment file.
36
+ * 
37
+ *            flags: always used. Flags for which info is valid/alloced.
38
+ *       
39
+ *            alen: mandatory. Alignments are always flushed right
40
+ *                  with gaps so that all aseqs are the same length, alen.
41
+ *                  Available for all alignment formats.
42
+ *
43
+ *            nseq: mandatory. Aligned seqs are indexed 0..nseq-1. 
44
+ *                  
45
+ *            wgt:  0..nseq-1 vector of sequence weights. Mandatory.
46
+ *                  If not explicitly set, weights are initialized to 1.0.
47
+ *
48
+ *            cs:   0..alen-1, just like the alignment. Contains single-letter
49
+ *                  secondary structure codes for consensus structure; "<>^+"
50
+ *                  for RNA, "EHL." for protein. May be NULL if unavailable
51
+ *                  from seqfile. Only available for SELEX format files.
52
+ *                  
53
+ *            rf:   0..alen-1, just like the alignment. rf is an arbitrary string
54
+ *                  of characters, used for annotating columns. Blanks are
55
+ *                  interpreted as non-canonical columns and anything else is
56
+ *                  considered canonical. Only available from SELEX files.
57
+ *                  
58
+ *            sqinfo: mandatory. Array of 0..nseq-1 
59
+ *                  per-sequence information structures, carrying
60
+ *                  name, id, accession, coords.
61
+ *                  
62
+ */
63
+struct aliinfo_s {		
64
+  int               flags;      /* flags for what info is valid             */
65
+  int               alen;	/* length of alignment (columns)            */
66
+  int               nseq;       /* number of seqs in alignment              */
67
+  float            *wgt;	/* sequence weights [0..nseq-1]             */
68
+  char             *cs;         /* consensus secondary structure string     */
69
+  char             *rf;         /* reference coordinate system              */
70
+  struct seqinfo_s *sqinfo;     /* name, id, coord info for each sequence   */
71
+
72
+        /* Pfam/HMMER pick-ups */	
73
+  char  *name;			/* name of alignment        */
74
+  char  *desc;			/* description of alignment */
75
+  char  *acc;			/* accession of alignment   */
76
+  char  *au;			/* "author" information     */
77
+  float  tc1, tc2;		/* trusted score cutoffs (per-seq, per-domain) */
78
+  float  nc1, nc2;		/* noise score cutoffs (per-seq, per-domain)   */
79
+  float  ga1, ga2;		/* gathering cutoffs */
80
+};
81
+typedef struct aliinfo_s AINFO;
82
+#define AINFO_TC      (1 << 0)
83
+#define AINFO_NC      (1 << 1)
84
+#define AINFO_GA      (1 << 2)
85
+
86
+/*****************************************************************
87
+ * MSA  
88
+ * SRE, Sun Jun 27 15:03:35 1999 [TW 723 over Greenland]
89
+ * 
90
+ * Defines the new data structure and API for multiple
91
+ * sequence alignment i/o.
92
+ *****************************************************************/
93
+
94
+/* The following constants define the Pfam/Rfam cutoff set we'll propagate
95
+ * from msa's into HMMER and Infernal models.
96
+ */
97
+#define MSA_CUTOFF_TC1 0
98
+#define MSA_CUTOFF_TC2 1
99
+#define MSA_CUTOFF_GA1 2
100
+#define MSA_CUTOFF_GA2 3
101
+#define MSA_CUTOFF_NC1 4
102
+#define MSA_CUTOFF_NC2 5
103
+#define MSA_MAXCUTOFFS 6
104
+
105
+/* Structure: MSA
106
+ * SRE, Tue May 18 11:33:08 1999
107
+ * 
108
+ * Our object for a multiple sequence alignment.
109
+ */
110
+typedef struct msa_struct {
111
+  /* Mandatory information associated with the alignment.
112
+   */
113
+  char **aseq;                  /* the alignment itself, [0..nseq-1][0..alen-1] */
114
+  char **sqname;                /* names of sequences, [0..nseq-1][0..alen-1]   */
115
+  float *wgt;	                /* sequence weights [0..nseq-1]                 */
116
+  int    alen;			/* length of alignment (columns)                */
117
+  int    nseq;			/* number of seqs in alignment                  */
118
+
119
+  /* Optional information that we understand, and might have.
120
+   */
121
+  int    flags;			/* flags for what optional info is valid    */
122
+  int    type;			/* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */
123
+  char  *name;             	/* name of alignment, or NULL */
124
+  char  *desc;	                /* description of alignment, or NULL */
125
+  char  *acc;	                /* accession of alignment, or NULL */
126
+  char  *au;		        /* "author" information, or NULL */
127
+  char  *ss_cons;		/* consensus secondary structure string, or NULL */
128
+  char  *sa_cons;               /* consensus surface accessibility string, or NULL */
129
+  char  *rf;                    /* reference coordinate system, or NULL */
130
+  char **sqacc;			/* accession numbers for individual sequences */
131
+  char **sqdesc;		/* description lines for individual sequences */
132
+  char **ss;                    /* per-seq secondary structure annotation, or NULL */
133
+  char **sa;                    /* per-seq surface accessibility annotation, or NULL */
134
+  float  cutoff[MSA_MAXCUTOFFS];       /* NC, TC, GA cutoffs propagated to Pfam/Rfam */
135
+  int    cutoff_is_set[MSA_MAXCUTOFFS];/* TRUE if a cutoff is set; else FALSE */
136
+
137
+  /* Optional information that we don't understand.
138
+   * That is, we know what type of information it is, but it's
139
+   * either (interpreted as) free-text comment, or it's Stockholm 
140
+   * markup with unfamiliar tags.
141
+   */
142
+  char  **comment;              /* free text comments, or NULL      */
143
+  int     ncomment;		/* number of comment lines          */
144
+  int     alloc_ncomment;	/* number of comment lines alloc'ed */
145
+
146
+  char  **gf_tag;               /* markup tags for unparsed #=GF lines  */
147
+  char  **gf;                   /* annotations for unparsed #=GF lines  */
148
+  int     ngf;			/* number of unparsed #=GF lines        */
149
+  int     alloc_ngf;		/* number of gf lines alloc'ed          */
150
+
151
+  char  **gs_tag;               /* markup tags for unparsed #=GS lines     */
152
+  char ***gs;                   /* [0..ngs-1][0..nseq-1][free text] markup */
153
+  GKI    *gs_idx;               /* hash of #=GS tag types                  */
154
+  int     ngs;                  /* number of #=GS tag types                */
155
+  
156
+  char  **gc_tag;               /* markup tags for unparsed #=GC lines  */
157
+  char  **gc;                   /* [0..ngc-1][0..alen-1] markup         */
158
+  GKI    *gc_idx;               /* hash of #=GC tag types               */
159
+  int     ngc;                  /* number of #=GC tag types             */
160
+
161
+  char  **gr_tag;               /* markup tags for unparsed #=GR lines   */
162
+  char ***gr;                   /* [0..ngr][0..nseq-1][0..alen-1] markup */
163
+  GKI    *gr_idx;               /* hash of #=GR tag types                */
164
+  int     ngr;			/* number of #=GR tag types              */
165
+
166
+  /* Stuff we need for our own maintenance of the data structure
167
+   */
168
+  GKI   *index;		        /* name ->seqidx hash table */
169
+  int    nseqalloc;		/* number of seqs currently allocated for   */
170
+  int    nseqlump;		/* lump size for dynamic expansions of nseq */
171
+  int   *sqlen;                 /* individual sequence lengths during parsing */
172
+  int   *sslen;                 /* individual ss lengths during parsing       */
173
+  int   *salen;                 /* individual sa lengths during parsing       */
174
+  int    lastidx;		/* last index we saw; use for guessing next   */
175
+} MSA;
176
+#define MSA_SET_WGT     (1 << 0)  /* track whether wgts were set, or left at default 1.0 */
177
+
178
+                                     
179
+/* Structure: MSAFILE
180
+ * SRE, Tue May 18 11:36:54 1999
181
+ * 
182
+ * Defines an alignment file that's open for reading.
183
+ */
184
+typedef struct msafile_struct {
185
+  FILE *f;                      /* open file pointer                         */
186
+  char *fname;			/* name of file. used for diagnostic output  */
187
+  int   linenumber;		/* what line are we on in the file           */
188
+
189
+  char *buf;			/* buffer for line input w/ sre_fgets() */
190
+  int   buflen;			/* current allocated length for buf     */
191
+
192
+  SSIFILE *ssi;		        /* open SSI index file; or NULL, if none. */
193
+
194
+  int   do_gzip;		/* TRUE if f is a pipe from gzip -dc (need pclose(f))  */
195
+  int   do_stdin;		/* TRUE if f is stdin (don't close f, not our problem) */
196
+  int   format;			/* format of alignment file we're reading */
197
+} MSAFILE;
198
+
199
+
200
+/* Alignment file formats.
201
+ * Must coexist with sqio.c/squid.h unaligned file format codes.
202
+ * Rules:
203
+ *     - 0 is an unknown/unassigned format 
204
+ *     - <100 reserved for unaligned formats
205
+ *     - >100 reserved for aligned formats
206
+ */
207
+#define MSAFILE_UNKNOWN   0	/* unknown format                          */
208
+#define MSAFILE_STOCKHOLM 101	/* Pfam/HMMER's Stockholm format           */
209
+#define MSAFILE_SELEX	  102	/* Obsolete(!): old HMMER/SELEX format     */
210
+#define MSAFILE_MSF	  103	/* GCG MSF format                          */
211
+#define MSAFILE_CLUSTAL	  104	/* Clustal V/W format                      */
212
+#define MSAFILE_A2M	  105	/* aligned FASTA (A2M is UCSC terminology) */
213
+#define MSAFILE_PHYLIP    106	/* Felsenstein's PHYLIP format             */
214
+#define MSAFILE_EPS       107	/* Encapsulated PostScript (output only)   */
215
+#ifdef CLUSTALO
216
+#define MSAFILE_VIENNA    108	/* Vienna: concatenated fasta   */
217
+#endif
218
+
219
+#define IsAlignmentFormat(fmt)  ((fmt) > 100)
220
+
221
+
222
+/* from msa.c
223
+ */
224
+extern MSAFILE *MSAFileOpen(char *filename, int format, char *env);
225
+extern MSA     *MSAFileRead(MSAFILE *afp);
226
+extern void     MSAFileClose(MSAFILE *afp);
227
+extern void     MSAFree(MSA *msa);
228
+#ifdef CLUSTALO
229
+extern void     MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline, int iWrap, int bResno);
230
+#else
231
+extern void     MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline);
232
+#endif
233
+
234
+extern int MSAFileRewind(MSAFILE *afp);
235
+extern int MSAFilePositionByKey(MSAFILE *afp, char *key);
236
+extern int MSAFilePositionByIndex(MSAFILE *afp, int idx);
237
+
238
+extern int   MSAFileFormat(MSAFILE *afp);
239
+extern MSA  *MSAAlloc(int nseq, int alen);
240
+extern void  MSAExpand(MSA *msa);
241
+extern char *MSAFileGetLine(MSAFILE *afp);
242
+extern void  MSASetSeqAccession(MSA *msa, int seqidx, char *acc);
243
+extern void  MSASetSeqDescription(MSA *msa, int seqidx, char *desc);
244
+extern void  MSAAddComment(MSA *msa, char *s);
245
+extern void  MSAAddGF(MSA *msa, char *tag, char *value);
246
+extern void  MSAAddGS(MSA *msa, char *tag, int seqidx, char *value);
247
+extern void  MSAAppendGC(MSA *msa, char *tag, char *value);
248
+extern char *MSAGetGC(MSA *msa, char *tag);
249
+extern void  MSAAppendGR(MSA *msa, char *tag, int seqidx, char *value);
250
+extern void  MSAVerifyParse(MSA *msa);
251
+extern int   MSAGetSeqidx(MSA *msa, char *name, int guess);
252
+
253
+extern MSA  *MSAFromAINFO(char **aseq, AINFO *ainfo);   
254
+
255
+extern void  MSAMingap(MSA *msa);
256
+extern void  MSANogap(MSA *msa);
257
+extern void  MSAShorterAlignment(MSA *msa, int *useme);
258
+extern void  MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new);
259
+
260
+extern char *MSAGetSeqAccession(MSA *msa, int idx);
261
+extern char *MSAGetSeqDescription(MSA *msa, int idx);
262
+extern char *MSAGetSeqSS(MSA *msa, int idx);
263
+extern char *MSAGetSeqSA(MSA *msa, int idx);
264
+
265
+extern float MSAAverageSequenceLength(MSA *msa);
266
+
267
+/* from a2m.c
268
+ */
269
+extern MSA  *ReadA2M(MSAFILE *afp);
270
+#ifdef CLUSTALO
271
+//extern void  WriteA2M(FILE *fp, MSA *msa, int vienna);
272
+extern void  WriteA2M(FILE *fp, MSA *msa, int iWrap);
273
+#else
274
+extern void  WriteA2M(FILE *fp, MSA *msa);
275
+#endif
276
+/* from clustal.c
277
+ */
278
+extern MSA  *ReadClustal(MSAFILE *afp);
279
+#ifdef CLUSTALO
280
+extern void  WriteClustal(FILE *fp, MSA *msa, int iWrap, int bResno);
281
+extern void  WriteClustalForR(MSA *msa, int iWrap, int bResno, int *result_c, char ***result_v);
282
+#else
283
+extern void  WriteClustal(FILE *fp, MSA *msa);
284
+extern void  WriteClustalForR(MSA *msa, int *result_c, char ***result_v);
285
+#endif
286
+
287
+/* from eps.c
288
+ */
289
+extern void EPSWriteSmallMSA(FILE *fp, MSA *msa);
290
+
291
+/* from msf.c
292
+ */
293
+extern MSA  *ReadMSF(MSAFILE *afp);
294
+extern void  WriteMSF(FILE *fp, MSA *msa);
295
+
296
+/* from phylip.c
297
+ */
298
+extern MSA  *ReadPhylip(MSAFILE *afp);
299
+extern void  WritePhylip(FILE *fp, MSA *msa);
300
+
301
+/* from selex.c
302
+ */
303
+extern MSA  *ReadSELEX(MSAFILE *afp);
304
+extern void  WriteSELEX(FILE *fp, MSA *msa);
305
+extern void  WriteSELEXOneBlock(FILE *fp, MSA *msa);
306
+
307
+/* from stockholm.c
308
+ */
309
+extern MSA  *ReadStockholm(MSAFILE *afp);
310
+extern void  WriteStockholm(FILE *fp, MSA *msa);
311
+extern void  WriteStockholmOneBlock(FILE *fp, MSA *msa);
312
+
313
+#endif /*SQUID_MSA_INCLUDED*/