Browse code

add package to the repository

msa


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358

Sonali Arora authored on 10/04/2015 00:12:33
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,247 @@
1
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
+
3
+/*********************************************************************
4
+ * Clustal Omega - Multiple sequence alignment
5
+ *
6
+ * Copyright (C) 2010 University College Dublin
7
+ *
8
+ * Clustal-Omega is free software; you can redistribute it and/or
9
+ * modify it under the terms of the GNU General Public License as
10
+ * published by the Free Software Foundation; either version 2 of the
11
+ * License, or (at your option) any later version.
12
+ *
13
+ * This file is part of Clustal-Omega.
14
+ *
15
+ ********************************************************************/
16
+
17
+/*
18
+ * RCS $Id: hhdecl-C.h 227 2011-03-28 17:03:09Z fabian $
19
+ */
20
+
21
+/////////////////////////////////////////////////////////////////////////////////////
22
+//// Constants
23
+/////////////////////////////////////////////////////////////////////////////////////
24
+
25
+const char VERSION_AND_DATE[]="version 1.5.1.3 (November 2008)";
26
+const char REFERENCE[]="Soding, J. Protein homology detection by HMM-HMM comparison. Bioinformatics 2005, 21, 951-960.\n";
27
+const char COPYRIGHT[]="(C) Johannes Soeding (see LICENSE file)\n";
28
+const int MAXSEQ=65535; //max number of sequences in input alignment (must be <~30000 on cluster nodes)
29
+#if 0
30
+const int MAXCOL=32765; //max number of residues in input files; must be <= LINELEN and >= MAXRES
31
+const int MAXRES=15002; //max number of columns in HMM; must be <= LINELEN
32
+#else
33
+const int MAXCOL=2/*131072*/; //max number of residues in input files; must be <= LINELEN and >= MAXRES
34
+const int MAXRES=1/*65536*/; //max number of columns in HMM; must be <= LINELEN
35
+#endif
36
+const int LINELEN=262144; //max length of line read in from input files; must be >= MAXCOL 
37
+const int MAXSEQDIS=3; //10238;//max number of sequences stored in 'hit' objects and displayed in output alignment 
38
+const int IDLEN=255;     //max length of scop hierarchy id and pdb-id
39
+const int DESCLEN=32765;//max length of sequence description (longname)
40
+const int NAMELEN=511;  //max length of file names etc.
41
+const int MAXOPT=127;   //Maximum number of options to be read in from .hhconfig or command line
42
+const int NAA=20;       //number of amino acids (0-19)
43
+const int NTRANS=10;    //number of transitions recorded in HMM (M2M,M2I,M2D,I2M,I2I,D2M,D2D,M2M_GAPOPEN,GAPOPEN,GAPEXTD)
44
+const int NCOLMIN=10;   //min number of cols in subalignment for calculating pos-specific weights w[k][i]
45
+const int ANY=20;       //number representing an X (any amino acid) internally
46
+const int GAP=21;       //number representing a gap internally 
47
+const int ENDGAP=22;    //Important to distinguish because end gaps do not contribute to tansition counts 
48
+const int HMMSCALE=1000;//Scaling number for log2-values in HMMs
49
+const int NFAMMAX=5119; //Size of hash for counting number of HMMs in each family
50
+const int MAXPROF=32766;//Maximum number of HMM scores for fitting EVD
51
+const float MAXENDGAPFRAC=0.1; //For weighting: include only columns into subalignment i that have a max fraction of seqs with endgap
52
+const float SMIN= 20.;  //Minimum score of hit needed to search for another repeat of same profile: p=exp(-(4-mu)/lamda)=0.01
53
+const float LAMDA=0.388; //lamda in score EVD used for -local mode in length correction: S = S-log(Lq*Lt)/LAMDA) 
54
+const float LAMDA_GLOB=0.42; //lamda in score EVD used for -global mode
55
+const float PMAX=1E-2;  //Maximum single-repeat p-value that can contribute to whole-protein p-value
56
+const float MINEVALEXCL=0.5; //above this E-value from first ML fit hits are not used for final ML fit of EVD
57
+const int SELFEXCL=3;   // exclude self-alignments with j-i<SELFEXCL
58
+//is now in defined in hhhalign_wrapper.c -- const float PLTY_GAPOPEN=6.0f; // for -qsc option (filter for min similarity to query): 6 bits to open gap
59
+//is now in defined in hhhalign_wrapper.c -- const float PLTY_GAPEXTD=1.0f; // for -qsc option (filter for min similarity to query): 1 bit to extend gap
60
+const int MINCOLS_REALIGN=6; // hits with MAC alignments with fewer matched columns will be deleted in hhsearch hitlist
61
+
62
+enum transitions {M2M,M2I,M2D,I2M,I2I,D2M,D2D,M2M_GAPOPEN,GAPOPEN,GAPEXTD}; // index for transitions within a HMM
63
+enum pair_states {STOP=0,SAME=1,GD=2,IM=3,DG=4,MI=5,MS=6,ML=7,SM=8,LM=9,MM=10}; 
64
+
65
+// const char aa[]="ARNDCQEGHILKMFPSTWYVX-";
66
+//Amino acids Sorted by alphabet     -> internal numbers a 
67
+//                0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
68
+//                A  C  D  E  F  G  H  I  K  L  M  N  P  Q  R  S  T  V  W  Y  X
69
+const int s2a[]={ 0, 4, 3, 6,13, 7, 8, 9,11,10,12, 2,14, 5, 1,15,16,19,17,18,20};
70
+//Internal numbers a for amino acids -> amino acids Sorted by alphabet: 
71
+//                0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
72
+//                A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  X
73
+const int a2s[]={ 0,14,11, 2, 1,13, 3, 5, 6, 7, 9, 8,10, 4,12,15,16,18,19,17,20};
74
+
75
+// Secondary structure
76
+const int NDSSP=8;      //number of different ss states determined by dssp: 0-7 (0: no state available)
77
+const int NSSPRED=4;    //number of different ss states predicted by psipred: 0-3 (0: no prediction availabe)
78
+const int MAXCF=11;     //number of different confidence values: 0-10 (0: no prediction availabe)
79
+const int NSA=7;        //number of classes relative solvent accesiblity (0:no coord,  1:<2%, 2:<14%, 3:<33%, 4:<55%, 5:>55%, 6:S-S bridge)
80
+ 
81
+/////////////////////////////////////////////////////////////////////////////////////
82
+/////////////////////////////////////////////////////////////////////////////////////
83
+
84
+// Input parameters
85
+class Parameters          // Parameters for gap penalties and pseudocounts
86
+{
87
+public:
88
+  char** argv;            //command line parameters
89
+  char argc;              //dimension of argv
90
+
91
+  char infile[NAMELEN];   // input filename
92
+  char outfile[NAMELEN];  // output filename
93
+  char pairwisealisfile[NAMELEN]; // output filename with pairwise alignments
94
+  char alnfile[NAMELEN];  // name of output alignment file in A3M format (for iterative search)
95
+  char hhmfile[NAMELEN];  // name of output HHM file for (iterative search)
96
+  char psifile[NAMELEN];  // name of output alignmen file in PSI-BLAST format (iterative search)
97
+  char scorefile[NAMELEN];// table of scores etc for all HMMs in searched database
98
+  char tfile[NAMELEN];    // template filename (in hhalign)
99
+  char buffer[NAMELEN];   // buffer to write results for other programs into
100
+  char pngfile[NAMELEN];  // png image file for dotplot
101
+  char wfile[NAMELEN];    // weights file generated with hhformat
102
+  char* blafile;          // output of 'blastpgp -m 8' with PSI-BLAST E-values for HHblast  
103
+  char* dbfiles;          // database filenames, separated by colons
104
+  char* exclstr;          // optional string containing list of excluded residues, e.g. '1-33,97-168'
105
+  int aliwidth;           // number of characters per line in output alignments for HMM search
106
+  char append;            // append to output file? (hhmake)
107
+  float p;                // minimum probability for inclusion in hit list and alignments
108
+  float E;                // maximum E-value for inclusion in hit list and alignment list
109
+  float e;                // maximum E-value for inclusion in output alignment, output HMM, and PSI-BLAST checkpoint model
110
+  int Z;                  // max number of lines in hit list
111
+  int z;                  // min number of lines in hit list
112
+  int B;                  // max number of lines in alignment list
113
+  int b;                  // min number of lines in alignment list
114
+  int showcons;           // 0: don't show consensus sequence in alignments  1:show
115
+  int showdssp;           // 0: don't show consensus sequence in alignments  1:show
116
+  int showpred;           // 0: don't show consensus sequence in alignments  1:show
117
+  int nseqdis;          // maximum number of query or template sequences in output alignments
118
+  char cons;              // if set to 1, include consensus as first representative sequence of HMM
119
+  char mark;              // which sequences to mark for display in output alignments? 0: auto; 1:all
120
+  char outformat;         // 0: hhr  1: FASTA  2:A2M   3:A3M 
121
+  char mode;              // 
122
+                          //0:MAC alignment, master-slave  1:MAC blending, master-slave  2:MAC alignment, combining
123
+
124
+  int max_seqid;          // Maximum sequence identity with all other sequences in alignment
125
+  int qid;                // Minimum sequence identity with query sequence (sequence 0)
126
+  float qsc;              // Minimum score per column with query sequence (sequence 0)
127
+  int coverage;           // Minimum coverage threshold
128
+  int Ndiff;              // Pick Ndiff most different sequences that passed the other filter thresholds
129
+  int coverage_core;      // Minimum coverage for sequences in core alignment
130
+  float qsc_core;         // Minimum sequence identity with query for sequences in core alignment
131
+  float coresc;           // Minimum score per column with core alignment (HMM)
132
+	
133
+  int maxResLen;          /* length of longest sequence/profile, FS 2010-11-05 */
134
+  int maxColCnt;          /* maximum number of columns in HMM, FS 2010-11-05 */
135
+
136
+  int Mgaps;              // Maximum percentage of gaps for match states
137
+  int M;                  // Match state assignment by  1:upper/lower case  2:percentage rule  3:marked sequence
138
+  char matrix;            // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50
139
+     
140
+  char wg;                // 0: use local sequence weights   1: use local ones
141
+  double *pdWg1;          /* seq weights 1st profile, derived from tree */
142
+  double *pdWg2;          /* seq weights 2nd profile, derived from tree */
143
+
144
+  char pcm;               // 0:no pseudocounts, 1:pos-specific pcs, 2:PSIBLAST pcs
145
+  /* pseudo-count parameters for MAC*/
146
+  float pca;              // Pseudocount matrix = (1-tau(i))*I + tau(i)*S
147
+  float pcb;              // tau(i) = pca/(1 + ((Neff-1)/pcb)^pcc
148
+  float pcc;              // 
149
+  float pcw;              // Decrease pseudocounts for conserved columns 
150
+		     
151
+  /* gap parameters for MAC*/
152
+  float gapb;             // Diversity threshold for adding pseudocounts to transitions from M state
153
+  float gapd;             // Gap open penalty factor for deletions
154
+  float gape;             // Gap extend penalty: factor to multiply hmmer values (def=1)
155
+  float gapf;             // factor for increasing/reducing the gap opening penalty for deletes
156
+  float gapg;             // factor for increasing/reducing the gap opening penalty for inserts
157
+  float gaph;             // factor for increasing/reducing the gap extension penalty for deletes
158
+  float gapi;             // factor for increasing/reducing the gap extension penalty for inserts
159
+  		      
160
+  /* pseudo-count parameters for Viterbi, FS, r226->r227 */
161
+  float pcaV;              // Pseudocount matrix = (1-tau(i))*I + tau(i)*S
162
+  float pcbV;              // tau(i) = pca/(1 + ((Neff-1)/pcb)^pcc
163
+  float pccV;              // 
164
+  float pcwV;              // Decrease pseudocounts for conserved columns 
165
+		     
166
+  /* gap parameters for Viterbi, FS, r226->r227 */
167
+  float gapbV;             // Diversity threshold for adding pseudocounts to transitions from M state
168
+  float gapdV;             // Gap open penalty factor for deletions
169
+  float gapeV;             // Gap extend penalty: factor to multiply hmmer values (def=1)
170
+  float gapfV;             // factor for increasing/reducing the gap opening penalty for deletes
171
+  float gapgV;             // factor for increasing/reducing the gap opening penalty for inserts
172
+  float gaphV;             // factor for increasing/reducing the gap extension penalty for deletes
173
+  float gapiV;             // factor for increasing/reducing the gap extension penalty for inserts
174
+  		      
175
+  float egq;              // penalty for end gaps when query not fully covered
176
+  float egt;              // penalty for end gaps when template not fully covered 
177
+
178
+  float neffa;            // Coefficients to estimate Neff-dependent weights for HMM merging procedure
179
+  float neffb;            // Coefficients to estimate Neff-dependent weights for HMM merging procedure
180
+		      
181
+  char ssgap;             // 1: add secondary structure-dependent gap penalties  0:off
182
+  float ssgapd;           // secondary structure-dependent gap-opening penalty (per residue)
183
+  float ssgape;           // secondary structure-dependent gap-extension penalty (per residue)
184
+  char ssgapi;            // max. number of inside-integer(ii); gap-open-penalty= -ii*ssgapd
185
+		      
186
+  char ssm;               // SS comparison mode: 0:no ss scoring  1:ss scoring AFTER alignment  2:ss score in column score
187
+  float ssw;              // SS weight as compared to column score
188
+  float ssa;              // SS state evolution matrix M1 = (1-ssa)*I + ssa*M0
189
+		      
190
+  char loc;               // 0: local alignment (wrt. query), 1: global alignement 
191
+  char forward;           // 0:Viterbi algorithm  1:Forward algorithm  2: MAC
192
+  char realign;           // realign database hits to be displayed with MAC algorithm
193
+  char altali;            // find up to this many possibly overlapping alignments
194
+  int columnscore;        // 0: no aa comp corr  1: 1/2(qav+tav) 2: template av freqs 3: query av freqs 4:...
195
+  float corr;             // Weight of correlations between scores with |i-j|<=4
196
+  float shift;            // Score offset for match-match states
197
+  float mact;             // Score threshold (negative offset) in MAC alignment
198
+		       
199
+  char calibrate;         // calibration of query HMM?  0:no, 1:yes (write lamda,mu into query profile)
200
+  char calm;              // derive P-values from: 0:query calibration  1:template calibration  2:both
201
+  int opt;                // for optimization: compare only every opt'th negative; 0: mode off
202
+  int readdefaultsfile ;  // read defaults file ./.hhdefaults or HOME/.hhdefaults?
203
+  int min_overlap;        // all cells of dyn. programming matrix with L_T-j+i or L_Q-i+j < min_overlap will be ignored
204
+  int hitrank;            // rank of hit to be printed as a3m alignment
205
+  char notags;            // neutralize His-tags, FLAG tags, C-myc tags?
206
+  unsigned int maxdbstrlen; // maximum length of database string to be printed in 'Command' line of hhr file
207
+		       
208
+  char trans;             // 0: normal pairwise scoring; 1:transitive scoring 
209
+  float Emax_trans;       // max E-value for intermediate HMMs in transitive scoring (i.e. l is intermediate HMM if E_lq, E_lk <Emax_trans)
210
+  float wtrans;           // Ztot[k] = Zq[k] + wtrans * (Zforward[k]+Zreverse[k])
211
+
212
+
213
+  // SCRAP THE FOLLOWING VARIABLES?
214
+
215
+  float wstruc;          // weight of structure scores
216
+  char repmode;          // 1:repeat identification: multiple hits not treated as independent 0: repeat mode off
217
+
218
+  // ...
219
+  float gapOpening;  //instead of PTLY_GAPOPEN
220
+  float gapExtension; //instead of PTLY_GAPEXTD
221
+
222
+  int idummy;
223
+  int jdummy;
224
+  float fdummy;      
225
+};
226
+
227
+/////////////////////////////////////////////////////////////////////////////////////
228
+//// Global variable declarations
229
+/////////////////////////////////////////////////////////////////////////////////////
230
+
231
+char v=1;             // 1: show only warnings 2:verbose mode  
232
+Parameters par;
233
+char program_name[NAMELEN]; //name of program executed (e.g. hhmake of hhsearch)
234
+
235
+// substitution matrix flavours
236
+float P[21][21];      // P[a][b] = combined probability for a aligned to b
237
+float R[21][21];      // R[a][b]=P[a][b]/p[b]=P(a|b); precalculated for pseudocounts
238
+float Sim[21][21];    // Similarity matrix Sim[a][b]: how similar are a and b?
239
+float S[21][21];      // Substitution score matrix S[a][b] = log2(Pab/pa/pb)
240
+float pb[21];         // pb[a] = background amino acid probabilities for chosen substitution matrix
241
+float qav[21];        // qav[a] = background amino acid probabilities for query HMM (needed for rate matrix rescaling)
242
+
243
+// secondary structure matrices
244
+float S73[NDSSP][NSSPRED][MAXCF];           // P[A][B][cf]       =  log2 P(A,B,cf)/P(A)/P(B,cf)
245
+float S33[NSSPRED][MAXCF][NSSPRED][MAXCF];  // P[B][cf][B'][cf'] =  log2 sum_B' P(A,B',cf)/P(A)/P(B,cf) * P_b(B'|B)
246
+// float S77[NDSSP][DSSP];                  // P[A][B]           =  log2 P(A,B)/P(A)/P(B)
247
+