msa
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,192 @@ |
1 |
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
|
2 |
+ |
|
3 |
+/********************************************************************* |
|
4 |
+ * Clustal Omega - Multiple sequence alignment |
|
5 |
+ * |
|
6 |
+ * Copyright (C) 2010 University College Dublin |
|
7 |
+ * |
|
8 |
+ * Clustal-Omega is free software; you can redistribute it and/or |
|
9 |
+ * modify it under the terms of the GNU General Public License as |
|
10 |
+ * published by the Free Software Foundation; either version 2 of the |
|
11 |
+ * License, or (at your option) any later version. |
|
12 |
+ * |
|
13 |
+ * This file is part of Clustal-Omega. |
|
14 |
+ * |
|
15 |
+ ********************************************************************/ |
|
16 |
+ |
|
17 |
+/* |
|
18 |
+ * RCS $Id: hhhit.h 243 2011-05-31 13:49:19Z fabian $ |
|
19 |
+ */ |
|
20 |
+ |
|
21 |
+// hhhit.h |
|
22 |
+ |
|
23 |
+////////////////////////////////////////////////////////////////////////////// |
|
24 |
+/* Describes an alignment of two profiles. |
|
25 |
+ Used as list element in Hits : List<Hit> */ |
|
26 |
+////////////////////////////////////////////////////////////////////////////// |
|
27 |
+class Hit |
|
28 |
+{ |
|
29 |
+ public: |
|
30 |
+ char* longname; // Name of HMM |
|
31 |
+ char* name; // One-word name of HMM |
|
32 |
+ char* file; // Basename (with path, without extension) of alignment file that was used to construct the HMM |
|
33 |
+ // (path from db-file is prepended to FILE record in HMM file!) |
|
34 |
+ char fam[IDLEN]; // family ID (derived from name) (FAM field) |
|
35 |
+ char sfam[IDLEN]; // superfamily ID (derived from name) |
|
36 |
+ char fold[IDLEN]; // fold ID (derived from name) |
|
37 |
+ char cl[IDLEN]; // class ID (derived from name) |
|
38 |
+ int index; // index of HMM in order of reading in (first=0) |
|
39 |
+ char* dbfile; // full database file name from which HMM was read |
|
40 |
+ long ftellpos; // start position of HMM in database file |
|
41 |
+ |
|
42 |
+ float score; // Score of alignment (i.e. of Viterbi path) |
|
43 |
+ float score_sort; // score to sort hits in output list (negative means first/best!) |
|
44 |
+ float score_aass; // first: just hit.score, then hit.logPval-SSSCORE2NATLOG*hit.score_ss;(negative means best!) |
|
45 |
+ float score_ss; // Part of score due to secondary structure |
|
46 |
+ float Pval; // P-value for whole protein based on score distribution of query |
|
47 |
+ float Pvalt; // P-value for whole protein based on score distribution of template |
|
48 |
+ float logPval; // natural logarithm of Pval |
|
49 |
+ float logPvalt; // natural logarithm of Pvalt |
|
50 |
+ float Eval; // E-value for whole protein |
|
51 |
+ float Probab; // probability in % for a positive (depends only on score) |
|
52 |
+ float weight; // weight of hit for P-value calculation (= 1/#HMMs-in-family/#families-in-superfamily) |
|
53 |
+ double Pforward; // scaled total forward probability : Pforward * Product_{i=1}^{Lq+1}(scale[i]) |
|
54 |
+ |
|
55 |
+/* float score_comp; // compositional similarity score */ |
|
56 |
+/* float logPcomp; // natural logarithm of Pvalue for compositional similarity score */ |
|
57 |
+/* float Prep; // P-value for single-repeat hit */ |
|
58 |
+/* float Erep; // E-value for single-repeat hit */ |
|
59 |
+/* float logPrep; // natural logarithm of P-value for single-repeat hit */ |
|
60 |
+ float E1val; // E-value for whole protein from transitive scoring |
|
61 |
+ float logP1val; // natural logarithm of P1val, the transitive P-value |
|
62 |
+ |
|
63 |
+ int L; // Number of match states in template |
|
64 |
+ int irep; // Index of single-repeat hit (1: highest scoring repeat hit) |
|
65 |
+ int nrep; // Number of single-repeat hits with one template |
|
66 |
+ |
|
67 |
+ int n_display; // number of sequences stored for display of alignment |
|
68 |
+ char** sname; // names of stored sequences |
|
69 |
+ char** seq; // residues of stored sequences (first at pos 1) |
|
70 |
+ int nss_dssp; // index of dssp secondary structure sequence in seq[] |
|
71 |
+ int nsa_dssp; // index of of dssp solvent accessibility in seq[] |
|
72 |
+ int nss_pred; // index of dssp secondary structure sequence in seq[] |
|
73 |
+ int nss_conf; // index of dssp secondary structure sequence in seq[] |
|
74 |
+ int nfirst; // index of query sequence in seq[] |
|
75 |
+ int ncons; // index of consensus sequence |
|
76 |
+ |
|
77 |
+ int nsteps; // index for last step in Viterbi path; (first=1) |
|
78 |
+ int* i; // i[step] = query match state at step of Viterbi path |
|
79 |
+ int* j; // j[step] = template match state at step of Viterbi path |
|
80 |
+ char* states; // state at step of Viterbi path 0: Start 1: M(MM) 2: A(-D) 3: B(IM) 4: C(D-) 5 D(MI) |
|
81 |
+ float* S; // S[step] = match-match score contribution at alignment step |
|
82 |
+ float* S_ss; // S_ss[step] = secondary structure score contribution |
|
83 |
+ float* P_posterior; // P_posterior[step] = posterior prob for MM states (otherwise zero) |
|
84 |
+ char* Xcons; // consensus sequence for aligned states in internal representation (A=0 R=1 N=2 D=3 ...) |
|
85 |
+ int i1; // First aligned residue in query |
|
86 |
+ int i2; // Last aligned residue in query |
|
87 |
+ int j1; // First aligned residue in template |
|
88 |
+ int j2; // Last aligned residue in template |
|
89 |
+ int matched_cols; // number of matched columns in alignment against query |
|
90 |
+ int ssm1; // SS scoring AFTER alignment? 0:no 1:yes; t->dssp q->psipred 2:yes; q->dssp t->psipred |
|
91 |
+ int ssm2; // SS scoring DURING alignment? 0:no 1:yes; t->dssp q->psipred 2:yes; q->dssp t->psipred |
|
92 |
+ char self; // 0: align two different HMMs 1: align HMM with itself |
|
93 |
+ int min_overlap; // Minimum overlap between query and template |
|
94 |
+ float sum_of_probs; // sum of probabilities for Maximum ACcuracy alignment (if dssp states defined, only aligned pairs with defined dssp state contribute to sum) |
|
95 |
+ float Neff_HMM; // Diversity of underlying alignment |
|
96 |
+ |
|
97 |
+ // Constructor (only set pointers to NULL) |
|
98 |
+ Hit(); |
|
99 |
+ ~Hit(){}; |
|
100 |
+ |
|
101 |
+ // Free all allocated memory (to delete list of hits) |
|
102 |
+ void Delete(); |
|
103 |
+ |
|
104 |
+ // Allocate/delete memory for dynamic programming matrix |
|
105 |
+ void AllocateBacktraceMatrix(int Nq, int Nt); |
|
106 |
+ void DeleteBacktraceMatrix(int Nq); |
|
107 |
+ void AllocateForwardMatrix(int Nq, int Nt); |
|
108 |
+ void DeleteForwardMatrix(int Nq); |
|
109 |
+ void AllocateBackwardMatrix(int Nq, int Nt); |
|
110 |
+ void DeleteBackwardMatrix(int Nq); |
|
111 |
+ |
|
112 |
+ // Compare an HMM with overlapping subalignments |
|
113 |
+ void Viterbi(HMM& q, HMM& t, float** Sstruc=NULL); |
|
114 |
+ |
|
115 |
+ // Compare two HMMs with each other in lin space |
|
116 |
+ int Forward(HMM& q, HMM& t, float** Pstruc=NULL); |
|
117 |
+ |
|
118 |
+ // Compare two HMMs with each other in lin space |
|
119 |
+ int Backward(HMM& q, HMM& t); |
|
120 |
+ |
|
121 |
+ // Find maximum accuracy alignment (after running Forward and Backward algorithms) |
|
122 |
+ void MACAlignment(HMM& q, HMM& t); |
|
123 |
+ |
|
124 |
+ // Trace back alignment of two profiles based on matrices bXX[][] |
|
125 |
+ void Backtrace(HMM& q, HMM& t); |
|
126 |
+ |
|
127 |
+ // Trace back alignment of two profiles based on matrices bXX[][] |
|
128 |
+ void StochasticBacktrace(HMM& q, HMM& t, char maximize=0); |
|
129 |
+ |
|
130 |
+ // Trace back MAC alignment of two profiles based on matrix bMM[][] |
|
131 |
+ void BacktraceMAC(HMM& q, HMM& t); |
|
132 |
+ |
|
133 |
+ // Calculate secondary structure score between columns i and j of two HMMs (query and template) |
|
134 |
+ inline float ScoreSS(HMM& q, HMM& t, int i, int j, int ssm); |
|
135 |
+ |
|
136 |
+ // Calculate secondary structure score between columns i and j of two HMMs (query and template) |
|
137 |
+ inline float ScoreSS(HMM& q, HMM& t, int i, int j); |
|
138 |
+ |
|
139 |
+ // Calculate total score (including secondary structure score and compositional bias correction |
|
140 |
+ inline float ScoreTot(HMM& q, HMM& t, int i, int j); |
|
141 |
+ |
|
142 |
+ // Calculate score (excluding secondary structure score and compositional bias correction |
|
143 |
+ inline float ScoreAA(HMM& q, HMM& t, int i, int j); |
|
144 |
+ |
|
145 |
+ // Comparison (used to sort list of hits) |
|
146 |
+ int operator<(const Hit& hit2) {return score_sort<hit2.score_sort;} |
|
147 |
+ |
|
148 |
+ // Merge HMM with next aligned HMM |
|
149 |
+ void MergeHMM(HMM& Q, HMM& t, float wk[]); |
|
150 |
+ |
|
151 |
+#ifdef CLUSTALO |
|
152 |
+ void ClobberGlobal(void); |
|
153 |
+#endif |
|
154 |
+ |
|
155 |
+ |
|
156 |
+ double** B_MM; // Backward matrices |
|
157 |
+ |
|
158 |
+private: |
|
159 |
+ char state; // 0: Start/STOP state 1: MM state 2: GD state (-D) 3: IM state 4: DG state (D-) 5 MI state |
|
160 |
+ char** bMM; // (backtracing) bMM[i][j] = STOP:start of alignment MM:prev was MM GD:prev was GD etc |
|
161 |
+ char** bGD; // (backtracing) bMM[i][j] = STOP:start of alignment MM:prev was MM SAME:prev was GD |
|
162 |
+ char** bDG; // (backtracing) |
|
163 |
+ char** bIM; // (backtracing) |
|
164 |
+ char** bMI; // (backtracing) |
|
165 |
+ char** cell_off; // cell_off[i][j]=1 means this cell will get score -infinity |
|
166 |
+ |
|
167 |
+ double** F_MM; // Forward matrices |
|
168 |
+ double** F_GD; // F_XY[i][j] * Prod_1^i(scale[i]) |
|
169 |
+ double** F_DG; // = Sum_x1..xl{ P(HMMs aligned up to Xi||Yj co-emmitted x1..xl ) / (Prod_k=1^l f(x_k)) } |
|
170 |
+ double** F_IM; // end gaps are not penalized! |
|
171 |
+ double** F_MI; // |
|
172 |
+ double* scale; // |
|
173 |
+ |
|
174 |
+ double** B_GD; // B_XY[i][j] * Prod_i+1^(L+1) (scale[i]) |
|
175 |
+ double** B_DG; // = Sum_x2..xl{ P(HMMs aligned from Xi||Yj to end co-emmitted x2..xl ) / (Prod_k=2^l f(x_k)) } |
|
176 |
+ double** B_IM; // end gaps are not penalized! |
|
177 |
+ double** B_MI; // |
|
178 |
+ |
|
179 |
+ void InitializeBacktrace(HMM& q, HMM& t); |
|
180 |
+ void InitializeForAlignment(HMM& q, HMM& t); |
|
181 |
+}; |
|
182 |
+ |
|
183 |
+ |
|
184 |
+double Pvalue(double x, double a[]); |
|
185 |
+double Pvalue(float x, float lamda, float mu); |
|
186 |
+double logPvalue(float x, float lamda, float mu); |
|
187 |
+double logPvalue(float x, double a[]); |
|
188 |
+double Probab(Hit& hit); |
|
189 |
+ |
|
190 |
+ |
|
191 |
+ |
|
192 |
+ |