git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102514 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -77,15 +77,15 @@ Alignment::Alignment(int maxseq, int maxres) |
77 | 77 |
|
78 | 78 |
//printf(">>>>>>>>%s:%s:%d: maxseq=%d, maxres=%d\n", __FUNCTION__, __FILE__, __LINE__, maxseq, maxres); /* (FS) */ |
79 | 79 |
longname = new(char[DESCLEN]); |
80 |
- sname = new(char*[maxseq+2]); /* MR1 */ |
|
81 |
- seq = new(char*[maxseq+2]); /* MR1 */ |
|
82 |
- l = new(int[maxres]); |
|
83 |
- X = new(char*[maxseq+2]); /* MR1 */ |
|
84 |
- I = new(short unsigned int*[maxseq+2]); /* MR1 */ |
|
85 |
- keep = new(char[maxseq+2]); /* MR1 */ |
|
86 |
- display = new(char[maxseq+2]); /* MR1 */ |
|
87 |
- wg = new(float[maxseq+2]); /* MR1 */ |
|
88 |
- nseqs = new(int[maxres+2]); /* MR1 */ |
|
80 |
+ sname = new char*[maxseq+2]; /* MR1 */ |
|
81 |
+ seq = new char*[maxseq+2]; /* MR1 */ |
|
82 |
+ l = new int[maxres]; |
|
83 |
+ X = new char*[maxseq+2]; /* MR1 */ |
|
84 |
+ I = new short unsigned int*[maxseq+2]; /* MR1 */ |
|
85 |
+ keep = new char[maxseq+2]; /* MR1 */ |
|
86 |
+ display = new char[maxseq+2]; /* MR1 */ |
|
87 |
+ wg = new float[maxseq+2]; /* MR1 */ |
|
88 |
+ nseqs = new int[maxres+2]; /* MR1 */ |
|
89 | 89 |
N_in=L=0; |
90 | 90 |
nres=NULL; // number of residues per sequence k |
91 | 91 |
first=NULL; // first residue in sequence k |
... | ... |
@@ -138,7 +138,7 @@ Alignment::Read(FILE* inf, char infile[], char* firstline) |
138 | 138 |
int k; // Index of sequence being read currently (first=0) |
139 | 139 |
char line[LINELEN]=""; // input line |
140 | 140 |
//char cur_seq[MAXCOL]; // Sequence currently read in |
141 |
- char *cur_seq=new(char[par.maxColCnt]); |
|
141 |
+ char *cur_seq=new char[par.maxColCnt]; |
|
142 | 142 |
char* cur_name; // Sequence currently read in |
143 | 143 |
int linenr=0; // current line number in input file |
144 | 144 |
char skip_sequence=0; |
... | ... |
@@ -180,11 +180,11 @@ Alignment::Read(FILE* inf, char infile[], char* firstline) |
180 | 180 |
} |
181 | 181 |
|
182 | 182 |
// Create space for residues and paste new sequence in |
183 |
- seq[k]=new(char[strlen(cur_seq)+2]); |
|
183 |
+ seq[k]=new char[strlen(cur_seq)+2]; |
|
184 | 184 |
if (!seq[k]) MemoryError("array for input sequences"); |
185 |
- X[k]=new(char[strlen(cur_seq)+2]); |
|
185 |
+ X[k]=new char[strlen(cur_seq)+2]; |
|
186 | 186 |
if (!X[k]) MemoryError("array for input sequences"); |
187 |
- I[k]=new(short unsigned int[strlen(cur_seq)+2]); |
|
187 |
+ I[k]=new short unsigned int[strlen(cur_seq)+2]; |
|
188 | 188 |
if (!I[k]) MemoryError("array for input sequences"); |
189 | 189 |
strcpy(seq[k],cur_seq); |
190 | 190 |
} |
... | ... |
@@ -233,7 +233,7 @@ Alignment::Read(FILE* inf, char infile[], char* firstline) |
233 | 233 |
|
234 | 234 |
// store sequence name |
235 | 235 |
if (v>=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]); |
236 |
- sname[k] = new(char[strlen(cur_name)+1]); |
|
236 |
+ sname[k] = new char[strlen(cur_name)+1]; |
|
237 | 237 |
if (!sname[k]) {MemoryError("array for sequence names");} |
238 | 238 |
strcpy(sname[k],cur_name); |
239 | 239 |
} // end if(line contains sequence name) |
... | ... |
@@ -348,11 +348,11 @@ Alignment::Read(FILE* inf, char infile[], char* firstline) |
348 | 348 |
|
349 | 349 |
if (k>=0) //if at least one sequence was read in |
350 | 350 |
{ |
351 |
- seq[k]=new(char[strlen(cur_seq)+2]); |
|
351 |
+ seq[k]=new char[strlen(cur_seq)+2]; |
|
352 | 352 |
if (!seq[k]) MemoryError("array for input sequences"); |
353 |
- X[k]=new(char[strlen(cur_seq)+2]); |
|
353 |
+ X[k]=new char[strlen(cur_seq)+2]; |
|
354 | 354 |
if (!X[k]) MemoryError("array for input sequences"); |
355 |
- I[k]=new(short unsigned int[strlen(cur_seq)+2]); |
|
355 |
+ I[k]=new short unsigned int[strlen(cur_seq)+2]; |
|
356 | 356 |
if (!I[k]) MemoryError("array for input sequences"); |
357 | 357 |
strcpy(seq[k],cur_seq); |
358 | 358 |
} |
... | ... |
@@ -420,7 +420,7 @@ Alignment::Compress(const char infile[]) |
420 | 420 |
/*static short unsigned int h[MAXSEQ];*/ |
421 | 421 |
/*short*/ unsigned int *h = NULL; /* short may lead to overflow for long alignments, FS, r235 -> r236 */ |
422 | 422 |
|
423 |
- h = new(/*short*/ unsigned int[N_in+2]); /* short -> overflow, FS, r235 -> r236 */ |
|
423 |
+ h = new /*short*/ unsigned int[N_in+2]; /* short -> overflow, FS, r235 -> r236 */ |
|
424 | 424 |
float *percent_gaps = NULL; /* FS, 2010-Nov */ |
425 | 425 |
char *match_state = NULL; /* FS, 2010-Nov */ |
426 | 426 |
|
... | ... |
@@ -552,13 +552,13 @@ Alignment::Compress(const char infile[]) |
552 | 552 |
had to move declaration of float *percent_gaps out of switch() |
553 | 553 |
*/ |
554 | 554 |
//float percent_gaps[MAXCOL]; //percentage of gaps in column k (with weighted sequences) |
555 |
- percent_gaps = new(float[par.maxColCnt]); |
|
555 |
+ percent_gaps = new float[par.maxColCnt]; |
|
556 | 556 |
|
557 | 557 |
//determine number of columns L in alignment |
558 | 558 |
L=strlen(seq[kfirst])-1; |
559 | 559 |
|
560 | 560 |
// Conversion to integer representation, checking for unequal lengths and initialization |
561 |
- if (nres==NULL) nres=new(int[N_in]); |
|
561 |
+ if (nres==NULL) nres=new int[N_in]; |
|
562 | 562 |
for (k=0; k<N_in; k++) |
563 | 563 |
{ |
564 | 564 |
if (!keep[k]) continue; |
... | ... |
@@ -780,7 +780,7 @@ Alignment::Compress(const char infile[]) |
780 | 780 |
had to move declaration of float *percent_gaps out of switch() |
781 | 781 |
*/ |
782 | 782 |
//char match_state[MAXCOL]; //1: column assigned to match state 0: insert state |
783 |
- match_state = new(char[par.maxColCnt]); |
|
783 |
+ match_state = new char[par.maxColCnt]; |
|
784 | 784 |
|
785 | 785 |
// Determine number of columns L in alignment |
786 | 786 |
L=strlen(seq[0]+1); |
... | ... |
@@ -942,7 +942,7 @@ Alignment::FilterForDisplay(int max_seqid, int coverage, int qid, float qsc, int |
942 | 942 |
|
943 | 943 |
|
944 | 944 |
if (par.mark) return n_display; |
945 |
- char *dummy = new(char[N_in+1]); |
|
945 |
+ char *dummy = new char[N_in+1]; |
|
946 | 946 |
int vtmp=v, seqid; |
947 | 947 |
v=0; |
948 | 948 |
n_display=0; |
... | ... |
@@ -1005,12 +1005,12 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in |
1005 | 1005 |
{ |
1006 | 1006 |
// In the beginnning, keep[k] is 1 for all regular amino acid sequences and 0 for all others (ss_conf, ss_pred,...) |
1007 | 1007 |
// In the end, keep[k] will be 1 for all regular representative sequences kept in the alignment, 0 for all others |
1008 |
- char* in=new(char[N_in+1]); // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid |
|
1009 |
- char* inkk=new(char[N_in+1]); // inkk[k]=1 iff in[ksort[k]]=1 else 0; |
|
1010 |
- int* Nmax=new(int[L+2]); // position-dependent maximum-sequence-identity threshold for filtering /* MR1, used to be called idmax*/ |
|
1011 |
- int* idmaxwin=new(int[L+2]); // minimum value of idmax[i-WFIL,i+WFIL] |
|
1012 |
- int* seqid_prev=new(int[N_in+1]); // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid) |
|
1013 |
- int* N=new(int[L+2]); // N[i] number of already accepted sequences at position i |
|
1008 |
+ char* in=new char[N_in+1]; // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid |
|
1009 |
+ char* inkk=new char[N_in+1]; // inkk[k]=1 iff in[ksort[k]]=1 else 0; |
|
1010 |
+ int* Nmax=new int[L+2]; // position-dependent maximum-sequence-identity threshold for filtering /* MR1, used to be called idmax*/ |
|
1011 |
+ int* idmaxwin=new int[L+2]; // minimum value of idmax[i-WFIL,i+WFIL] |
|
1012 |
+ int* seqid_prev=new int[N_in+1]; // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid) |
|
1013 |
+ int* N=new int[L+2]; // N[i] number of already accepted sequences at position i |
|
1014 | 1014 |
const int WFIL=25; // see previous line |
1015 | 1015 |
|
1016 | 1016 |
int diffNmax=Ndiff; // current maximum difference of Nmax[i] and Ndiff /* MR1 */ |
... | ... |
@@ -1039,8 +1039,8 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in |
1039 | 1039 |
// Determine first[k], last[k]? |
1040 | 1040 |
if (first==NULL) |
1041 | 1041 |
{ |
1042 |
- first=new(int[N_in]);// first non-gap position in sequence k |
|
1043 |
- last =new(int[N_in]);// last non-gap position in sequence k |
|
1042 |
+ first=new int[N_in];// first non-gap position in sequence k |
|
1043 |
+ last =new int[N_in];// last non-gap position in sequence k |
|
1044 | 1044 |
for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k]) |
1045 | 1045 |
{ |
1046 | 1046 |
for (i=1; i<=L; i++) if (X[k][i]<NAA) break; |
... | ... |
@@ -1054,7 +1054,7 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in |
1054 | 1054 |
if ( (nres==NULL) || (sizeof(nres)<N_in*sizeof(int)) ) |
1055 | 1055 |
{ |
1056 | 1056 |
if (nres) delete[] nres; |
1057 |
- nres=new(int[N_in]); |
|
1057 |
+ nres=new int[N_in]; |
|
1058 | 1058 |
for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k]) |
1059 | 1059 |
{ |
1060 | 1060 |
int nr=0; |
... | ... |
@@ -1068,7 +1068,7 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in |
1068 | 1068 |
// Sort sequences according to length; afterwards, nres[ksort[kk]] is sorted by size |
1069 | 1069 |
if (ksort==NULL) |
1070 | 1070 |
{ |
1071 |
- ksort=new(int[N_in]); // never reuse alignment object for new alignment with more sequences |
|
1071 |
+ ksort=new int[N_in]; // never reuse alignment object for new alignment with more sequences |
|
1072 | 1072 |
for (k=0; k<N_in; k++) ksort[k]=k; |
1073 | 1073 |
QSortInt(nres,ksort,kfirst+1,N_in-1,-1); //Sort sequences after kfirst (query) in descending order |
1074 | 1074 |
} |
... | ... |
@@ -1311,7 +1311,7 @@ Alignment::HomologyFilter(int coverage_core, float qsc_core, float coresc) |
1311 | 1311 |
const int Ndiff_core=0; |
1312 | 1312 |
int n; |
1313 | 1313 |
HMM qcore; |
1314 |
- char* coreseq=new(char[N_in]); // coreseq[k]=1 if sequence belongs to core of alignment (i.e. it is very similar to query) |
|
1314 |
+ char* coreseq=new char[N_in]; // coreseq[k]=1 if sequence belongs to core of alignment (i.e. it is very similar to query) |
|
1315 | 1315 |
for (int k=0; k<N_in; k++) coreseq[k]=keep[k]; // Copy keep[] into coreseq[] |
1316 | 1316 |
|
1317 | 1317 |
// Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) |
... | ... |
@@ -1362,7 +1362,7 @@ Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore) |
1362 | 1362 |
int i; // column in query alignment |
1363 | 1363 |
int a; // amino acid (0..19) |
1364 | 1364 |
int n=1; // number of sequences that passed filter |
1365 |
- float** logodds=new(float*[L+1]); // log-odds ratios for HMM qcore |
|
1365 |
+ float** logodds=new float*[L+1]; // log-odds ratios for HMM qcore |
|
1366 | 1366 |
char gap; // 1: previous state in seq k was a gap 0: previous state in seq k was an amino acid |
1367 | 1367 |
float score; // score of sequence k aligned with qcore |
1368 | 1368 |
|
... | ... |
@@ -1371,8 +1371,8 @@ Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore) |
1371 | 1371 |
// Determine first[k], last[k]? |
1372 | 1372 |
if (first==NULL) |
1373 | 1373 |
{ |
1374 |
- first=new(int[N_in]);// first non-gap position in sequence k |
|
1375 |
- last =new(int[N_in]);// last non-gap position in sequence k |
|
1374 |
+ first=new int[N_in];// first non-gap position in sequence k |
|
1375 |
+ last =new int[N_in];// last non-gap position in sequence k |
|
1376 | 1376 |
for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k]) |
1377 | 1377 |
{ |
1378 | 1378 |
for (i=1; i<=L; i++) if (X[k][i]<NAA) break; |
... | ... |
@@ -1385,7 +1385,7 @@ Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore) |
1385 | 1385 |
// Determine number of residues nres[k]? |
1386 | 1386 |
if (nres==NULL) |
1387 | 1387 |
{ |
1388 |
- nres=new(int[N_in]); |
|
1388 |
+ nres=new int[N_in]; |
|
1389 | 1389 |
for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k]) |
1390 | 1390 |
{ |
1391 | 1391 |
int nr=0; |
... | ... |
@@ -1667,18 +1667,18 @@ Alignment::FrequenciesAndTransitions(HMM& q, char* in) |
1667 | 1667 |
q.sname[q.ncons]=new(char[10]); |
1668 | 1668 |
if (!q.sname[q.ncons]) {MemoryError("array of names for displayed sequences");} |
1669 | 1669 |
strcpy(q.sname[q.ncons],"Consensus"); |
1670 |
- q.seq[q.ncons]=new(char[L+2]); |
|
1670 |
+ q.seq[q.ncons]=new char[L+2]; |
|
1671 | 1671 |
if (!q.seq[q.ncons]) {MemoryError("array of names for displayed sequences");} |
1672 | 1672 |
} |
1673 | 1673 |
if (par.cons) |
1674 | 1674 |
{ |
1675 | 1675 |
// Reserve space for consensus sequence as first sequence in alignment |
1676 | 1676 |
q.nfirst=n++; kfirst=-1; |
1677 |
- q.sname[q.nfirst]=new(char[strlen(name)+11]); |
|
1677 |
+ q.sname[q.nfirst]=new char[strlen(name)+11]; |
|
1678 | 1678 |
if (!q.sname[q.nfirst]) {MemoryError("array of names for displayed sequences");} |
1679 | 1679 |
strcpy(q.sname[q.nfirst],name); |
1680 | 1680 |
strcat(q.sname[q.nfirst],"_consensus"); |
1681 |
- q.seq[q.nfirst]=new(char[L+2]); |
|
1681 |
+ q.seq[q.nfirst]=new char[L+2]; |
|
1682 | 1682 |
if (!q.seq[q.nfirst]) {MemoryError("array of names for displayed sequences");} |
1683 | 1683 |
} |
1684 | 1684 |
// Calculate consensus amino acids using similarity matrix |
... | ... |
@@ -1736,10 +1736,10 @@ Alignment::FrequenciesAndTransitions(HMM& q, char* in) |
1736 | 1736 |
else if (k==kfirst) nn=q.nfirst=n++; |
1737 | 1737 |
else nn=n++; |
1738 | 1738 |
// strcut(sname[k]," "); // delete rest of name line beginning with two spaces " " // Why this?? Problem for pdb seqs without chain |
1739 |
- q.sname[nn]=new(char[strlen(sname[k])+1]); |
|
1739 |
+ q.sname[nn]=new char[strlen(sname[k])+1]; |
|
1740 | 1740 |
if (!q.sname[nn]) {MemoryError("array of names for displayed sequences");} |
1741 | 1741 |
strcpy(q.sname[nn],sname[k]); |
1742 |
- q.seq[nn]=new(char[strlen(seq[k])+1]); |
|
1742 |
+ q.seq[nn]=new char[strlen(seq[k])+1]; |
|
1743 | 1743 |
if (!q.seq[nn]) {MemoryError("array of names for displayed sequences");} |
1744 | 1744 |
strcpy(q.seq[nn],seq[k]); |
1745 | 1745 |
} |
... | ... |
@@ -1826,14 +1826,14 @@ Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in) |
1826 | 1826 |
//float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i |
1827 | 1827 |
float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i |
1828 | 1828 |
//float Neff[MAXRES]; // diversity of subalignment i |
1829 |
- float *Neff = new(float[par.maxResLen]); // diversity of subalignment i |
|
1829 |
+ float *Neff = new float[par.maxResLen]; // diversity of subalignment i |
|
1830 | 1830 |
int nseqi=0; // number of sequences in subalignment i |
1831 | 1831 |
int ncol=0; // number of columns j that contribute to Neff[i] |
1832 | 1832 |
char change; // has the set of sequences in subalignment changed? 0:no 1:yes |
1833 | 1833 |
float fj[NAA+3]; // to calculate entropy |
1834 | 1834 |
float sum; |
1835 | 1835 |
|
1836 |
- wi = new(float[N_in+2]); |
|
1836 |
+ wi = new float[N_in+2]; |
|
1837 | 1837 |
|
1838 | 1838 |
// Global weights? |
1839 | 1839 |
if (par.wg==1) |
... | ... |
@@ -1842,7 +1842,7 @@ Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in) |
1842 | 1842 |
// Initialization |
1843 | 1843 |
q.Neff_HMM=0.0f; |
1844 | 1844 |
Neff[0]=0.0; // if the first column has no residues (i.e. change==0), Neff[i]=Neff[i-1]=Neff[0] |
1845 |
- n = new(int*[L+2]); |
|
1845 |
+ n = new int*[L+2]; |
|
1846 | 1846 |
for (j=1; j<=L; j++) n[j]=new(int[NAA+3]); |
1847 | 1847 |
for (j=1; j<=L; j++) |
1848 | 1848 |
for (a=0; a<NAA+3; a++) n[j][a]=0; |
... | ... |
@@ -2052,7 +2052,7 @@ Alignment::Transitions_from_I_state(HMM& q, char* in) |
2052 | 2052 |
//float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i |
2053 | 2053 |
float *wi = NULL; // weight of sequence k in column i, calculated from subalignment i |
2054 | 2054 |
//float Neff[MAXRES]; // diversity of subalignment i |
2055 |
- float *Neff = new(float[par.maxResLen]); // diversity of subalignment i |
|
2055 |
+ float *Neff = new float[par.maxResLen]; // diversity of subalignment i |
|
2056 | 2056 |
int nseqi; // number of sequences in subalignment i |
2057 | 2057 |
int ncol; // number of columns j that contribute to Neff[i] |
2058 | 2058 |
float fj[NAA+3]; // to calculate entropy |
... | ... |
@@ -2060,7 +2060,7 @@ Alignment::Transitions_from_I_state(HMM& q, char* in) |
2060 | 2060 |
float Nlim=0.0; // only for global weights |
2061 | 2061 |
float scale=0.0; // only for global weights |
2062 | 2062 |
|
2063 |
- wi = new(float[N_in+2]); |
|
2063 |
+ wi = new float[N_in+2]; |
|
2064 | 2064 |
|
2065 | 2065 |
// Global weights? |
2066 | 2066 |
if (par.wg==1) |
... | ... |
@@ -2071,7 +2071,7 @@ Alignment::Transitions_from_I_state(HMM& q, char* in) |
2071 | 2071 |
} |
2072 | 2072 |
|
2073 | 2073 |
// Initialization |
2074 |
- n = new(int*[L+2]); |
|
2074 |
+ n = new int*[L+2]; |
|
2075 | 2075 |
for (j=1; j<=L; j++) n[j]=new(int[NAA+3]); |
2076 | 2076 |
|
2077 | 2077 |
////////////////////////////////////////////////////////////////////////////////////////////// |
... | ... |
@@ -2254,7 +2254,7 @@ Alignment::Transitions_from_D_state(HMM& q, char* in) |
2254 | 2254 |
//float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i |
2255 | 2255 |
float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i |
2256 | 2256 |
//float Neff[MAXRES]; // diversity of subalignment i |
2257 |
- float *Neff = new(float[par.maxResLen]); // diversity of subalignment i |
|
2257 |
+ float *Neff = new float[par.maxResLen]; // diversity of subalignment i |
|
2258 | 2258 |
int nseqi=0; // number of sequences in subalignment i (for DEBUGGING) |
2259 | 2259 |
int ncol=0; // number of columns j that contribute to Neff[i] |
2260 | 2260 |
char change; // has the set of sequences in subalignment changed? 0:no 1:yes |
... | ... |
@@ -2263,7 +2263,7 @@ Alignment::Transitions_from_D_state(HMM& q, char* in) |
2263 | 2263 |
float Nlim=0.0; // only for global weights |
2264 | 2264 |
float scale=0.0; // only for global weights |
2265 | 2265 |
|
2266 |
- wi = new(float[N_in+2]); /* FIXME: FS */ |
|
2266 |
+ wi = new float[N_in+2]; /* FIXME: FS */ |
|
2267 | 2267 |
// Global weights? |
2268 | 2268 |
if (par.wg==1) |
2269 | 2269 |
{ |
... | ... |
@@ -2273,7 +2273,7 @@ Alignment::Transitions_from_D_state(HMM& q, char* in) |
2273 | 2273 |
} |
2274 | 2274 |
|
2275 | 2275 |
// Initialization |
2276 |
- n = new(int*[L+2]); |
|
2276 |
+ n = new int*[L+2]; |
|
2277 | 2277 |
for (j=1; j<=L; j++) n[j]=new(int[NAA+3]); |
2278 | 2278 |
for (j=1; j<=L; j++) |
2279 | 2279 |
for (a=0; a<NAA+3; a++) n[j][a]=0; |
... | ... |
@@ -2544,7 +2544,7 @@ Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[]) |
2544 | 2544 |
N_filtered = Tali.Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); |
2545 | 2545 |
|
2546 | 2546 |
// Record imatch[j] |
2547 |
- int* imatch=new(int[hit.j2+1]); |
|
2547 |
+ int* imatch=new int[hit.j2+1]; |
|
2548 | 2548 |
int step = hit.nsteps; |
2549 | 2549 |
for (j=hit.j1; j<=hit.j2; j++) |
2550 | 2550 |
{ |
... | ... |
@@ -2676,7 +2676,7 @@ Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[]) |
2676 | 2676 |
iprev=i; lprev=l; |
2677 | 2677 |
if (h>=maxcol-1000) // too few columns? Reserve double space |
2678 | 2678 |
{ |
2679 |
- char* new_seq=new(char[2*maxcol]); |
|
2679 |
+ char* new_seq=new char[2*maxcol]; |
|
2680 | 2680 |
strncpy(new_seq,cur_seq,maxcol); //////// check: maxcol-1 ???? |
2681 | 2681 |
delete[](cur_seq); (cur_seq) = NULL; |
2682 | 2682 |
cur_seq=new_seq; |
... | ... |
@@ -2689,14 +2689,14 @@ Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[]) |
2689 | 2689 |
cur_seq[h++]='\0'; |
2690 | 2690 |
|
2691 | 2691 |
keep[N_in] = display[N_in] = KEEP_CONDITIONALLY; |
2692 |
- seq[N_in]=new(char[h]); |
|
2692 |
+ seq[N_in]=new char[h]; |
|
2693 | 2693 |
if (!seq[N_in]) MemoryError("array for input sequences"); |
2694 | 2694 |
strcpy(seq[N_in],cur_seq); |
2695 |
- X[N_in]=new(char[h]); |
|
2695 |
+ X[N_in]=new char[h]; |
|
2696 | 2696 |
if (!X[N_in]) MemoryError("array for input sequences"); |
2697 |
- I[N_in]=new(short unsigned int[h]); |
|
2697 |
+ I[N_in]=new short unsigned int[h]; |
|
2698 | 2698 |
if (!I[N_in]) MemoryError("array for input sequences"); |
2699 |
- sname[N_in]=new(char[strlen(Tali.sname[k])+1]); |
|
2699 |
+ sname[N_in]=new char[strlen(Tali.sname[k])+1]; |
|
2700 | 2700 |
if (!sname[N_in]) MemoryError("array for input sequences"); |
2701 | 2701 |
strcpy(sname[N_in],Tali.sname[k]); |
2702 | 2702 |
N_in++; |
... | ... |
@@ -2727,7 +2727,7 @@ Alignment::AddSequence(char Xk[], int Ik[]) |
2727 | 2727 |
{ |
2728 | 2728 |
int i; // position in query and target |
2729 | 2729 |
if (L<=0) InternalError("L is not set in AddSequence()"); |
2730 |
- X[N_in]=new(char[L+2]); |
|
2730 |
+ X[N_in]=new char[L+2]; |
|
2731 | 2731 |
for (i=0; i<=L+1; i++) X[N_in][i]=Xk[i]; |
2732 | 2732 |
if (Ik==NULL) |
2733 | 2733 |
for (i=0; i<=L+1; i++) I[N_in][i]=0; |
... | ... |
@@ -2777,7 +2777,7 @@ Alignment::GetPositionSpecificWeights(float* w[]) |
2777 | 2777 |
{ |
2778 | 2778 |
|
2779 | 2779 |
// Initialization |
2780 |
- n = new(int*[L+2]); |
|
2780 |
+ n = new int*[L+2]; |
|
2781 | 2781 |
for (j=1; j<=L; j++) n[j]=new(int[NAA+3]); |
2782 | 2782 |
for (j=1; j<=L; j++) |
2783 | 2783 |
for (a=0; a<NAA+3; a++) n[j][a]=0; |
... | ... |
@@ -2894,9 +2894,9 @@ Alignment::Transfer(char **ppcProf, int iCnt){ |
2894 | 2894 |
/* @<allocate memory for sequences etc@> */ |
2895 | 2895 |
for (k = 0; k < iCnt; k++){ |
2896 | 2896 |
#define GOOD_MEASURE 1000 /* Temporary -- can be removed once rest in place */ |
2897 |
- I[k] = new(short unsigned int[iLen+2+GOOD_MEASURE]); |
|
2898 |
- X[k] = new(char[iLen+2+GOOD_MEASURE]); |
|
2899 |
- seq[k] = new(char[iLen+2+GOOD_MEASURE]); |
|
2897 |
+ I[k] = new short unsigned int[iLen+2+GOOD_MEASURE]; |
|
2898 |
+ X[k] = new char[iLen+2+GOOD_MEASURE]; |
|
2899 |
+ seq[k] = new char[iLen+2+GOOD_MEASURE]; |
|
2900 | 2900 |
seq[k][0] = ' '; |
2901 | 2901 |
seq[k][1] = '\0'; |
2902 | 2902 |
if (NULL == ppcProf[k]){ |
msa
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,3029 @@ |
1 |
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
|
2 |
+ |
|
3 |
+/********************************************************************* |
|
4 |
+ * Clustal Omega - Multiple sequence alignment |
|
5 |
+ * |
|
6 |
+ * Copyright (C) 2010 University College Dublin |
|
7 |
+ * |
|
8 |
+ * Clustal-Omega is free software; you can redistribute it and/or |
|
9 |
+ * modify it under the terms of the GNU General Public License as |
|
10 |
+ * published by the Free Software Foundation; either version 2 of the |
|
11 |
+ * License, or (at your option) any later version. |
|
12 |
+ * |
|
13 |
+ * This file is part of Clustal-Omega. |
|
14 |
+ * |
|
15 |
+ ********************************************************************/ |
|
16 |
+ |
|
17 |
+/* |
|
18 |
+ * RCS $Id: hhalignment-C.h 236 2011-04-14 11:30:04Z fabian $ |
|
19 |
+ */ |
|
20 |
+ |
|
21 |
+ |
|
22 |
+/* |
|
23 |
+ * Changelog: Michael Remmert made changes to hhalign stand-alone code |
|
24 |
+ * FS implemented some of the changes on 2010-10-28 -> MR1 |
|
25 |
+ * |
|
26 |
+ * Note: MR seems to have changed all [aijk]++ to ++[aijk], |
|
27 |
+ * FS did not do that on 2010-10-28 |
|
28 |
+ */ |
|
29 |
+ |
|
30 |
+// hhalignment.C |
|
31 |
+ |
|
32 |
+////////////////////////////////////////////////////////////////////////////// |
|
33 |
+//// Class Alignment |
|
34 |
+////////////////////////////////////////////////////////////////////////////// |
|
35 |
+ |
|
36 |
+// hhalignment.C |
|
37 |
+ |
|
38 |
+#ifndef MAIN |
|
39 |
+#define MAIN |
|
40 |
+#include <iostream> // cin, cout, cerr |
|
41 |
+#include <fstream> // ofstream, ifstream |
|
42 |
+#include <stdio.h> // printf |
|
43 |
+using std::cout; |
|
44 |
+using std::cerr; |
|
45 |
+using std::endl; |
|
46 |
+using std::ios; |
|
47 |
+using std::ifstream; |
|
48 |
+using std::ofstream; |
|
49 |
+#include <stdlib.h> // exit |
|
50 |
+#include <string> // strcmp, strstr |
|
51 |
+#include <math.h> // sqrt, pow |
|
52 |
+#include <limits.h> // INT_MIN |
|
53 |
+#include <float.h> // FLT_MIN |
|
54 |
+#include <time.h> // clock |
|
55 |
+#include <ctype.h> // islower, isdigit etc |
|
56 |
+#include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. |
|
57 |
+#include "list.h" // list data structure |
|
58 |
+#include "hash.h" // hash data structure |
|
59 |
+#include "hhdecl-C.h" |
|
60 |
+#include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. |
|
61 |
+#include "hhhmm.h" |
|
62 |
+#endif |
|
63 |
+ |
|
64 |
+ |
|
65 |
+enum {KEEP_NOT = 0, KEEP_CONDITIONALLY, KEEP_ALWAYS}; |
|
66 |
+ |
|
67 |
+////////////////////////////////////////////////////////////////////////////// |
|
68 |
+// Class Alignment |
|
69 |
+////////////////////////////////////////////////////////////////////////////// |
|
70 |
+ |
|
71 |
+ |
|
72 |
+////////////////////////////////////////////////////////////////////////////// |
|
73 |
+// Object constructor |
|
74 |
+////////////////////////////////////////////////////////////////////////////// |
|
75 |
+Alignment::Alignment(int maxseq, int maxres) |
|
76 |
+{ |
|
77 |
+ |
|
78 |
+ //printf(">>>>>>>>%s:%s:%d: maxseq=%d, maxres=%d\n", __FUNCTION__, __FILE__, __LINE__, maxseq, maxres); /* (FS) */ |
|
79 |
+ longname = new(char[DESCLEN]); |
|
80 |
+ sname = new(char*[maxseq+2]); /* MR1 */ |
|
81 |
+ seq = new(char*[maxseq+2]); /* MR1 */ |
|
82 |
+ l = new(int[maxres]); |
|
83 |
+ X = new(char*[maxseq+2]); /* MR1 */ |
|
84 |
+ I = new(short unsigned int*[maxseq+2]); /* MR1 */ |
|
85 |
+ keep = new(char[maxseq+2]); /* MR1 */ |
|
86 |
+ display = new(char[maxseq+2]); /* MR1 */ |
|
87 |
+ wg = new(float[maxseq+2]); /* MR1 */ |
|
88 |
+ nseqs = new(int[maxres+2]); /* MR1 */ |
|
89 |
+ N_in=L=0; |
|
90 |
+ nres=NULL; // number of residues per sequence k |
|
91 |
+ first=NULL; // first residue in sequence k |
|
92 |
+ last=NULL; // last residue in sequence k |
|
93 |
+ ksort=NULL; // sequence indices sorted by descending nres[k] |
|
94 |
+ name[0]='\0'; // no name defined yet |
|
95 |
+ longname[0]='\0'; // no name defined yet |
|
96 |
+ fam[0]='\0'; // no name defined yet |
|
97 |
+ file[0]='\0'; // no name defined yet |
|
98 |
+ readCommentLine = '0'; /* MR1 */ |
|
99 |
+} |
|
100 |
+ |
|
101 |
+////////////////////////////////////////////////////////////////////////////// |
|
102 |
+// Object destructor |
|
103 |
+////////////////////////////////////////////////////////////////////////////// |
|
104 |
+Alignment::~Alignment() |
|
105 |
+{ |
|
106 |
+ delete[] longname; longname = NULL; |
|
107 |
+ for(int k=0; k<N_in; k++) |
|
108 |
+ { |
|
109 |
+ delete[] sname[k]; sname[k] = NULL; |
|
110 |
+ delete[] seq[k]; seq[k] = NULL; |
|
111 |
+ delete[] X[k]; X[k] = NULL; |
|
112 |
+ delete[] I[k]; I[k] = NULL; |
|
113 |
+ } |
|
114 |
+ delete[] sname; sname = NULL; |
|
115 |
+ delete[] seq; seq = NULL; |
|
116 |
+ delete[] X; X = NULL; |
|
117 |
+ delete[] I; I = NULL; |
|
118 |
+ delete[] l; l = NULL; |
|
119 |
+ delete[] keep; keep = NULL; |
|
120 |
+ delete[] display; display = NULL; |
|
121 |
+ delete[] wg; wg = NULL; |
|
122 |
+ delete[] nseqs; nseqs = NULL; |
|
123 |
+ delete[] nres; nres = NULL; |
|
124 |
+ delete[] first; first = NULL; |
|
125 |
+ delete[] last; last = NULL; |
|
126 |
+ delete[] ksort; ksort = NULL; |
|
127 |
+} |
|
128 |
+ |
|
129 |
+ |
|
130 |
+/** |
|
131 |
+ * @brief Reads in an alignment from file into matrix seq[k][l] as ASCII |
|
132 |
+ */ |
|
133 |
+void |
|
134 |
+Alignment::Read(FILE* inf, char infile[], char* firstline) |
|
135 |
+{ |
|
136 |
+ int l; // Postion in alignment incl. gaps (first=1) |
|
137 |
+ int h; // Position in input line (first=0) |
|
138 |
+ int k; // Index of sequence being read currently (first=0) |
|
139 |
+ char line[LINELEN]=""; // input line |
|
140 |
+ //char cur_seq[MAXCOL]; // Sequence currently read in |
|
141 |
+ char *cur_seq=new(char[par.maxColCnt]); |
|
142 |
+ char* cur_name; // Sequence currently read in |
|
143 |
+ int linenr=0; // current line number in input file |
|
144 |
+ char skip_sequence=0; |
|
145 |
+ RemoveExtension(file,infile); //copy rootname (w/o path) of infile into file variable of class object |
|
146 |
+ |
|
147 |
+ kss_dssp=ksa_dssp=kss_pred=kss_conf=kfirst=-1; |
|
148 |
+ n_display=0; |
|
149 |
+ N_in=0; |
|
150 |
+ N_filtered=0; |
|
151 |
+ N_ss=0; |
|
152 |
+ cur_seq[0]=' '; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq) |
|
153 |
+ l=1; k=-1; |
|
154 |
+ |
|
155 |
+ // Does firstline already contain first line of file? |
|
156 |
+ if (firstline!= NULL) strcpy(line,firstline); |
|
157 |
+ |
|
158 |
+ ///////////////////////////////////////////////////////////////////////// |
|
159 |
+ // Read infile line by line |
|
160 |
+ /* FIXME: not safe to use MAXSEQ, however, don't think we ever get here (FS) */ |
|
161 |
+ while(firstline || (fgetline(line,LINELEN,inf) && (k<MAXSEQ))) /* FIXME: FS introduced () around &&, precedence! MR1 */ |
|
162 |
+ { |
|
163 |
+ linenr++; |
|
164 |
+ firstline=NULL; |
|
165 |
+ if (line[0]=='>') //line contains sequence name |
|
166 |
+ { |
|
167 |
+ if (k>=MAXSEQ-1) |
|
168 |
+ { |
|
169 |
+ if (v>=1 && k>=MAXSEQ) |
|
170 |
+ cerr<<endl<<"WARNING: maximum number "<<MAXSEQ<<" of sequences exceded in file "<<infile<<"\n"; |
|
171 |
+ break; |
|
172 |
+ } |
|
173 |
+ cur_name=line+1; //beginning of current sequence name |
|
174 |
+ if (k>=0) //if this is at least the second name line |
|
175 |
+ { |
|
176 |
+ if (strlen(cur_seq)==0) |
|
177 |
+ { |
|
178 |
+ cerr<<endl<<"Error: sequence "<<sname[k]<<" contains no residues."<<endl; |
|
179 |
+ throw 1; |
|
180 |
+ } |
|
181 |
+ |
|
182 |
+ // Create space for residues and paste new sequence in |
|
183 |
+ seq[k]=new(char[strlen(cur_seq)+2]); |
|
184 |
+ if (!seq[k]) MemoryError("array for input sequences"); |
|
185 |
+ X[k]=new(char[strlen(cur_seq)+2]); |
|
186 |
+ if (!X[k]) MemoryError("array for input sequences"); |
|
187 |
+ I[k]=new(short unsigned int[strlen(cur_seq)+2]); |
|
188 |
+ if (!I[k]) MemoryError("array for input sequences"); |
|
189 |
+ strcpy(seq[k],cur_seq); |
|
190 |
+ } |
|
191 |
+ skip_sequence=0; |
|
192 |
+ |
|
193 |
+ k++; |
|
194 |
+ l=1; //position in current sequence (first=1) |
|
195 |
+ |
|
196 |
+ // display[k]= 0: do not show in Q-T alignments 1: show if not filtered out later 2: show in any case (do not filter out) |
|
197 |
+ // keep[k] = 0: do not include in profile 1: include if not filtered out later 2: include in any case (do not filter out) |
|
198 |
+ /* {KEEP_NOT=0, KEEP_CONDITIONALLY=1, KEEP_ALWAYS=2} */ |
|
199 |
+ if (line[1]=='@') cur_name++; //skip @-character in name |
|
200 |
+ if (!strncmp(line,">ss_dssp",8)) { |
|
201 |
+ if (kss_dssp<0) {display[k]=2; n_display++; keep[k]=KEEP_NOT; kss_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;} |
|
202 |
+ } |
|
203 |
+ else if (!strncmp(line,">sa_dssp",8)) { |
|
204 |
+ if (ksa_dssp<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; ksa_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;} |
|
205 |
+ } |
|
206 |
+ else if (!strncmp(line,">ss_pred",8)) { |
|
207 |
+ if (kss_pred<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_pred=k; N_ss++;} else {skip_sequence=1; k--; continue;} |
|
208 |
+ } |
|
209 |
+ else if (!strncmp(line,">ss_conf",8)) { |
|
210 |
+ if (kss_conf<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_conf=k; N_ss++;} else {skip_sequence=1; k--; continue;} |
|
211 |
+ } |
|
212 |
+ else if (!strncmp(line,">ss_",4) || !strncmp(line,">sa_",4)) { |
|
213 |
+ display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; N_ss++; |
|
214 |
+ } |
|
215 |
+ else if (!strncmp(line,">aa_",4)) { // ignore sequences beginning with ">aa_" |
|
216 |
+ skip_sequence=1; k--; continue; |
|
217 |
+ } |
|
218 |
+ //store first real seq |
|
219 |
+ else if (kfirst<0) |
|
220 |
+ { |
|
221 |
+ char word[NAMELEN]; |
|
222 |
+ strwrd(word,line); // Copies first word in ptr to str |
|
223 |
+ if (strstr(word,"_consensus")) |
|
224 |
+ {display[k]=2; keep[k]=0; n_display++; kfirst=k;} /* MR1 */ |
|
225 |
+ else |
|
226 |
+ {display[k]=keep[k]=KEEP_ALWAYS; n_display++; kfirst=k;} |
|
227 |
+ } |
|
228 |
+ //store all sequences |
|
229 |
+ else if (par.mark==0) {display[k]=keep[k]=KEEP_CONDITIONALLY; n_display++;} |
|
230 |
+ //store sequences up to nseqdis |
|
231 |
+ else if (line[1]=='@'&& n_display-N_ss<par.nseqdis) {display[k]=keep[k]=KEEP_ALWAYS; n_display++;} |
|
232 |
+ else {display[k]=KEEP_NOT; keep[k]=KEEP_CONDITIONALLY;} |
|
233 |
+ |
|
234 |
+ // store sequence name |
|
235 |
+ if (v>=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]); |
|
236 |
+ sname[k] = new(char[strlen(cur_name)+1]); |
|
237 |
+ if (!sname[k]) {MemoryError("array for sequence names");} |
|
238 |
+ strcpy(sname[k],cur_name); |
|
239 |
+ } // end if(line contains sequence name) |
|
240 |
+ |
|
241 |
+ else if (line[0]=='#') // Commentary line? |
|
242 |
+ { |
|
243 |
+ // #PF01367.9 5_3_exonuc: 5'-3' exonuclease, C-terminal SAM fold; PDB 1taq, 1bgx (T:271-174), 1taq (271-174) |
|
244 |
+ if (name[0]) continue; // if already name defined: skip commentary line |
|
245 |
+ char *ptr1, *ptr2; |
|
246 |
+ ptr1=strscn_(line+1); // set ptr1 to first non-whitespace character after '#' -> AC number |
|
247 |
+ strncpy(longname,ptr1,DESCLEN-1); // copy whole commentary line after '# ' into longname |
|
248 |
+ longname[DESCLEN-1]='\0'; |
|
249 |
+ strtr(longname,""," "); |
|
250 |
+ ptr2=strcut_(ptr1); // cut after AC number and set ptr2 to first non-whitespace character after AC number |
|
251 |
+ // strcpy(fam,ptr1); // copy AC number to fam |
|
252 |
+ // if (!strncmp(fam,"PF",2)) strcut_(fam,'.'); // if PFAM identifier contains '.' cut it off |
|
253 |
+ // strcut_(ptr2); // cut after first word ... |
|
254 |
+ strcpy(name,ptr1); // ... and copy first word into name |
|
255 |
+ readCommentLine = '1'; /* MR1 */ |
|
256 |
+ } |
|
257 |
+ |
|
258 |
+ //line contains sequence residues or SS information and does not belong to a >aa_ sequence |
|
259 |
+ else if (!skip_sequence) |
|
260 |
+ { |
|
261 |
+ if (v>=4) cout<<line<<"\n"; //DEBUG |
|
262 |
+ if (k==-1 && v) |
|
263 |
+ { |
|
264 |
+ cerr<<endl<<"WARNING: No sequence name preceding following line in "<<infile<<":\n\'"<<line<<"\'\n"; |
|
265 |
+ continue; |
|
266 |
+ } |
|
267 |
+ |
|
268 |
+ h=0; //counts characters in current line |
|
269 |
+ |
|
270 |
+ // Check whether all characters are correct; store into cur_seq |
|
271 |
+ if (keep[k] || (k == kfirst) ) // normal line containing residues /* MR1 */ |
|
272 |
+ { |
|
273 |
+ while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) |
|
274 |
+ { |
|
275 |
+ if (aa2i(line[h])>=0) // ignore white-space characters ' ', \t and \n (aa2i()==-1) |
|
276 |
+ {cur_seq[l]=line[h]; l++;} |
|
277 |
+ else if (aa2i(line[h])==-2 && v) |
|
278 |
+ cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n"; |
|
279 |
+ h++; |
|
280 |
+ } |
|
281 |
+ } |
|
282 |
+ else if (k==kss_dssp) // lines with dssp secondary structure states (. - H E C S T G B) |
|
283 |
+ { |
|
284 |
+ while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) |
|
285 |
+ { |
|
286 |
+ if (ss2i(line[h])>=0 && ss2i(line[h])<=7) |
|
287 |
+ {cur_seq[l]=ss2ss(line[h]); l++;} |
|
288 |
+ else if (v) |
|
289 |
+ cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n"; |
|
290 |
+ h++; |
|
291 |
+ } |
|
292 |
+ } |
|
293 |
+ else if (k==ksa_dssp) // lines with dssp solvent accessibility states (. - ???) |
|
294 |
+ { |
|
295 |
+ while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) |
|
296 |
+ { |
|
297 |
+ if (sa2i(line[h])>=0) |
|
298 |
+ cur_seq[l++]=line[h]; |
|
299 |
+ else if (v) |
|
300 |
+ cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n"; |
|
301 |
+ h++; |
|
302 |
+ } |
|
303 |
+ } |
|
304 |
+ else if (k==kss_pred) // lines with predicted secondary structure (. - H E C) |
|
305 |
+ { |
|
306 |
+ while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) |
|
307 |
+ { |
|
308 |
+ if (ss2i(line[h])>=0 && ss2i(line[h])<=3) |
|
309 |
+ {cur_seq[l]=ss2ss(line[h]); l++;} |
|
310 |
+ else if (v) |
|
311 |
+ cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n"; |
|
312 |
+ h++; |
|
313 |
+ } |
|
314 |
+ } |
|
315 |
+ else if (k==kss_conf) // lines with confidence values should contain only 0-9, '-', or '.' |
|
316 |
+ { |
|
317 |
+ while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) |
|
318 |
+ { |
|
319 |
+ if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9')) |
|
320 |
+ {cur_seq[l]=line[h]; l++;} |
|
321 |
+ else if (v) |
|
322 |
+ cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n"; |
|
323 |
+ h++; |
|
324 |
+ } |
|
325 |
+ } |
|
326 |
+ else if (display[k]) // other lines such as >sa_pred etc |
|
327 |
+ { |
|
328 |
+ while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1) |
|
329 |
+ { |
|
330 |
+ if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9') || (line[h]>='A' && line[h]<='B')) |
|
331 |
+ {cur_seq[l]=line[h]; l++;} |
|
332 |
+ else if (v) |
|
333 |
+ cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n"; |
|
334 |
+ h++; |
|
335 |
+ } |
|
336 |
+ } |
|
337 |
+ if (v && l>=/*MAXCOL*/par.maxColCnt-1) |
|
338 |
+ { |
|
339 |
+ cerr<<endl<<"WARNING: maximum number of residues "<</*MAXCOL*/par.maxColCnt-2<<" exceded in sequence "<<sname[k]<<"\n"; |
|
340 |
+ skip_sequence=1; |
|
341 |
+ } |
|
342 |
+ cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character |
|
343 |
+ } //end else |
|
344 |
+ |
|
345 |
+ } |
|
346 |
+ ///////////////////////////////////////////////////////////////////////// |
|
347 |
+ |
|
348 |
+ |
|
349 |
+ if (k>=0) //if at least one sequence was read in |
|
350 |
+ { |
|
351 |
+ seq[k]=new(char[strlen(cur_seq)+2]); |
|
352 |
+ if (!seq[k]) MemoryError("array for input sequences"); |
|
353 |
+ X[k]=new(char[strlen(cur_seq)+2]); |
|
354 |
+ if (!X[k]) MemoryError("array for input sequences"); |
|
355 |
+ I[k]=new(short unsigned int[strlen(cur_seq)+2]); |
|
356 |
+ if (!I[k]) MemoryError("array for input sequences"); |
|
357 |
+ strcpy(seq[k],cur_seq); |
|
358 |
+ } |
|
359 |
+ else |
|
360 |
+ {cerr<<endl<<"Error: no sequences found in file "<<infile<<"\n"; throw 1;} |
|
361 |
+ |
|
362 |
+ N_in = k+1; |
|
363 |
+ |
|
364 |
+ // Set name, longname, fam |
|
365 |
+ if (!*name) // longname, name and family were not set by '#...' line yet -> extract from first sequence |
|
366 |
+ { |
|
367 |
+ char* ptr; |
|
368 |
+ // strtr(sname[kfirst],"~"," "); // 'transpose': replaces the tilde with a blanc everywhere in sname[kfirst] |
|
369 |
+ strncpy(longname,sname[kfirst],DESCLEN-1); // longname is name of first sequence |
|
370 |
+ longname[DESCLEN-1]='\0'; |
|
371 |
+ strncpy(name,sname[kfirst],NAMELEN-1); // Shortname is first word of longname... |
|
372 |
+ name[NAMELEN-1]='\0'; |
|
373 |
+ ptr = strcut(name); // ...until first white-space character |
|
374 |
+ if (ptr && islower(ptr[0]) && ptr[1]=='.' && isdigit(ptr[2])) //Scop family code present as second word? |
|
375 |
+ { |
|
376 |
+ lwrstr(name); // Transform upper case to lower case |
|
377 |
+ strcut(ptr); // Non-white-space characters until next white-space character.. |
|
378 |
+ strcpy(fam,ptr); // ...are the SCOP familiy code |
|
379 |
+ } |
|
380 |
+ else if (name[0]=='P' && name[1]=='F' && isdigit(name[2]) && isdigit(name[3]) ) //Pfam code |
|
381 |
+ { |
|
382 |
+ strcpy(fam,name); // set family name = Pfam code |
|
383 |
+ } |
|
384 |
+ } |
|
385 |
+ |
|
386 |
+ |
|
387 |
+ |
|
388 |
+ delete[] cur_seq; cur_seq = NULL; |
|
389 |
+ |
|
390 |
+ // Checking for warning messages |
|
391 |
+ if (v==0) return; |
|
392 |
+ if (v>=2) cout<<"Read "<<infile<<" with "<<N_in<<" sequences\n"; |
|
393 |
+ if (v>=3) cout<<"Query sequence for alignment has number "<<kfirst<<" (0 is first)\n"; |
|
394 |
+ return; |
|
395 |
+} |
|
396 |
+ |
|
397 |
+/* |
|
398 |
+ * At this point GetSeqsFromHMM() slots in, however, |
|
399 |
+ * only needed in hhbliys.C, so will skip it for moment, MR1 |
|
400 |
+ */ |
|
401 |
+ |
|
402 |
+ |
|
403 |
+///////////////////////////////////////////////////////////////////////////// |
|
404 |
+/** |
|
405 |
+ * @brief Convert ASCII in seq[k][l] to int (0-20) in X[k][i], |
|
406 |
+ * throw out all insert states, record their number in I[k][i] |
|
407 |
+ * and store sequences to be displayed in seq[k] */ |
|
408 |
+///////////////////////////////////////////////////////////////////////////// |
|
409 |
+void |
|
410 |
+Alignment::Compress(const char infile[]) |
|
411 |
+{ |
|
412 |
+ int i; // Index for match state (first=1) |
|
413 |
+ int l; // Postion in alignment incl. gaps (first=1) |
|
414 |
+ int k; // Index for sequences (first=0) |
|
415 |
+ int a; // amino acid index |
|
416 |
+ char c; |
|
417 |
+ int unequal_lengths=0; /* k: seq k doesn't have same number |
|
418 |
+ of match states as seq 0 => WARNING */ |
|
419 |
+ /* points to next character in seq[k] to be written */ |
|
420 |
+ /*static short unsigned int h[MAXSEQ];*/ |
|
421 |
+ /*short*/ unsigned int *h = NULL; /* short may lead to overflow for long alignments, FS, r235 -> r236 */ |
|
422 |
+ |
|
423 |
+ h = new(/*short*/ unsigned int[N_in+2]); /* short -> overflow, FS, r235 -> r236 */ |
|
424 |
+ float *percent_gaps = NULL; /* FS, 2010-Nov */ |
|
425 |
+ char *match_state = NULL; /* FS, 2010-Nov */ |
|
426 |
+ |
|
427 |
+ |
|
428 |
+ // Initialize |
|
429 |
+ for (k=0;k<N_in; k++) |
|
430 |
+ {I[k][0]=0;} |
|
431 |
+ |
|
432 |
+ if (v>=3) |
|
433 |
+ { |
|
434 |
+ if (par.M==1) |
|
435 |
+ cout<<"Using match state assignment by capital letters (a2m format)\n"; |
|
436 |
+ else if (par.M==2) cout<<"Using percentage-rule match state assignment\n"; |
|
437 |
+ else if (par.M==3) cout<<"Using residues of first sequence as match states\n"; |
|
438 |
+ } |
|
439 |
+ |
|
440 |
+ // Create matrices X and I with amino acids represented by integer numbers |
|
441 |
+ switch(par.M) |
|
442 |
+ { |
|
443 |
+ |
|
444 |
+ ///////////////////////////////////////////////////////////////////////// |
|
445 |
+ /* a2m/a3m format: match states capital case, |
|
446 |
+ inserts lower case, delete states '-', inserted gaps '.' |
|
447 |
+ The confidence values for ss prediction are interpreted as follows: |
|
448 |
+ 0-9:match states(!) '-' :match state '.':insert */ |
|
449 |
+ case 1: |
|
450 |
+ default: |
|
451 |
+ |
|
452 |
+ // Warn if alignment is ment to be -M first or -M NN instead of A2M/A3M |
|
453 |
+ if (v>=2 && strchr(seq[kfirst],'-') ) // Seed/query sequence contains a gap ... |
|
454 |
+ { |
|
455 |
+ for (k=1; k<N_in; k++) |
|
456 |
+ if (strpbrk(seq[k],"abcdefghiklmnpqrstuvwxyz.")) break; |
|
457 |
+ if (k==N_in) // ... but alignment contains no lower case residue |
|
458 |
+ printf("WARNING: input alignment %s looks like aligned FASTA instead of A2M/A3M format. Consider using '-M first' or '-M 50'\n",infile); |
|
459 |
+ } |
|
460 |
+ |
|
461 |
+ // Remove '.' characters from seq[k] |
|
462 |
+ for(k=0; k<N_in; k++) |
|
463 |
+ { |
|
464 |
+ char* ptrS=seq[k]; // pointer to source: character in seq[k] |
|
465 |
+ char* ptrD=seq[k]; // pointer to destination: seq[k] |
|
466 |
+ while(1) // omit '.' symbols |
|
467 |
+ { |
|
468 |
+ if (*ptrS!='.') {*ptrD=*ptrS; ptrD++;} //leave out '.' symbols |
|
469 |
+ if (!*ptrS) break; |
|
470 |
+ ptrS++; |
|
471 |
+ } |
|
472 |
+ } |
|
473 |
+ L=/*MAXRES*/par.maxResLen-2; // needed because L=imin(L,i) |
|
474 |
+ for (k=0; k<N_in; k++) |
|
475 |
+ { |
|
476 |
+ i=1; l=1; // start at i=1, not i=0! |
|
477 |
+ if (keep[k]) //skip >ss_dssp, >ss_pred, >ss_conf, >aa_... sequences |
|
478 |
+ { |
|
479 |
+ while((c=seq[k][l++])) // assign residue to c at same time |
|
480 |
+ { |
|
481 |
+ if (c>='a' && c<='z') I[k][i-1]++;//insert state = lower case character |
|
482 |
+ else if (c!='.') //match state = upper case character |
|
483 |
+ { |
|
484 |
+ X[k][i]=aa2i(c); |
|
485 |
+ I[k][i]=0; |
|
486 |
+ i++; |
|
487 |
+ } |
|
488 |
+ } |
|
489 |
+ } |
|
490 |
+ else if (k==kss_dssp || k==kss_pred) // does alignment contain sequence of secondary structure states? |
|
491 |
+ { |
|
492 |
+ while((c=seq[k][l++])) // assign residue to c at same time |
|
493 |
+ if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=ss2i(c); //match state = upper case character |
|
494 |
+ } |
|
495 |
+ else if (k==ksa_dssp) // does alignment contain sequence of prediction confidence values? |
|
496 |
+ { |
|
497 |
+ while((c=seq[k][l++])) // assign residue to c at same time |
|
498 |
+ if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=sa2i(c); //match state = upper case character |
|
499 |
+ } |
|
500 |
+ else if (k==kss_conf) // does alignment contain sequence of prediction confidence values? |
|
501 |
+ { |
|
502 |
+ while((c=seq[k][l++])) // assign residue to c at same time |
|
503 |
+ if (c!='.') X[k][i++]=cf2i(c); //match state = 0-9 or '-' |
|
504 |
+ } |
|
505 |
+ else if (k==kfirst) // does alignment contain sequence of prediction confidence values? |
|
506 |
+ { |
|
507 |
+ while((c=seq[k][l++])) // assign residue to c at same time |
|
508 |
+ if (c!='.') |
|
509 |
+ { |
|
510 |
+ X[k][i]=aa2i(c); |
|
511 |
+ I[k][i]=0; |
|
512 |
+ ++i; |
|
513 |
+ } |
|
514 |
+ } |
|
515 |
+ else continue; |
|
516 |
+ i--; |
|
517 |
+ if (L!=i && L!=/*MAXRES*/par.maxResLen-2 && !unequal_lengths) unequal_lengths=k; //sequences have different lengths |
|
518 |
+ L=imin(L,i); |
|
519 |
+ } |
|
520 |
+ if (unequal_lengths) break; |
|
521 |
+ |
|
522 |
+ //Replace GAP with ENDGAP for all end gaps /* MR1 */ |
|
523 |
+ for (k=0; k<N_in; ++k) |
|
524 |
+ { |
|
525 |
+ if (!keep[k]) continue; |
|
526 |
+ for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */ |
|
527 |
+ for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */ |
|
528 |
+ } |
|
529 |
+ |
|
530 |
+ for (i=1; i<=L; i++) this->l[i]=i; //assign column indices to match states |
|
531 |
+ if (L<=0) |
|
532 |
+ { |
|
533 |
+ cout<<"\nError: Alignment in "<<infile<<" contains no match states. Consider using -M first or -M <int> option"<<endl; |
|
534 |
+ throw 1; |
|
535 |
+ } |
|
536 |
+ |
|
537 |
+ if (L==/*MAXRES*/par.maxResLen-2 && v>=2) |
|
538 |
+ { |
|
539 |
+ printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",L); |
|
540 |
+ break; |
|
541 |
+ } |
|
542 |
+ if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" match states\n"; |
|
543 |
+ break; |
|
544 |
+ |
|
545 |
+ ///////////////////////////////////////////////////////////////////////// |
|
546 |
+ // gap-rule assignment of match states |
|
547 |
+ case 2: |
|
548 |
+ int nl[NAA+2]; //nl[a] = number of seq's with amino acid a at position l |
|
549 |
+ /* Note: allocating statically is fine most of the time |
|
550 |
+ but when the sequences/profiles get really long |
|
551 |
+ we might run out of memory, so must really do it dynamically. |
|
552 |
+ had to move declaration of float *percent_gaps out of switch() |
|
553 |
+ */ |
|
554 |
+ //float percent_gaps[MAXCOL]; //percentage of gaps in column k (with weighted sequences) |
|
555 |
+ percent_gaps = new(float[par.maxColCnt]); |
|
556 |
+ |
|
557 |
+ //determine number of columns L in alignment |
|
558 |
+ L=strlen(seq[kfirst])-1; |
|
559 |
+ |
|
560 |
+ // Conversion to integer representation, checking for unequal lengths and initialization |
|
561 |
+ if (nres==NULL) nres=new(int[N_in]); |
|
562 |
+ for (k=0; k<N_in; k++) |
|
563 |
+ { |
|
564 |
+ if (!keep[k]) continue; |
|
565 |
+ int nr=0; |
|
566 |
+ wg[k]=0; nres[k]=0; |
|
567 |
+ for (l=1; l<=L; l++) |
|
568 |
+ { |
|
569 |
+ X[k][l]=aa2i(seq[k][l]); |
|
570 |
+ if (X[k][l]<NAA) nr++; |
|
571 |
+ } |
|
572 |
+ nres[k]=nr; |
|
573 |
+ if (seq[k][L+1]!='\0' && !unequal_lengths) unequal_lengths=k; |
|
574 |
+ } |
|
575 |
+ if (unequal_lengths) break; |
|
576 |
+ |
|
577 |
+ // Quick and dirty calculation of the weight per sequence wg[k] |
|
578 |
+ for (l=1; l<=L; l++) // for all positions l in alignment |
|
579 |
+ { |
|
580 |
+ int naa=0; //number of different amino acids |
|
581 |
+ for (a=0; a<20; a++) nl[a]=0; |
|
582 |
+ for (k=0; k<N_in; k++) if (keep[k]) nl[ (int)X[k][l]]++; |
|
583 |
+ for (a=0; a<20; a++) if(nl[a]) naa++; |
|
584 |
+ if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY) |
|
585 |
+ for (k=0; k<N_in; k++) |
|
586 |
+ if (keep[k] && (X[k][l]<20) ) |
|
587 |
+ { |
|
588 |
+ //wg[k]+=1.0/float(nl[ (int)X[k][l]]*naa*nres[k]+30.0); /* original version */ |
|
589 |
+ wg[k] += 1.0/float(nl[ (int)X[k][l]]*naa*(nres[k]+30.0)); /* MR1 */ |
|
590 |
+ // wg[k] += 1.0/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */ |
|
591 |
+ // wg[k] += (naa-1.0)/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */ |
|
592 |
+ } |
|
593 |
+ } /* 1=l<=L*/ |
|
594 |
+ |
|
595 |
+ //Replace GAP with ENDGAP for all end gaps |
|
596 |
+ for (k=0; k<N_in; ++k) |
|
597 |
+ { |
|
598 |
+ if (!keep[k]) continue; |
|
599 |
+ for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */ |
|
600 |
+ for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */ |
|
601 |
+ } |
|
602 |
+ |
|
603 |
+ // Add up percentage of gaps |
|
604 |
+ for (l=1; l<=L; l++) |
|
605 |
+ { |
|
606 |
+ float res=0; |
|
607 |
+ float gap=0; |
|
608 |
+ for (k=0; k< N_in; k++){ |
|
609 |
+ if (keep[k]){ |
|
610 |
+ if ( X[k][l]<GAP) res+=wg[k]; /* MR1, AA or ANY, changed from <ANY */ |
|
611 |
+ else if ( X[k][l] != ENDGAP) gap+=wg[k]; /* MR1, else: GAP. ENDGAPs are ignored for counting percentage */ |
|
612 |
+ } |
|
613 |
+ } |
|
614 |
+ percent_gaps[l]=100.*gap/(res+gap); |
|
615 |
+ if (v>=4) cout<<"percent gaps["<<l<<"]="<<percent_gaps[l]<<" first seq:"<<seq[0][l]<<"\n"; |
|
616 |
+ } |
|
617 |
+ |
|
618 |
+ /* Insert states 'bloat' the HMM, |
|
619 |
+ throwing them out 'slims' down the HMM. |
|
620 |
+ A slimmer HMM takes less time to construct. |
|
621 |
+ However, the marriage of Clustal and Hhalign |
|
622 |
+ is particularly sensitive to residues |
|
623 |
+ at the very end of the profile; these I call |
|
624 |
+ 'telomeres'. Telomeres must not be shed when |
|
625 |
+ throwing out insert states, for the telomeres |
|
626 |
+ we set the match threshold to 100%. |
|
627 |
+ */ |
|
628 |
+#define MGAP_LOGIC 0 |
|
629 |
+#define TELOMERE_LOGIC 1 |
|
630 |
+#define TELOMERE_DYNAMIC 0 |
|
631 |
+ |
|
632 |
+#define ALWAYS_ACCEPT 101.0 /* do NOT change this parameter, must be >=100, |
|
633 |
+ make slightly bigger than 100% -- to be sure to be sure */ |
|
634 |
+#define DEFAULT_MGAPS 100.0 /* Soeding's default is 50, omega default prior to telomere logic was 100 |
|
635 |
+ FIXME: this used to be par.Mgaps, |
|
636 |
+ in a later version re-introduce par.Mgaps to keep this value flexible */ |
|
637 |
+#define TELOMER_LENGTH 10 /* this parameter must be > 0 (unless DEFAULT_MGAPS=100), |
|
638 |
+ if it is too big (L/2) then telomere logic has no effect, |
|
639 |
+ don't think it should be changed (much) */ |
|
640 |
+#define TELOMER_FRACTION 0.10 |
|
641 |
+ //#define HMM_MIN_LENGTH 0.923 |
|
642 |
+#define HMM_MIN_LENGTH 0.950 |
|
643 |
+#define FORTRAN_OFFSET 1 |
|
644 |
+ double dDefaultMgaps; |
|
645 |
+ dDefaultMgaps = DEFAULT_MGAPS; |
|
646 |
+ |
|
647 |
+#if TELOMERE_LOGIC /* turn telomere logic on (1) or off (0) */ |
|
648 |
+ int iTelomereLength; |
|
649 |
+ |
|
650 |
+#if TELOMERE_DYNAMIC /* keep telomere length 'dynamic' */ |
|
651 |
+ iTelomereLength = TELOMER_LENGTH > (int)(L*TELOMER_FRACTION) ? TELOMER_LENGTH : (int)(L*TELOMER_FRACTION); |
|
652 |
+#else |
|
653 |
+ iTelomereLength = TELOMER_LENGTH; |
|
654 |
+#endif /* this was dynamic telomere */ |
|
655 |
+#endif /* this was telomere logic */ |
|
656 |
+ |
|
657 |
+ /* if HMMs get too small (much smaller than profile length L) |
|
658 |
+ then one is liable to get a back-tracking error. |
|
659 |
+ So we should ensure that the DEFAULT_MGAPS parameter does NOT |
|
660 |
+ shrink the HMM too much. |
|
661 |
+ take percentage-gap vector, sort it, and fix dDefaultMgaps, |
|
662 |
+ such that at least (HMM_MIN_LENGTH)*(L) are left |
|
663 |
+ */ |
|
664 |
+#if MGAP_LOGIC /* try to adapt Mgaps to size of final HMM */ |
|
665 |
+ { |
|
666 |
+ float *pfPercentGaps = NULL; |
|
667 |
+ if (NULL == (pfPercentGaps = (float *)malloc((L+1)*sizeof(float)))){ |
|
668 |
+ printf("%s:%s:%d: could not malloc %d float for sorted percent-gaps\n", |
|
669 |
+ __FUNCTION__, __FILE__, __LINE__, L+1); |
|
670 |
+ dDefaultMgaps = DEFAULT_MGAPS; |
|
671 |
+ } |
|
672 |
+ else { |
|
673 |
+ for (l = 0; l < L; l++) { |
|
674 |
+ pfPercentGaps[l] = percent_gaps[l+FORTRAN_OFFSET]; |
|
675 |
+ } |
|
676 |
+ qsort(pfPercentGaps, L, sizeof(float), CompFltAsc); |
|
677 |
+ |
|
678 |
+ dDefaultMgaps = pfPercentGaps[(int)(HMM_MIN_LENGTH*L)]; |
|
679 |
+ if (dDefaultMgaps < DEFAULT_MGAPS){ |
|
680 |
+ //printf("Mgaps = %f <- %f\n", DEFAULT_MGAPS, dDefaultMgaps); |
|
681 |
+ dDefaultMgaps = DEFAULT_MGAPS; |
|
682 |
+ } |
|
683 |
+ else { |
|
684 |
+ //printf("Mgaps = %f\n", dDefaultMgaps); |
|
685 |
+ } |
|
686 |
+ |
|
687 |
+ free(pfPercentGaps); pfPercentGaps = NULL; |
|
688 |
+ } |
|
689 |
+ } |
|
690 |
+#endif /* tried to adapt Mgaps to size of final HMM */ |
|
691 |
+ |
|
692 |
+ // Throw out insert states and keep only match states |
|
693 |
+ i=0; |
|
694 |
+ for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';} |
|
695 |
+ for (l=1; l<=L; l++) |
|
696 |
+ { |
|
697 |
+#if TELOMERE_LOGIC |
|
698 |
+ float fMgaps = ALWAYS_ACCEPT; |
|
699 |
+ if ( (l < iTelomereLength) || (L-l < iTelomereLength) ){ |
|
700 |
+ /* residue is in telomere, always retain this position */ |
|
701 |
+ fMgaps = ALWAYS_ACCEPT; |
|
702 |
+ } |
|
703 |
+ else if (0){ |
|
704 |
+ /* FIXME: would like to put a transition phase in here, |
|
705 |
+ where the Mgap value gradually goes down from 100 to DEFAULT_MGAPS, |
|
706 |
+ however, may not be necessary and will make code more clunky */ |
|
707 |
+ } |
|
708 |
+ else { |
|
709 |
+ /* position is in centre of sequence, |
|
710 |
+ retain position if less than DEFAULT_MGAPS% gaps at this position, |
|
711 |
+ for example, if DEFAULT_MGAPS=30 throw out if more than 30% gap. |
|
712 |
+ conversely, if DEFAULT_MGAPS=100 throw out if more than 100% gaps, |
|
713 |
+ which can never happen, so always retain */ |
|
714 |
+ fMgaps = dDefaultMgaps; |
|
715 |
+ } |
|
716 |
+ if (percent_gaps[l] <= fMgaps) |
|
717 |
+#else /* this was telomere logic */ |
|
718 |
+ if (percent_gaps[l]<=float(par.Mgaps)) |
|
719 |
+#endif /* this was Soeding default */ |
|
720 |
+ { |
|
721 |
+ if (i>=/*MAXRES*/par.maxResLen-2) { |
|
722 |
+ if (v>=1) |
|
723 |
+ printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i); |
|
724 |
+ break; |
|
725 |
+ } |
|
726 |
+ i++; |
|
727 |
+ this->l[i]=l; |
|
728 |
+ for (k=0; k<N_in; k++) |
|
729 |
+ { |
|
730 |
+ if (keep[k]) |
|
731 |
+ { |
|
732 |
+ seq[k][h[k]++]=MatchChr(seq[k][l]); |
|
733 |
+ X[k][i]=X[k][l]; |
|
734 |
+ I[k][i]=0; |
|
735 |
+ } |
|
736 |
+ else if (k==kss_dssp || k==kss_pred) |
|
737 |
+ { |
|
738 |
+ seq[k][h[k]++]=MatchChr(seq[k][l]); |
|
739 |
+ X[k][i]=ss2i(seq[k][l]); |
|
740 |
+ } |
|
741 |
+ else if (k==ksa_dssp) |
|
742 |
+ { |
|
743 |
+ seq[k][h[k]++]=MatchChr(seq[k][l]); |
|
744 |
+ X[k][i]=sa2i(seq[k][l]); |
|
745 |
+ } |
|
746 |
+ else if (k==kss_conf) |
|
747 |
+ { |
|
748 |
+ seq[k][h[k]++]=seq[k][l]; |
|
749 |
+ X[k][i]=cf2i(seq[k][l]); |
|
750 |
+ } |
|
751 |
+ } |
|
752 |
+ } |
|
753 |
+ else |
|
754 |
+ { |
|
755 |
+ for (k=0; k<N_in; k++) |
|
756 |
+ if (keep[k] && X[k][l]<GAP) |
|
757 |
+ { |
|
758 |
+ I[k][i]++; |
|
759 |
+ seq[k][h[k]++]=InsertChr(seq[k][l]); |
|
760 |
+ } |
|
761 |
+ } |
|
762 |
+ } |
|
763 |
+ for (k=0; k<N_in; k++) seq[k][h[k]]='\0'; |
|
764 |
+ |
|
765 |
+ //printf("%d\t%d\t%d\tN/L/M\n", N_in, L, i); /* -------- FIXME */ |
|
766 |
+ |
|
767 |
+ if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n"; |
|
768 |
+ L = i; //Number of match states |
|
769 |
+ |
|
770 |
+ delete[] percent_gaps; percent_gaps = NULL; |
|
771 |
+ break; |
|
772 |
+ |
|
773 |
+ |
|
774 |
+ //////////////////////////////////////////////////////////////////////// |
|
775 |
+ // Using residues of first sequence as match states |
|
776 |
+ case 3: |
|
777 |
+ /* Note: allocating statically is fine most of the time |
|
778 |
+ but when the sequences/profiles get really long |
|
779 |
+ we might run out of memory, so must really do it dynamically. |
|
780 |
+ had to move declaration of float *percent_gaps out of switch() |
|
781 |
+ */ |
|
782 |
+ //char match_state[MAXCOL]; //1: column assigned to match state 0: insert state |
|
783 |
+ match_state = new(char[par.maxColCnt]); |
|
784 |
+ |
|
785 |
+ // Determine number of columns L in alignment |
|
786 |
+ L=strlen(seq[0]+1); |
|
787 |
+ if (v>=3) printf("Length of first seq = %i\n",L); |
|
788 |
+ // Check for sequences with unequal lengths |
|
789 |
+ for (k=1; k<N_in; k++) |
|
790 |
+ if (int(strlen(seq[k]+1))!=L) {unequal_lengths=k; break;} |
|
791 |
+ if (unequal_lengths) break; |
|
792 |
+ |
|
793 |
+ // Determine match states: seq kfirst has residue at pos l -> match state |
|
794 |
+ for (l=1; l<=L; l++) |
|
795 |
+ if (isalpha(seq[kfirst][l])) match_state[l]=1; else match_state[l]=0; |
|
796 |
+ // Throw out insert states and keep only match states |
|
797 |
+ for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';} |
|
798 |
+ i=0; |
|
799 |
+ for (l=1; l<=L; l++) |
|
800 |
+ { |
|
801 |
+ if (match_state[l]) // does sequence 0 have residue at position l? |
|
802 |
+ { |
|
803 |
+ if (i>=/*MAXRES*/par.maxResLen-2) { |
|
804 |
+ if (v>=1) |
|
805 |
+ printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i); |
|
806 |
+ break; |
|
807 |
+ } |
|
808 |
+ i++; |
|
809 |
+ this->l[i]=l; |
|
810 |
+ for (k=0; k<N_in; k++) |
|
811 |
+ { |
|
812 |
+ if (keep[k]) |
|
813 |
+ { |
|
814 |
+ seq[k][h[k]++]=MatchChr(seq[k][l]); |
|
815 |
+ X[k][i]=aa2i(seq[k][l]); |
|
816 |
+ I[k][i]=0; |
|
817 |
+ } |
|
818 |
+ else if (k==kss_dssp || k==kss_pred) |
|
819 |
+ { |
|
820 |
+ seq[k][h[k]++]=MatchChr(seq[k][l]); |
|
821 |
+ X[k][i]=ss2i(seq[k][l]); |
|
822 |
+ } |
|
823 |
+ else if (k==ksa_dssp) |
|
824 |
+ { |
|
825 |
+ seq[k][h[k]++]=MatchChr(seq[k][l]); |
|
826 |
+ X[k][i]=sa2i(seq[k][l]); |
|
827 |
+ } |
|
828 |
+ else if (k==kss_conf) |
|
829 |
+ { |
|
830 |
+ seq[k][h[k]++]=seq[k][l]; |
|
831 |
+ X[k][i]=cf2i(seq[k][l]); |
|
832 |
+ } |
|
833 |
+ } |
|
834 |
+ } |
|
835 |
+ else |
|
836 |
+ { |
|
837 |
+ for (k=0; k<N_in; k++) |
|
838 |
+ if (keep[k] && aa2i(seq[k][l])<GAP) |
|
839 |
+ { |
|
840 |
+ I[k][i]++; |
|
841 |
+ seq[k][h[k]++]=InsertChr(seq[k][l]); |
|
842 |
+ } |
|
843 |
+ } |
|
844 |
+ } |
|
845 |
+ for (k=0; k<N_in; k++) seq[k][h[k]]='\0'; |
|
846 |
+ |
|
847 |
+ //Replace GAP with ENDGAP for all end gaps /* MR1 */ |
|
848 |
+ for (k=0; k<N_in; ++k) |
|
849 |
+ { |
|
850 |
+ if (!keep[k]) continue; |
|
851 |
+ for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1, note i++ <- ++i */ |
|
852 |
+ for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */ |
|
853 |
+ } |
|
854 |
+ |
|
855 |
+ if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n"; |
|
856 |
+ L = i; //Number of match states |
|
857 |
+ |
|
858 |
+ delete[] match_state; match_state = NULL; |
|
859 |
+ break; |
|
860 |
+ |
|
861 |
+ } //end switch() |
|
862 |
+ /////////////////////////////////////////////////////////////////////////// |
|
863 |
+ |
|
864 |
+ |
|
865 |
+ // Error |
|
866 |
+ if (unequal_lengths) |
|
867 |
+ { |
|
868 |
+ strcut(sname[unequal_lengths]); |
|
869 |
+ cerr<<endl<<"Error: sequences in "<<infile<<" do not all have the same number of columns, \ne.g. first sequence and sequence "<<sname[unequal_lengths]<<".\n"; |
|
870 |
+ if(par.M==1) cerr<<".\nCheck input format for '-M a2m' option and consider using '-M first' or '-M 50'\n"; |
|
871 |
+ throw 1; |
|
872 |
+ } |
|
873 |
+ |
|
874 |
+ // Avert user about -cons option? |
|
875 |
+ if (v>=2 && !par.cons) |
|
876 |
+ { |
|
877 |
+ for (i=1; i<=L; i++) |
|
878 |
+ if (X[kfirst][i]==GAP) |
|
879 |
+ { |
|
880 |
+ printf("NOTE: Use the '-cons' option to calculate a consensus sequence as first sequence of the alignment.\n"); |
|
881 |
+ break; |
|
882 |
+ } |
|
883 |
+ } |
|
884 |
+ /* MR1 |
|
885 |
+ //Replace GAP with ENDGAP for all end gaps |
|
886 |
+ for (k=0; k<N_in; k++) |
|
887 |
+ { |
|
888 |
+ if (!keep[k]) continue; |
|
889 |
+ for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; |
|
890 |
+ for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; |
|
891 |
+ }*/ |
|
892 |
+ |
|
893 |
+ // DEBUG |
|
894 |
+ if (v>=4) |
|
895 |
+ for (k=0; k<N_in; k++) |
|
896 |
+ { |
|
897 |
+ if (!display[k]) continue; |
|
898 |
+ cout<<">"<<sname[k]<<"\n"; |
|
899 |
+ if (k==kss_dssp || k==kss_pred) {for (i=1; i<=L; i++) cout<<char(i2ss(X[k][i]));} |
|
900 |
+ else if (k==kss_conf) {for (i=1; i<=L; i++) cout<<char(i2cf(X[k][i]));} |
|
901 |
+ else if (k==ksa_dssp) {for (i=1; i<=L; i++) cout<<char(i2sa(X[k][i]));} |
|
902 |
+ else |
|
903 |
+ { |
|
904 |
+ for (i=1; i<=L; i++) cout<<char(i2aa(X[k][i])); |
|
905 |
+ cout<<"\n"; |
|
906 |
+ for (i=1; i<=L; i++) |
|
907 |
+ if (I[k][i]==0) cout<<"-"; else if (I[k][i]>9) cout<<"X"; else cout<<I[k][i]; |
|
908 |
+ } |
|
909 |
+ cout<<"\n"; |
|
910 |
+ } |
|
911 |
+ |
|
912 |
+ delete[](h); h = NULL; |
|
913 |
+} |
|
914 |
+ |
|
915 |
+ |
|
916 |
+/** |
|
917 |
+ * @brief Remove sequences with seq. identity larger than seqid percent |
|
918 |
+ *(remove the shorter of two) or coverage<cov_thr |
|
919 |
+ * |
|
920 |
+ * FIXME: originally max_seqid is a variable that is the cutoff |
|
921 |
+ * above which sequences are thrown out. We want to throw out sequences |
|
922 |
+ * when building the HMM but not for display, there we want to keep all. |
|
923 |
+ * This should be really easy, but there is some hidden stuff going on |
|
924 |
+ * in this function, so I did a minimal-invasive change and just stuck |
|
925 |
+ * (effectively) a hard-wired 100 instead of the variable. |
|
926 |
+ * At a later stage we should get rid of this function alltogether |
|
927 |
+ * as it does gobble up some time (and is quadratic in noof sequences, I think) |
|
928 |
+ * FS, 2010-10-04 |
|
929 |
+ */ |
|
930 |
+//////////////////////////////////////////////////////////////////////////// |
|
931 |
+/* |
|
932 |
+ */ |
|
933 |
+inline int |
|
934 |
+Alignment::FilterForDisplay(int max_seqid, int coverage, int qid, float qsc, int N) |
|
935 |
+{ |
|
936 |
+ |
|
937 |
+ /* FIXME |
|
938 |
+ * by just returning n_display and not doing anything |
|
939 |
+ * I think we display everything and not do any work for it |
|
940 |
+ */ |
|
941 |
+ return n_display; /* FS, 2010-10-04*/ |
|
942 |
+ |
|
943 |
+ |
|
944 |
+ if (par.mark) return n_display; |
|
945 |
+ char *dummy = new(char[N_in+1]); |
|
946 |
+ int vtmp=v, seqid; |
|
947 |
+ v=0; |
|
948 |
+ n_display=0; |
|
949 |
+ if (kss_dssp>=0) display[kss_dssp]=KEEP_NOT; |
|
950 |
+ if (ksa_dssp>=0) display[ksa_dssp]=KEEP_NOT; |
|
951 |
+ if (kss_pred>=0) display[kss_pred]=KEEP_NOT; |
|
952 |
+ if (kss_conf>=0) display[kss_conf]=KEEP_NOT; |
|
953 |
+ for (seqid=imin(10,max_seqid); n_display<N && seqid<=max_seqid; seqid++) |
|
954 |
+ { |
|
955 |
+ for (int k=0; k<N_in; k++) dummy[k]=display[k]; |
|
956 |
+ n_display = Filter2(dummy,coverage,qid,qsc,20,seqid,0); |
|
957 |
+ // printf("Seqid=%3i n_display=%4i\n",seqid,n_display); |
|
958 |
+ } |
|
959 |
+ if (n_display>N) |
|
960 |
+ { |
|
961 |
+ for (int k=0; k<N_in; k++) dummy[k]=display[k]; |
|
962 |
+ n_display = Filter2(dummy,coverage,qid,qsc,20,--(--seqid),0); |
|
963 |
+ } |
|
964 |
+ v=vtmp; |
|
965 |
+ for (int k=0; k<N_in; k++) display[k]=dummy[k]; |
|
966 |
+ if (kss_dssp>=0) {display[kss_dssp]=KEEP_CONDITIONALLY; n_display++;} |
|
967 |
+ if (ksa_dssp>=0) {display[ksa_dssp]=KEEP_CONDITIONALLY; n_display++;} |
|
968 |
+ if (kss_pred>=0) {display[kss_pred]=KEEP_CONDITIONALLY; n_display++;} |
|
969 |
+ if (kss_conf>=0) {display[kss_conf]=KEEP_CONDITIONALLY; n_display++;} |
|
970 |
+ delete[] dummy; dummy = NULL; |
|
971 |
+ return n_display; |
|
972 |
+} |
|
973 |
+ |
|
974 |
+///////////////////////////////////////////////////////////////////////////////////// |
|
975 |
+// Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) or coverage<cov_thr |
|
976 |
+///////////////////////////////////////////////////////////////////////////////////// |
|
977 |
+inline int Alignment::Filter(int max_seqid, int coverage, int qid, float qsc, int N) |
|
978 |
+{ |
|
979 |
+ return Filter2(keep,coverage,qid,qsc,20,max_seqid,N); |
|
980 |
+} |
|
981 |
+ |
|
982 |
+///////////////////////////////////////////////////////////////////////////// |
|
983 |
+/* |
|
984 |
+ * @brief Select set of representative sequences in the multiple sequence alignment |
|
985 |
+ * |
|
986 |
+ * Filter criteria: |
|
987 |
+ * Remove sequences with coverage of query less than "coverage" percent |
|
988 |
+ * Remove sequences with sequence identity to query of less than "qid" percent |
|
989 |
+ * If Ndiff==0, remove sequences with seq. identity larger than seqid2(=max_seqid) percent |
|
990 |
+ * If Ndiff>0, remove sequences with minimum-sequence-identity filter of between seqid1 |
|
991 |
+ * and seqid2 (%), where the minimum seqid threshold is determined such that, |
|
992 |
+ * in all column blocks of at least WMIN=25 residues, at least Ndiff sequences are left. |
|
993 |
+ * This ensures that in multi-domain proteins sequences covering one domain are not |
|
994 |
+ * removed completely because sequences covering other domains are more diverse. |
|
995 |
+ * |
|
996 |
+ * Allways the shorter of two compared sequences is removed (=> sort sequences by length first). |
|