Browse code

Multiple fixes; final comit before release; version number bumped to 0.99.6

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102514 bc3139a8-67e5-0310-9ffc-ced21a209358

Ulrich Bodenhofer authored on 15/04/2015 13:20:09
Showing1 changed files
... ...
@@ -77,15 +77,15 @@ Alignment::Alignment(int maxseq, int maxres)
77 77
 
78 78
     //printf(">>>>>>>>%s:%s:%d: maxseq=%d, maxres=%d\n", __FUNCTION__, __FILE__, __LINE__, maxseq, maxres); /* (FS) */
79 79
   longname = new(char[DESCLEN]);
80
-  sname = new(char*[maxseq+2]); /* MR1 */
81
-  seq = new(char*[maxseq+2]); /* MR1 */
82
-  l = new(int[maxres]);
83
-  X = new(char*[maxseq+2]);  /* MR1 */
84
-  I = new(short unsigned int*[maxseq+2]); /* MR1 */
85
-  keep = new(char[maxseq+2]); /* MR1 */
86
-  display = new(char[maxseq+2]); /* MR1 */
87
-  wg = new(float[maxseq+2]); /* MR1 */
88
-  nseqs = new(int[maxres+2]); /* MR1 */
80
+  sname = new char*[maxseq+2]; /* MR1 */
81
+  seq = new char*[maxseq+2]; /* MR1 */
82
+  l = new int[maxres];
83
+  X = new char*[maxseq+2];  /* MR1 */
84
+  I = new short unsigned int*[maxseq+2]; /* MR1 */
85
+  keep = new char[maxseq+2]; /* MR1 */
86
+  display = new char[maxseq+2]; /* MR1 */
87
+  wg = new float[maxseq+2]; /* MR1 */
88
+  nseqs = new int[maxres+2]; /* MR1 */
89 89
   N_in=L=0;
90 90
   nres=NULL; // number of residues per sequence k
91 91
   first=NULL; // first residue in sequence k
... ...
@@ -138,7 +138,7 @@ Alignment::Read(FILE* inf, char infile[], char* firstline)
138 138
   int k; // Index of sequence being read currently (first=0)
139 139
   char line[LINELEN]=""; // input line
140 140
   //char cur_seq[MAXCOL]; // Sequence currently read in
141
-  char *cur_seq=new(char[par.maxColCnt]);
141
+  char *cur_seq=new char[par.maxColCnt];
142 142
   char* cur_name; // Sequence currently read in
143 143
   int linenr=0; // current line number in input file
144 144
   char skip_sequence=0;
... ...
@@ -180,11 +180,11 @@ Alignment::Read(FILE* inf, char infile[], char* firstline)
180 180
                               }
181 181
 
182 182
                           // Create space for residues and paste new sequence in
183
-                          seq[k]=new(char[strlen(cur_seq)+2]);
183
+                          seq[k]=new char[strlen(cur_seq)+2];
184 184
                           if (!seq[k]) MemoryError("array for input sequences");
185
-                          X[k]=new(char[strlen(cur_seq)+2]);
185
+                          X[k]=new char[strlen(cur_seq)+2];
186 186
                           if (!X[k]) MemoryError("array for input sequences");
187
-                          I[k]=new(short unsigned int[strlen(cur_seq)+2]);
187
+                          I[k]=new short unsigned int[strlen(cur_seq)+2];
188 188
                           if (!I[k]) MemoryError("array for input sequences");
189 189
                           strcpy(seq[k],cur_seq);
190 190
                       }
... ...
@@ -233,7 +233,7 @@ Alignment::Read(FILE* inf, char infile[], char* firstline)
233 233
 
234 234
                   // store sequence name
235 235
                   if (v>=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]);
236
-                  sname[k] = new(char[strlen(cur_name)+1]);
236
+                  sname[k] = new char[strlen(cur_name)+1];
237 237
                   if (!sname[k]) {MemoryError("array for sequence names");}
238 238
                   strcpy(sname[k],cur_name);
239 239
               } // end if(line contains sequence name)
... ...
@@ -348,11 +348,11 @@ Alignment::Read(FILE* inf, char infile[], char* firstline)
348 348
 
349 349
   if (k>=0) //if at least one sequence was read in
350 350
       {
351
-          seq[k]=new(char[strlen(cur_seq)+2]);
351
+          seq[k]=new char[strlen(cur_seq)+2];
352 352
           if (!seq[k]) MemoryError("array for input sequences");
353
-          X[k]=new(char[strlen(cur_seq)+2]);
353
+          X[k]=new char[strlen(cur_seq)+2];
354 354
           if (!X[k]) MemoryError("array for input sequences");
355
-          I[k]=new(short unsigned int[strlen(cur_seq)+2]);
355
+          I[k]=new short unsigned int[strlen(cur_seq)+2];
356 356
           if (!I[k]) MemoryError("array for input sequences");
357 357
           strcpy(seq[k],cur_seq);
358 358
       }
... ...
@@ -420,7 +420,7 @@ Alignment::Compress(const char infile[])
420 420
     /*static short unsigned int h[MAXSEQ];*/
421 421
     /*short*/ unsigned int *h = NULL; /* short may lead to overflow for long alignments, FS, r235 -> r236 */
422 422
 
423
-    h = new(/*short*/ unsigned int[N_in+2]); /* short -> overflow, FS, r235 -> r236 */
423
+    h = new /*short*/ unsigned int[N_in+2]; /* short -> overflow, FS, r235 -> r236 */
424 424
     float *percent_gaps = NULL; /* FS, 2010-Nov */
425 425
     char *match_state = NULL;  /* FS, 2010-Nov */
426 426
 
... ...
@@ -552,13 +552,13 @@ Alignment::Compress(const char infile[])
552 552
                had to move declaration of float *percent_gaps out of switch()
553 553
             */
554 554
             //float percent_gaps[MAXCOL]; //percentage of gaps in column k (with weighted sequences)
555
-            percent_gaps = new(float[par.maxColCnt]);
555
+            percent_gaps = new float[par.maxColCnt];
556 556
 
557 557
             //determine number of columns L in alignment
558 558
             L=strlen(seq[kfirst])-1;
559 559
 
560 560
             // Conversion to integer representation, checking for unequal lengths and initialization
561
-            if (nres==NULL) nres=new(int[N_in]);
561
+            if (nres==NULL) nres=new int[N_in];
562 562
             for (k=0; k<N_in; k++)
563 563
                 {
564 564
                     if (!keep[k]) continue;
... ...
@@ -780,7 +780,7 @@ Alignment::Compress(const char infile[])
780 780
                had to move declaration of float *percent_gaps out of switch()
781 781
             */
782 782
             //char match_state[MAXCOL]; //1: column assigned to match state 0: insert state
783
-            match_state = new(char[par.maxColCnt]);
783
+            match_state = new char[par.maxColCnt];
784 784
 
785 785
             // Determine number of columns L in alignment
786 786
             L=strlen(seq[0]+1);
... ...
@@ -942,7 +942,7 @@ Alignment::FilterForDisplay(int max_seqid, int coverage, int qid, float qsc, int
942 942
 
943 943
 
944 944
     if (par.mark) return n_display;
945
-    char *dummy = new(char[N_in+1]);
945
+    char *dummy = new char[N_in+1];
946 946
     int vtmp=v, seqid;
947 947
     v=0;
948 948
     n_display=0;
... ...
@@ -1005,12 +1005,12 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
1005 1005
 {
1006 1006
     // In the beginnning, keep[k] is 1 for all regular amino acid sequences and 0 for all others (ss_conf, ss_pred,...)
1007 1007
     // In the end, keep[k] will be 1 for all regular representative sequences kept in the alignment, 0 for all others
1008
-    char* in=new(char[N_in+1]); // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid
1009
-    char* inkk=new(char[N_in+1]); // inkk[k]=1 iff in[ksort[k]]=1 else 0;
1010
-    int* Nmax=new(int[L+2]); // position-dependent maximum-sequence-identity threshold for filtering /* MR1, used to be called idmax*/
1011
-    int* idmaxwin=new(int[L+2]); // minimum value of idmax[i-WFIL,i+WFIL]
1012
-    int* seqid_prev=new(int[N_in+1]); // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid)
1013
-    int* N=new(int[L+2]); // N[i] number of already accepted sequences at position i
1008
+    char* in=new char[N_in+1]; // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid
1009
+    char* inkk=new char[N_in+1]; // inkk[k]=1 iff in[ksort[k]]=1 else 0;
1010
+    int* Nmax=new int[L+2]; // position-dependent maximum-sequence-identity threshold for filtering /* MR1, used to be called idmax*/
1011
+    int* idmaxwin=new int[L+2]; // minimum value of idmax[i-WFIL,i+WFIL]
1012
+    int* seqid_prev=new int[N_in+1]; // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid)
1013
+    int* N=new int[L+2]; // N[i] number of already accepted sequences at position i
1014 1014
     const int WFIL=25; // see previous line
1015 1015
 
1016 1016
     int diffNmax=Ndiff;       // current  maximum difference of Nmax[i] and Ndiff /* MR1 */
... ...
@@ -1039,8 +1039,8 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
1039 1039
     // Determine first[k], last[k]?
1040 1040
     if (first==NULL)
1041 1041
         {
1042
-            first=new(int[N_in]);// first non-gap position in sequence k
1043
-            last =new(int[N_in]);// last  non-gap position in sequence k
1042
+            first=new int[N_in];// first non-gap position in sequence k
1043
+            last =new int[N_in];// last  non-gap position in sequence k
1044 1044
             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1045 1045
                 {
1046 1046
                     for (i=1; i<=L; i++) if (X[k][i]<NAA) break;
... ...
@@ -1054,7 +1054,7 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
1054 1054
     if ( (nres==NULL)  || (sizeof(nres)<N_in*sizeof(int)) )
1055 1055
         {
1056 1056
     	 if (nres) delete[] nres;
1057
-    		nres=new(int[N_in]);
1057
+    		nres=new int[N_in];
1058 1058
             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1059 1059
                 {
1060 1060
                     int nr=0;
... ...
@@ -1068,7 +1068,7 @@ Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, in
1068 1068
     // Sort sequences according to length; afterwards, nres[ksort[kk]] is sorted by size
1069 1069
     if (ksort==NULL)
1070 1070
         {
1071
-            ksort=new(int[N_in]); // never reuse alignment object for new alignment with more sequences
1071
+            ksort=new int[N_in]; // never reuse alignment object for new alignment with more sequences
1072 1072
             for (k=0; k<N_in; k++) ksort[k]=k;
1073 1073
             QSortInt(nres,ksort,kfirst+1,N_in-1,-1); //Sort sequences after kfirst (query) in descending order
1074 1074
         }
... ...
@@ -1311,7 +1311,7 @@ Alignment::HomologyFilter(int coverage_core, float qsc_core, float coresc)
1311 1311
     const int Ndiff_core=0;
1312 1312
     int n;
1313 1313
     HMM qcore;
1314
-    char* coreseq=new(char[N_in]); // coreseq[k]=1 if sequence belongs to core of alignment (i.e. it is very similar to query)
1314
+    char* coreseq=new char[N_in]; // coreseq[k]=1 if sequence belongs to core of alignment (i.e. it is very similar to query)
1315 1315
     for (int k=0; k<N_in; k++) coreseq[k]=keep[k]; // Copy keep[] into coreseq[]
1316 1316
 
1317 1317
     // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
... ...
@@ -1362,7 +1362,7 @@ Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore)
1362 1362
     int i; // column in query alignment
1363 1363
     int a; // amino acid (0..19)
1364 1364
     int n=1; // number of sequences that passed filter
1365
-    float** logodds=new(float*[L+1]); // log-odds ratios for HMM qcore
1365
+    float** logodds=new float*[L+1]; // log-odds ratios for HMM qcore
1366 1366
     char gap; // 1: previous state in seq k was a gap 0: previous state in seq k was an amino acid
1367 1367
     float score; // score of sequence k aligned with qcore
1368 1368
 
... ...
@@ -1371,8 +1371,8 @@ Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore)
1371 1371
     // Determine first[k], last[k]?
1372 1372
     if (first==NULL)
1373 1373
         {
1374
-            first=new(int[N_in]);// first non-gap position in sequence k
1375
-            last =new(int[N_in]);// last non-gap position in sequence k
1374
+            first=new int[N_in];// first non-gap position in sequence k
1375
+            last =new int[N_in];// last non-gap position in sequence k
1376 1376
             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1377 1377
                 {
1378 1378
                     for (i=1; i<=L; i++) if (X[k][i]<NAA) break;
... ...
@@ -1385,7 +1385,7 @@ Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore)
1385 1385
     // Determine number of residues nres[k]?
1386 1386
     if (nres==NULL)
1387 1387
         {
1388
-            nres=new(int[N_in]);
1388
+            nres=new int[N_in];
1389 1389
             for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1390 1390
                 {
1391 1391
                     int nr=0;
... ...
@@ -1667,18 +1667,18 @@ Alignment::FrequenciesAndTransitions(HMM& q, char* in)
1667 1667
                     q.sname[q.ncons]=new(char[10]);
1668 1668
                     if (!q.sname[q.ncons]) {MemoryError("array of names for displayed sequences");}
1669 1669
                     strcpy(q.sname[q.ncons],"Consensus");
1670
-                    q.seq[q.ncons]=new(char[L+2]);
1670
+                    q.seq[q.ncons]=new char[L+2];
1671 1671
                     if (!q.seq[q.ncons]) {MemoryError("array of names for displayed sequences");}
1672 1672
                 }
1673 1673
             if (par.cons)
1674 1674
                 {
1675 1675
                     // Reserve space for consensus sequence as first sequence in alignment
1676 1676
                     q.nfirst=n++; kfirst=-1;
1677
-                    q.sname[q.nfirst]=new(char[strlen(name)+11]);
1677
+                    q.sname[q.nfirst]=new char[strlen(name)+11];
1678 1678
                     if (!q.sname[q.nfirst]) {MemoryError("array of names for displayed sequences");}
1679 1679
                     strcpy(q.sname[q.nfirst],name);
1680 1680
                     strcat(q.sname[q.nfirst],"_consensus");
1681
-                    q.seq[q.nfirst]=new(char[L+2]);
1681
+                    q.seq[q.nfirst]=new char[L+2];
1682 1682
                     if (!q.seq[q.nfirst]) {MemoryError("array of names for displayed sequences");}
1683 1683
                 }
1684 1684
             // Calculate consensus amino acids using similarity matrix
... ...
@@ -1736,10 +1736,10 @@ Alignment::FrequenciesAndTransitions(HMM& q, char* in)
1736 1736
                     else if (k==kfirst) nn=q.nfirst=n++;
1737 1737
                     else nn=n++;
1738 1738
                     // strcut(sname[k]," "); // delete rest of name line beginning with two spaces " " // Why this?? Problem for pdb seqs without chain
1739
-                    q.sname[nn]=new(char[strlen(sname[k])+1]);
1739
+                    q.sname[nn]=new char[strlen(sname[k])+1];
1740 1740
                     if (!q.sname[nn]) {MemoryError("array of names for displayed sequences");}
1741 1741
                     strcpy(q.sname[nn],sname[k]);
1742
-                    q.seq[nn]=new(char[strlen(seq[k])+1]);
1742
+                    q.seq[nn]=new char[strlen(seq[k])+1];
1743 1743
                     if (!q.seq[nn]) {MemoryError("array of names for displayed sequences");}
1744 1744
                     strcpy(q.seq[nn],seq[k]);
1745 1745
                 }
... ...
@@ -1826,14 +1826,14 @@ Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in)
1826 1826
   //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
1827 1827
   float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i
1828 1828
   //float Neff[MAXRES]; // diversity of subalignment i
1829
-  float *Neff = new(float[par.maxResLen]); // diversity of subalignment i
1829
+  float *Neff = new float[par.maxResLen]; // diversity of subalignment i
1830 1830
   int nseqi=0; // number of sequences in subalignment i
1831 1831
   int ncol=0; // number of columns j that contribute to Neff[i]
1832 1832
   char change; // has the set of sequences in subalignment changed? 0:no 1:yes
1833 1833
   float fj[NAA+3]; // to calculate entropy
1834 1834
   float sum;
1835 1835
 
1836
-  wi = new(float[N_in+2]);
1836
+  wi = new float[N_in+2];
1837 1837
 
1838 1838
   // Global weights?
1839 1839
   if (par.wg==1)
... ...
@@ -1842,7 +1842,7 @@ Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in)
1842 1842
   // Initialization
1843 1843
   q.Neff_HMM=0.0f;
1844 1844
   Neff[0]=0.0; // if the first column has no residues (i.e. change==0), Neff[i]=Neff[i-1]=Neff[0]
1845
-  n = new(int*[L+2]);
1845
+  n = new int*[L+2];
1846 1846
   for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
1847 1847
   for (j=1; j<=L; j++)
1848 1848
     for (a=0; a<NAA+3; a++) n[j][a]=0;
... ...
@@ -2052,7 +2052,7 @@ Alignment::Transitions_from_I_state(HMM& q, char* in)
2052 2052
     //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
2053 2053
     float *wi = NULL; // weight of sequence k in column i, calculated from subalignment i
2054 2054
     //float Neff[MAXRES]; // diversity of subalignment i
2055
-    float *Neff = new(float[par.maxResLen]); // diversity of subalignment i
2055
+    float *Neff = new float[par.maxResLen]; // diversity of subalignment i
2056 2056
     int nseqi; // number of sequences in subalignment i
2057 2057
     int ncol; // number of columns j that contribute to Neff[i]
2058 2058
     float fj[NAA+3]; // to calculate entropy
... ...
@@ -2060,7 +2060,7 @@ Alignment::Transitions_from_I_state(HMM& q, char* in)
2060 2060
     float Nlim=0.0; // only for global weights
2061 2061
     float scale=0.0; // only for global weights
2062 2062
 
2063
-    wi = new(float[N_in+2]);
2063
+    wi = new float[N_in+2];
2064 2064
 
2065 2065
     // Global weights?
2066 2066
     if (par.wg==1)
... ...
@@ -2071,7 +2071,7 @@ Alignment::Transitions_from_I_state(HMM& q, char* in)
2071 2071
         }
2072 2072
 
2073 2073
     // Initialization
2074
-    n = new(int*[L+2]);
2074
+    n = new int*[L+2];
2075 2075
     for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2076 2076
 
2077 2077
     //////////////////////////////////////////////////////////////////////////////////////////////
... ...
@@ -2254,7 +2254,7 @@ Alignment::Transitions_from_D_state(HMM& q, char* in)
2254 2254
     //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
2255 2255
     float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i
2256 2256
     //float Neff[MAXRES]; // diversity of subalignment i 
2257
-    float *Neff = new(float[par.maxResLen]); // diversity of subalignment i 
2257
+    float *Neff = new float[par.maxResLen]; // diversity of subalignment i
2258 2258
     int nseqi=0; // number of sequences in subalignment i (for DEBUGGING)
2259 2259
     int ncol=0; // number of columns j that contribute to Neff[i]
2260 2260
     char change; // has the set of sequences in subalignment changed? 0:no 1:yes
... ...
@@ -2263,7 +2263,7 @@ Alignment::Transitions_from_D_state(HMM& q, char* in)
2263 2263
     float Nlim=0.0; // only for global weights
2264 2264
     float scale=0.0; // only for global weights
2265 2265
 
2266
-    wi = new(float[N_in+2]); /* FIXME: FS */
2266
+    wi = new float[N_in+2]; /* FIXME: FS */
2267 2267
     // Global weights?
2268 2268
     if (par.wg==1)
2269 2269
         {
... ...
@@ -2273,7 +2273,7 @@ Alignment::Transitions_from_D_state(HMM& q, char* in)
2273 2273
         }
2274 2274
 
2275 2275
     // Initialization
2276
-    n = new(int*[L+2]);
2276
+    n = new int*[L+2];
2277 2277
     for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2278 2278
     for (j=1; j<=L; j++)
2279 2279
         for (a=0; a<NAA+3; a++) n[j][a]=0;
... ...
@@ -2544,7 +2544,7 @@ Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[])
2544 2544
  N_filtered = Tali.Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
2545 2545
 
2546 2546
  // Record imatch[j]
2547
- int* imatch=new(int[hit.j2+1]);
2547
+ int* imatch=new int[hit.j2+1];
2548 2548
  int step = hit.nsteps;
2549 2549
  for (j=hit.j1; j<=hit.j2; j++)
2550 2550
  {
... ...
@@ -2676,7 +2676,7 @@ Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[])
2676 2676
  iprev=i; lprev=l;
2677 2677
  if (h>=maxcol-1000) // too few columns? Reserve double space
2678 2678
  {
2679
- char* new_seq=new(char[2*maxcol]);
2679
+ char* new_seq=new char[2*maxcol];
2680 2680
  strncpy(new_seq,cur_seq,maxcol); //////// check: maxcol-1 ????
2681 2681
  delete[](cur_seq); (cur_seq) = NULL;
2682 2682
  cur_seq=new_seq;
... ...
@@ -2689,14 +2689,14 @@ Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[])
2689 2689
  cur_seq[h++]='\0';
2690 2690
 
2691 2691
  keep[N_in] = display[N_in] = KEEP_CONDITIONALLY;
2692
- seq[N_in]=new(char[h]);
2692
+ seq[N_in]=new char[h];
2693 2693
  if (!seq[N_in]) MemoryError("array for input sequences");
2694 2694
  strcpy(seq[N_in],cur_seq);
2695
- X[N_in]=new(char[h]);
2695
+ X[N_in]=new char[h];
2696 2696
  if (!X[N_in]) MemoryError("array for input sequences");
2697
- I[N_in]=new(short unsigned int[h]);
2697
+ I[N_in]=new short unsigned int[h];
2698 2698
  if (!I[N_in]) MemoryError("array for input sequences");
2699
- sname[N_in]=new(char[strlen(Tali.sname[k])+1]);
2699
+ sname[N_in]=new char[strlen(Tali.sname[k])+1];
2700 2700
  if (!sname[N_in]) MemoryError("array for input sequences");
2701 2701
  strcpy(sname[N_in],Tali.sname[k]);
2702 2702
  N_in++;
... ...
@@ -2727,7 +2727,7 @@ Alignment::AddSequence(char Xk[], int Ik[])
2727 2727
 {
2728 2728
  int i; // position in query and target
2729 2729
  if (L<=0) InternalError("L is not set in AddSequence()");
2730
- X[N_in]=new(char[L+2]);
2730
+ X[N_in]=new char[L+2];
2731 2731
  for (i=0; i<=L+1; i++) X[N_in][i]=Xk[i];
2732 2732
  if (Ik==NULL)
2733 2733
  for (i=0; i<=L+1; i++) I[N_in][i]=0;
... ...
@@ -2777,7 +2777,7 @@ Alignment::GetPositionSpecificWeights(float* w[])
2777 2777
  {
2778 2778
 
2779 2779
  // Initialization
2780
- n = new(int*[L+2]);
2780
+ n = new int*[L+2];
2781 2781
  for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2782 2782
  for (j=1; j<=L; j++)
2783 2783
  for (a=0; a<NAA+3; a++) n[j][a]=0;
... ...
@@ -2894,9 +2894,9 @@ Alignment::Transfer(char **ppcProf, int iCnt){
2894 2894
     /* @<allocate memory for sequences etc@> */
2895 2895
     for (k = 0; k < iCnt; k++){
2896 2896
 #define GOOD_MEASURE 1000 /* Temporary -- can be removed once rest in place */
2897
-        I[k] = new(short unsigned int[iLen+2+GOOD_MEASURE]);
2898
-        X[k] = new(char[iLen+2+GOOD_MEASURE]);
2899
-        seq[k] = new(char[iLen+2+GOOD_MEASURE]);
2897
+        I[k] = new short unsigned int[iLen+2+GOOD_MEASURE];
2898
+        X[k] = new char[iLen+2+GOOD_MEASURE];
2899
+        seq[k] = new char[iLen+2+GOOD_MEASURE];
2900 2900
         seq[k][0] = ' ';
2901 2901
         seq[k][1] = '\0';
2902 2902
         if (NULL == ppcProf[k]){
Browse code

add package to the repository

msa


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358

Sonali Arora authored on 10/04/2015 00:12:33
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,3029 @@
1
+/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
+
3
+/*********************************************************************
4
+ * Clustal Omega - Multiple sequence alignment
5
+ *
6
+ * Copyright (C) 2010 University College Dublin
7
+ *
8
+ * Clustal-Omega is free software; you can redistribute it and/or
9
+ * modify it under the terms of the GNU General Public License as
10
+ * published by the Free Software Foundation; either version 2 of the
11
+ * License, or (at your option) any later version.
12
+ *
13
+ * This file is part of Clustal-Omega.
14
+ *
15
+ ********************************************************************/
16
+
17
+/*
18
+ *  RCS $Id: hhalignment-C.h 236 2011-04-14 11:30:04Z fabian $
19
+ */
20
+
21
+
22
+/*
23
+ * Changelog: Michael Remmert made changes to hhalign stand-alone code
24
+ * FS implemented some of the changes on 2010-10-28 -> MR1
25
+ *
26
+ * Note: MR seems to have changed all [aijk]++ to ++[aijk],
27
+ * FS did not do that on 2010-10-28
28
+ */
29
+
30
+// hhalignment.C
31
+
32
+//////////////////////////////////////////////////////////////////////////////
33
+//// Class Alignment
34
+//////////////////////////////////////////////////////////////////////////////
35
+
36
+// hhalignment.C
37
+
38
+#ifndef MAIN
39
+#define MAIN
40
+#include <iostream> // cin, cout, cerr
41
+#include <fstream> // ofstream, ifstream
42
+#include <stdio.h> // printf
43
+using std::cout;
44
+using std::cerr;
45
+using std::endl;
46
+using std::ios;
47
+using std::ifstream;
48
+using std::ofstream;
49
+#include <stdlib.h> // exit
50
+#include <string> // strcmp, strstr
51
+#include <math.h> // sqrt, pow
52
+#include <limits.h> // INT_MIN
53
+#include <float.h> // FLT_MIN
54
+#include <time.h> // clock
55
+#include <ctype.h> // islower, isdigit etc
56
+#include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
57
+#include "list.h" // list data structure
58
+#include "hash.h" // hash data structure
59
+#include "hhdecl-C.h"
60
+#include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
61
+#include "hhhmm.h"
62
+#endif
63
+
64
+
65
+enum {KEEP_NOT = 0, KEEP_CONDITIONALLY, KEEP_ALWAYS};
66
+
67
+//////////////////////////////////////////////////////////////////////////////
68
+// Class Alignment
69
+//////////////////////////////////////////////////////////////////////////////
70
+
71
+
72
+//////////////////////////////////////////////////////////////////////////////
73
+// Object constructor
74
+//////////////////////////////////////////////////////////////////////////////
75
+Alignment::Alignment(int maxseq, int maxres)
76
+{
77
+
78
+    //printf(">>>>>>>>%s:%s:%d: maxseq=%d, maxres=%d\n", __FUNCTION__, __FILE__, __LINE__, maxseq, maxres); /* (FS) */
79
+  longname = new(char[DESCLEN]);
80
+  sname = new(char*[maxseq+2]); /* MR1 */
81
+  seq = new(char*[maxseq+2]); /* MR1 */
82
+  l = new(int[maxres]);
83
+  X = new(char*[maxseq+2]);  /* MR1 */
84
+  I = new(short unsigned int*[maxseq+2]); /* MR1 */
85
+  keep = new(char[maxseq+2]); /* MR1 */
86
+  display = new(char[maxseq+2]); /* MR1 */
87
+  wg = new(float[maxseq+2]); /* MR1 */
88
+  nseqs = new(int[maxres+2]); /* MR1 */
89
+  N_in=L=0;
90
+  nres=NULL; // number of residues per sequence k
91
+  first=NULL; // first residue in sequence k
92
+  last=NULL; // last residue in sequence k
93
+  ksort=NULL; // sequence indices sorted by descending nres[k]
94
+  name[0]='\0'; // no name defined yet
95
+  longname[0]='\0'; // no name defined yet
96
+  fam[0]='\0'; // no name defined yet
97
+  file[0]='\0'; // no name defined yet
98
+  readCommentLine = '0'; /* MR1 */
99
+}
100
+
101
+//////////////////////////////////////////////////////////////////////////////
102
+// Object destructor
103
+//////////////////////////////////////////////////////////////////////////////
104
+Alignment::~Alignment()
105
+{
106
+  delete[] longname; longname = NULL;
107
+  for(int k=0; k<N_in; k++)
108
+    {
109
+      delete[] sname[k]; sname[k] = NULL;
110
+      delete[] seq[k]; seq[k] = NULL;
111
+      delete[] X[k]; X[k] = NULL;
112
+      delete[] I[k]; I[k] = NULL;
113
+    }
114
+  delete[] sname; sname = NULL;
115
+  delete[] seq; seq = NULL;
116
+  delete[] X; X = NULL;
117
+  delete[] I; I = NULL;
118
+  delete[] l; l = NULL;
119
+  delete[] keep; keep = NULL;
120
+  delete[] display; display = NULL;
121
+  delete[] wg; wg = NULL;
122
+  delete[] nseqs; nseqs = NULL;
123
+  delete[] nres; nres = NULL;
124
+  delete[] first; first = NULL;
125
+  delete[] last; last = NULL;
126
+  delete[] ksort; ksort = NULL;
127
+}
128
+
129
+
130
+/**
131
+ * @brief Reads in an alignment from file into matrix seq[k][l] as ASCII
132
+ */
133
+void 
134
+Alignment::Read(FILE* inf, char infile[], char* firstline)
135
+{
136
+  int l; // Postion in alignment incl. gaps (first=1)
137
+  int h; // Position in input line (first=0)
138
+  int k; // Index of sequence being read currently (first=0)
139
+  char line[LINELEN]=""; // input line
140
+  //char cur_seq[MAXCOL]; // Sequence currently read in
141
+  char *cur_seq=new(char[par.maxColCnt]);
142
+  char* cur_name; // Sequence currently read in
143
+  int linenr=0; // current line number in input file
144
+  char skip_sequence=0;
145
+  RemoveExtension(file,infile); //copy rootname (w/o path) of infile into file variable of class object
146
+
147
+  kss_dssp=ksa_dssp=kss_pred=kss_conf=kfirst=-1;
148
+  n_display=0;
149
+  N_in=0;
150
+  N_filtered=0;
151
+  N_ss=0;
152
+  cur_seq[0]=' '; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
153
+  l=1; k=-1;
154
+
155
+  // Does firstline already contain first line of file?
156
+  if (firstline!= NULL) strcpy(line,firstline);
157
+
158
+  /////////////////////////////////////////////////////////////////////////
159
+  // Read infile line by line
160
+  /* FIXME: not safe to use MAXSEQ, however, don't think we ever get here (FS) */
161
+  while(firstline || (fgetline(line,LINELEN,inf) && (k<MAXSEQ))) /* FIXME: FS introduced () around &&, precedence! MR1 */
162
+      {
163
+          linenr++;
164
+          firstline=NULL;
165
+          if (line[0]=='>') //line contains sequence name
166
+              {
167
+                  if (k>=MAXSEQ-1)
168
+                      {
169
+                          if (v>=1 && k>=MAXSEQ)
170
+                              cerr<<endl<<"WARNING: maximum number "<<MAXSEQ<<" of sequences exceded in file "<<infile<<"\n";
171
+                          break;
172
+                      }
173
+                  cur_name=line+1; //beginning of current sequence name
174
+                  if (k>=0) //if this is at least the second name line
175
+                      {
176
+                          if (strlen(cur_seq)==0)
177
+                              {
178
+                                  cerr<<endl<<"Error: sequence "<<sname[k]<<" contains no residues."<<endl;
179
+                                  throw 1;
180
+                              }
181
+
182
+                          // Create space for residues and paste new sequence in
183
+                          seq[k]=new(char[strlen(cur_seq)+2]);
184
+                          if (!seq[k]) MemoryError("array for input sequences");
185
+                          X[k]=new(char[strlen(cur_seq)+2]);
186
+                          if (!X[k]) MemoryError("array for input sequences");
187
+                          I[k]=new(short unsigned int[strlen(cur_seq)+2]);
188
+                          if (!I[k]) MemoryError("array for input sequences");
189
+                          strcpy(seq[k],cur_seq);
190
+                      }
191
+                  skip_sequence=0;
192
+
193
+                  k++;
194
+                  l=1; //position in current sequence (first=1)
195
+
196
+                  // display[k]= 0: do not show in Q-T alignments 1: show if not filtered out later 2: show in any case (do not filter out)
197
+                  // keep[k] = 0: do not include in profile 1: include if not filtered out later 2: include in any case (do not filter out)
198
+                  /* {KEEP_NOT=0, KEEP_CONDITIONALLY=1, KEEP_ALWAYS=2} */
199
+                  if (line[1]=='@') cur_name++; //skip @-character in name
200
+                  if (!strncmp(line,">ss_dssp",8)) {
201
+                      if (kss_dssp<0) {display[k]=2; n_display++; keep[k]=KEEP_NOT; kss_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;}
202
+                  }
203
+                  else if (!strncmp(line,">sa_dssp",8)) {
204
+                      if (ksa_dssp<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; ksa_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;}
205
+                  }
206
+                  else if (!strncmp(line,">ss_pred",8)) {
207
+                      if (kss_pred<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_pred=k; N_ss++;} else {skip_sequence=1; k--; continue;}
208
+                  }
209
+                  else if (!strncmp(line,">ss_conf",8)) {
210
+                      if (kss_conf<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_conf=k; N_ss++;} else {skip_sequence=1; k--; continue;}
211
+                  }
212
+                  else if (!strncmp(line,">ss_",4) || !strncmp(line,">sa_",4)) {
213
+                      display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; N_ss++;
214
+                  }
215
+                  else if (!strncmp(line,">aa_",4)) { // ignore sequences beginning with ">aa_"
216
+                      skip_sequence=1; k--; continue;
217
+                  }
218
+                  //store first real seq
219
+                  else if (kfirst<0)
220
+                      {
221
+                          char word[NAMELEN];
222
+                          strwrd(word,line); // Copies first word in ptr to str
223
+                          if (strstr(word,"_consensus"))
224
+                              {display[k]=2; keep[k]=0; n_display++; kfirst=k;} /* MR1 */
225
+                          else
226
+                              {display[k]=keep[k]=KEEP_ALWAYS; n_display++; kfirst=k;}
227
+                      }
228
+                  //store all sequences
229
+                  else if (par.mark==0) {display[k]=keep[k]=KEEP_CONDITIONALLY; n_display++;}
230
+                  //store sequences up to nseqdis
231
+                  else if (line[1]=='@'&& n_display-N_ss<par.nseqdis) {display[k]=keep[k]=KEEP_ALWAYS; n_display++;}
232
+                  else {display[k]=KEEP_NOT; keep[k]=KEEP_CONDITIONALLY;}
233
+
234
+                  // store sequence name
235
+                  if (v>=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]);
236
+                  sname[k] = new(char[strlen(cur_name)+1]);
237
+                  if (!sname[k]) {MemoryError("array for sequence names");}
238
+                  strcpy(sname[k],cur_name);
239
+              } // end if(line contains sequence name)
240
+
241
+          else if (line[0]=='#') // Commentary line?
242
+              {
243
+                  // #PF01367.9 5_3_exonuc: 5'-3' exonuclease, C-terminal SAM fold; PDB 1taq, 1bgx (T:271-174), 1taq (271-174)
244
+                  if (name[0]) continue; // if already name defined: skip commentary line
245
+                  char *ptr1, *ptr2;
246
+                  ptr1=strscn_(line+1); // set ptr1 to first non-whitespace character after '#' -> AC number
247
+                  strncpy(longname,ptr1,DESCLEN-1); // copy whole commentary line after '# ' into longname
248
+                  longname[DESCLEN-1]='\0';
249
+                  strtr(longname,""," ");
250
+                  ptr2=strcut_(ptr1); // cut after AC number and set ptr2 to first non-whitespace character after AC number
251
+                  // strcpy(fam,ptr1); // copy AC number to fam
252
+                  // if (!strncmp(fam,"PF",2)) strcut_(fam,'.'); // if PFAM identifier contains '.' cut it off
253
+                  // strcut_(ptr2); // cut after first word ...
254
+                  strcpy(name,ptr1); // ... and copy first word into name
255
+                  readCommentLine = '1'; /* MR1 */
256
+              }
257
+
258
+          //line contains sequence residues or SS information and does not belong to a >aa_ sequence
259
+          else if (!skip_sequence)
260
+              {
261
+                  if (v>=4) cout<<line<<"\n"; //DEBUG
262
+                  if (k==-1 && v)
263
+                      {
264
+                          cerr<<endl<<"WARNING: No sequence name preceding following line in "<<infile<<":\n\'"<<line<<"\'\n";
265
+                          continue;
266
+                      }
267
+
268
+                  h=0; //counts characters in current line
269
+
270
+                  // Check whether all characters are correct; store into cur_seq
271
+                  if (keep[k] || (k == kfirst) ) // normal line containing residues /* MR1 */
272
+                      {
273
+                          while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
274
+                              {
275
+                                  if (aa2i(line[h])>=0) // ignore white-space characters ' ', \t and \n (aa2i()==-1)
276
+                                      {cur_seq[l]=line[h]; l++;}
277
+                                  else if (aa2i(line[h])==-2 && v)
278
+                                      cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
279
+                                  h++;
280
+                              }
281
+                      }
282
+                  else if (k==kss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
283
+                      {
284
+                          while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
285
+                              {
286
+                                  if (ss2i(line[h])>=0 && ss2i(line[h])<=7)
287
+                                      {cur_seq[l]=ss2ss(line[h]); l++;}
288
+                                  else if (v)
289
+                                      cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
290
+                                  h++;
291
+                              }
292
+                      }
293
+                  else if (k==ksa_dssp) // lines with dssp solvent accessibility states (. - ???)
294
+                      {
295
+                          while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
296
+                              {
297
+                                  if (sa2i(line[h])>=0)
298
+                                      cur_seq[l++]=line[h];
299
+                                  else if (v)
300
+                                      cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
301
+                                  h++;
302
+                              }
303
+                      }
304
+                  else if (k==kss_pred) // lines with predicted secondary structure (. - H E C)
305
+                      {
306
+                          while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
307
+                              {
308
+                                  if (ss2i(line[h])>=0 && ss2i(line[h])<=3)
309
+                                      {cur_seq[l]=ss2ss(line[h]); l++;}
310
+                                  else if (v)
311
+                                      cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
312
+                                  h++;
313
+                              }
314
+                      }
315
+                  else if (k==kss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
316
+                      {
317
+                          while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
318
+                              {
319
+                                  if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9'))
320
+                                      {cur_seq[l]=line[h]; l++;}
321
+                                  else if (v)
322
+                                      cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n";
323
+                                  h++;
324
+                              }
325
+                      }
326
+                  else if (display[k]) // other lines such as >sa_pred etc
327
+                      {
328
+                          while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
329
+                              {
330
+                                  if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9') || (line[h]>='A' && line[h]<='B'))
331
+                                      {cur_seq[l]=line[h]; l++;}
332
+                                  else if (v)
333
+                                      cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n";
334
+                                  h++;
335
+                              }
336
+                      }
337
+                  if (v && l>=/*MAXCOL*/par.maxColCnt-1) 
338
+                      {
339
+                          cerr<<endl<<"WARNING: maximum number of residues "<</*MAXCOL*/par.maxColCnt-2<<" exceded in sequence "<<sname[k]<<"\n";
340
+                          skip_sequence=1;
341
+                      }
342
+                  cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character
343
+              } //end else
344
+
345
+      }
346
+  /////////////////////////////////////////////////////////////////////////
347
+
348
+
349
+  if (k>=0) //if at least one sequence was read in
350
+      {
351
+          seq[k]=new(char[strlen(cur_seq)+2]);
352
+          if (!seq[k]) MemoryError("array for input sequences");
353
+          X[k]=new(char[strlen(cur_seq)+2]);
354
+          if (!X[k]) MemoryError("array for input sequences");
355
+          I[k]=new(short unsigned int[strlen(cur_seq)+2]);
356
+          if (!I[k]) MemoryError("array for input sequences");
357
+          strcpy(seq[k],cur_seq);
358
+      }
359
+  else
360
+      {cerr<<endl<<"Error: no sequences found in file "<<infile<<"\n"; throw 1;}
361
+
362
+  N_in = k+1;
363
+
364
+  // Set name, longname, fam
365
+  if (!*name) // longname, name and family were not set by '#...' line yet -> extract from first sequence
366
+      {
367
+          char* ptr;
368
+          // strtr(sname[kfirst],"~"," "); // 'transpose': replaces the tilde with a blanc everywhere in sname[kfirst]
369
+          strncpy(longname,sname[kfirst],DESCLEN-1); // longname is name of first sequence
370
+          longname[DESCLEN-1]='\0';
371
+          strncpy(name,sname[kfirst],NAMELEN-1); // Shortname is first word of longname...
372
+          name[NAMELEN-1]='\0';
373
+          ptr = strcut(name); // ...until first white-space character
374
+          if (ptr && islower(ptr[0]) && ptr[1]=='.' && isdigit(ptr[2])) //Scop family code present as second word?
375
+              {
376
+                  lwrstr(name); // Transform upper case to lower case
377
+                  strcut(ptr); // Non-white-space characters until next white-space character..
378
+                  strcpy(fam,ptr); // ...are the SCOP familiy code
379
+              }
380
+          else if (name[0]=='P' && name[1]=='F' && isdigit(name[2]) && isdigit(name[3]) ) //Pfam code
381
+              {
382
+                  strcpy(fam,name); // set family name = Pfam code
383
+              }
384
+      }
385
+  
386
+  
387
+  
388
+  delete[] cur_seq; cur_seq = NULL;
389
+  
390
+  // Checking for warning messages
391
+  if (v==0) return;
392
+  if (v>=2) cout<<"Read "<<infile<<" with "<<N_in<<" sequences\n";
393
+  if (v>=3) cout<<"Query sequence for alignment has number "<<kfirst<<" (0 is first)\n";
394
+  return;
395
+}
396
+
397
+/*
398
+ * At this point GetSeqsFromHMM() slots in, however,
399
+ * only needed in hhbliys.C, so will skip it for moment, MR1
400
+ */
401
+
402
+
403
+/////////////////////////////////////////////////////////////////////////////
404
+/**
405
+ * @brief  Convert ASCII in seq[k][l] to int (0-20) in X[k][i],
406
+ *  throw out all insert states, record their number in I[k][i]
407
+ *  and store sequences to be displayed in seq[k] */
408
+/////////////////////////////////////////////////////////////////////////////
409
+void 
410
+Alignment::Compress(const char infile[])
411
+{
412
+    int i; // Index for match state (first=1)
413
+    int l; // Postion in alignment incl. gaps (first=1)
414
+    int k; // Index for sequences (first=0)
415
+    int a; // amino acid index
416
+    char c;
417
+    int unequal_lengths=0; /* k: seq k doesn't have same number
418
+                              of match states as seq 0 => WARNING */
419
+    /* points to next character in seq[k] to be written */
420
+    /*static short unsigned int h[MAXSEQ];*/
421
+    /*short*/ unsigned int *h = NULL; /* short may lead to overflow for long alignments, FS, r235 -> r236 */
422
+
423
+    h = new(/*short*/ unsigned int[N_in+2]); /* short -> overflow, FS, r235 -> r236 */
424
+    float *percent_gaps = NULL; /* FS, 2010-Nov */
425
+    char *match_state = NULL;  /* FS, 2010-Nov */
426
+
427
+
428
+    // Initialize 
429
+    for (k=0;k<N_in; k++) 
430
+        {I[k][0]=0;}
431
+
432
+    if (v>=3)
433
+        {
434
+            if (par.M==1)
435
+                cout<<"Using match state assignment by capital letters (a2m format)\n";
436
+            else if (par.M==2) cout<<"Using percentage-rule match state assignment\n";
437
+            else if (par.M==3) cout<<"Using residues of first sequence as match states\n";
438
+        }
439
+
440
+    // Create matrices X and I with amino acids represented by integer numbers
441
+    switch(par.M)
442
+        {
443
+
444
+            /////////////////////////////////////////////////////////////////////////
445
+            /* a2m/a3m format: match states capital case,
446
+               inserts lower case, delete states '-', inserted gaps '.'
447
+               The confidence values for ss prediction are interpreted as follows:
448
+               0-9:match states(!) '-' :match state '.':insert */
449
+        case 1:
450
+        default:
451
+
452
+            // Warn if alignment is ment to be -M first or -M NN instead of A2M/A3M
453
+            if (v>=2 && strchr(seq[kfirst],'-') ) // Seed/query sequence contains a gap ...
454
+                {
455
+                    for (k=1; k<N_in; k++)
456
+                        if (strpbrk(seq[k],"abcdefghiklmnpqrstuvwxyz.")) break;
457
+                    if (k==N_in) // ... but alignment contains no lower case residue
458
+                        printf("WARNING: input alignment %s looks like aligned FASTA instead of A2M/A3M format. Consider using '-M first' or '-M 50'\n",infile);
459
+                }
460
+
461
+            // Remove '.' characters from seq[k]
462
+            for(k=0; k<N_in; k++)
463
+                {
464
+                    char* ptrS=seq[k]; // pointer to source: character in seq[k]
465
+                    char* ptrD=seq[k]; // pointer to destination: seq[k]
466
+                    while(1) // omit '.' symbols
467
+                        {
468
+                            if (*ptrS!='.') {*ptrD=*ptrS; ptrD++;} //leave out '.' symbols
469
+                            if (!*ptrS) break;
470
+                            ptrS++;
471
+                        }
472
+                }
473
+            L=/*MAXRES*/par.maxResLen-2; // needed because L=imin(L,i)
474
+            for (k=0; k<N_in; k++)
475
+                {
476
+                    i=1; l=1; // start at i=1, not i=0!
477
+                    if (keep[k]) //skip >ss_dssp, >ss_pred, >ss_conf, >aa_... sequences
478
+                        {
479
+                            while((c=seq[k][l++])) // assign residue to c at same time
480
+                                {
481
+                                    if (c>='a' && c<='z') I[k][i-1]++;//insert state = lower case character
482
+                                    else if (c!='.') //match state = upper case character
483
+                                        {
484
+                                            X[k][i]=aa2i(c);
485
+                                            I[k][i]=0;
486
+                                            i++;
487
+                                        }
488
+                                }
489
+                        }
490
+                    else if (k==kss_dssp || k==kss_pred) // does alignment contain sequence of secondary structure states?
491
+                        {
492
+                            while((c=seq[k][l++])) // assign residue to c at same time
493
+                                if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=ss2i(c); //match state = upper case character
494
+                        }
495
+                    else if (k==ksa_dssp) // does alignment contain sequence of prediction confidence values?
496
+                        {
497
+                            while((c=seq[k][l++])) // assign residue to c at same time
498
+                                if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=sa2i(c); //match state = upper case character
499
+                        }
500
+                    else if (k==kss_conf) // does alignment contain sequence of prediction confidence values?
501
+                        {
502
+                            while((c=seq[k][l++])) // assign residue to c at same time
503
+                                if (c!='.') X[k][i++]=cf2i(c); //match state = 0-9 or '-'
504
+                        }
505
+                    else if (k==kfirst)        // does alignment contain sequence of prediction confidence values?
506
+                        {
507
+                            while((c=seq[k][l++]))  // assign residue to c at same time
508
+                                if (c!='.')
509
+                                    {
510
+                                        X[k][i]=aa2i(c);
511
+                                        I[k][i]=0;
512
+                                        ++i;
513
+                                    }
514
+                        }
515
+                    else continue;
516
+                    i--;
517
+                    if (L!=i && L!=/*MAXRES*/par.maxResLen-2 && !unequal_lengths) unequal_lengths=k; //sequences have different lengths
518
+                    L=imin(L,i);
519
+                }
520
+            if (unequal_lengths) break;
521
+
522
+            //Replace GAP with ENDGAP for all end gaps /* MR1 */
523
+            for (k=0; k<N_in; ++k)
524
+                {
525
+                    if (!keep[k]) continue;
526
+                    for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */
527
+                    for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
528
+                }
529
+
530
+            for (i=1; i<=L; i++) this->l[i]=i; //assign column indices to match states
531
+            if (L<=0)
532
+                {
533
+                    cout<<"\nError: Alignment in "<<infile<<" contains no match states. Consider using -M first or -M <int> option"<<endl;
534
+                    throw 1;
535
+                }
536
+            
537
+            if (L==/*MAXRES*/par.maxResLen-2 && v>=2) 
538
+                {
539
+                    printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",L);
540
+                    break;
541
+                }
542
+            if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" match states\n";
543
+            break;
544
+
545
+            /////////////////////////////////////////////////////////////////////////
546
+            // gap-rule assignment of match states
547
+        case 2:
548
+            int nl[NAA+2]; //nl[a] = number of seq's with amino acid a at position l
549
+            /* Note: allocating statically is fine most of the time 
550
+               but when the sequences/profiles get really long 
551
+               we might run out of memory, so must really do it dynamically. 
552
+               had to move declaration of float *percent_gaps out of switch()
553
+            */
554
+            //float percent_gaps[MAXCOL]; //percentage of gaps in column k (with weighted sequences)
555
+            percent_gaps = new(float[par.maxColCnt]);
556
+
557
+            //determine number of columns L in alignment
558
+            L=strlen(seq[kfirst])-1;
559
+
560
+            // Conversion to integer representation, checking for unequal lengths and initialization
561
+            if (nres==NULL) nres=new(int[N_in]);
562
+            for (k=0; k<N_in; k++)
563
+                {
564
+                    if (!keep[k]) continue;
565
+                    int nr=0;
566
+                    wg[k]=0; nres[k]=0;
567
+                    for (l=1; l<=L; l++)
568
+                        {
569
+                            X[k][l]=aa2i(seq[k][l]);
570
+                            if (X[k][l]<NAA) nr++;
571
+                        }
572
+                    nres[k]=nr;
573
+                    if (seq[k][L+1]!='\0' && !unequal_lengths) unequal_lengths=k;
574
+                }
575
+            if (unequal_lengths) break;
576
+
577
+            // Quick and dirty calculation of the weight per sequence wg[k]
578
+            for (l=1; l<=L; l++) // for all positions l in alignment
579
+                {
580
+                    int naa=0; //number of different amino acids
581
+                    for (a=0; a<20; a++) nl[a]=0;
582
+                    for (k=0; k<N_in; k++) if (keep[k]) nl[ (int)X[k][l]]++;
583
+                    for (a=0; a<20; a++) if(nl[a]) naa++;
584
+                    if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY)
585
+                    for (k=0; k<N_in; k++)
586
+                        if (keep[k] && (X[k][l]<20) )
587
+                            {
588
+                                //wg[k]+=1.0/float(nl[ (int)X[k][l]]*naa*nres[k]+30.0); /* original version */
589
+                                wg[k] += 1.0/float(nl[ (int)X[k][l]]*naa*(nres[k]+30.0)); /* MR1 */
590
+                                // wg[k] += 1.0/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */
591
+                                // wg[k] += (naa-1.0)/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */
592
+                            }
593
+                } /* 1=l<=L*/
594
+
595
+            //Replace GAP with ENDGAP for all end gaps
596
+            for (k=0; k<N_in; ++k)
597
+                {
598
+                    if (!keep[k]) continue;
599
+                    for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */
600
+                    for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
601
+                }
602
+
603
+            // Add up percentage of gaps
604
+            for (l=1; l<=L; l++)
605
+                {
606
+                    float res=0;
607
+                    float gap=0;
608
+                    for (k=0; k< N_in; k++){
609
+                        if (keep[k]){
610
+                            if ( X[k][l]<GAP) res+=wg[k]; /* MR1, AA or ANY, changed from <ANY */
611
+                            else if ( X[k][l] != ENDGAP) gap+=wg[k];  /* MR1, else: GAP. ENDGAPs are ignored for counting percentage */
612
+                        }
613
+                    }
614
+                    percent_gaps[l]=100.*gap/(res+gap);
615
+                    if (v>=4) cout<<"percent gaps["<<l<<"]="<<percent_gaps[l]<<" first seq:"<<seq[0][l]<<"\n";
616
+                }
617
+
618
+            /* Insert states 'bloat' the HMM,
619
+               throwing them out 'slims' down the HMM.
620
+               A slimmer HMM takes less time to construct.
621
+               However, the marriage of Clustal and Hhalign
622
+               is particularly sensitive to residues
623
+               at the very end of the profile; these I call
624
+               'telomeres'. Telomeres must not be shed when
625
+               throwing out insert states, for the telomeres
626
+               we set the match threshold to 100%.
627
+             */
628
+#define MGAP_LOGIC 0
629
+#define TELOMERE_LOGIC 1
630
+#define TELOMERE_DYNAMIC 0
631
+
632
+#define ALWAYS_ACCEPT 101.0 /* do NOT change this parameter, must be >=100,
633
+                               make slightly bigger than 100% -- to be sure to be sure */
634
+#define DEFAULT_MGAPS 100.0 /* Soeding's default is 50, omega default prior to telomere logic was 100
635
+                               FIXME: this used to be par.Mgaps,
636
+                               in a later version re-introduce par.Mgaps to keep this value flexible */
637
+#define TELOMER_LENGTH 10   /* this parameter must be > 0 (unless DEFAULT_MGAPS=100),
638
+                               if it is too big (L/2) then telomere logic has no effect,
639
+                               don't think it should be changed (much) */
640
+#define TELOMER_FRACTION 0.10
641
+            //#define HMM_MIN_LENGTH 0.923
642
+#define HMM_MIN_LENGTH 0.950
643
+#define FORTRAN_OFFSET 1
644
+            double dDefaultMgaps;
645
+            dDefaultMgaps = DEFAULT_MGAPS;
646
+
647
+#if TELOMERE_LOGIC /* turn telomere logic on (1) or off (0) */
648
+            int iTelomereLength;
649
+
650
+#if TELOMERE_DYNAMIC /* keep telomere length 'dynamic' */
651
+            iTelomereLength = TELOMER_LENGTH > (int)(L*TELOMER_FRACTION) ? TELOMER_LENGTH : (int)(L*TELOMER_FRACTION);
652
+#else
653
+            iTelomereLength = TELOMER_LENGTH;
654
+#endif /* this was dynamic telomere */
655
+#endif /* this was telomere logic */
656
+
657
+            /* if HMMs get too small (much smaller than profile length L)
658
+               then one is liable to get a back-tracking error.
659
+               So we should ensure that the DEFAULT_MGAPS parameter does NOT
660
+               shrink the HMM too much.
661
+               take percentage-gap vector, sort it, and fix dDefaultMgaps,
662
+               such that at least (HMM_MIN_LENGTH)*(L) are left
663
+             */
664
+#if MGAP_LOGIC /* try to adapt Mgaps to size of final HMM */
665
+            {
666
+                float *pfPercentGaps = NULL;
667
+                if (NULL == (pfPercentGaps = (float *)malloc((L+1)*sizeof(float)))){
668
+                    printf("%s:%s:%d: could not malloc %d float for sorted percent-gaps\n",
669
+                           __FUNCTION__, __FILE__, __LINE__, L+1);
670
+                    dDefaultMgaps = DEFAULT_MGAPS;
671
+                }
672
+                else {
673
+                    for (l = 0; l < L; l++) {
674
+                        pfPercentGaps[l] = percent_gaps[l+FORTRAN_OFFSET];
675
+                    }
676
+                    qsort(pfPercentGaps, L, sizeof(float), CompFltAsc);
677
+
678
+                    dDefaultMgaps = pfPercentGaps[(int)(HMM_MIN_LENGTH*L)];
679
+                    if (dDefaultMgaps < DEFAULT_MGAPS){
680
+                        //printf("Mgaps = %f <- %f\n", DEFAULT_MGAPS, dDefaultMgaps);
681
+                        dDefaultMgaps = DEFAULT_MGAPS;
682
+                    }
683
+                    else {
684
+                        //printf("Mgaps = %f\n", dDefaultMgaps);
685
+                    }
686
+
687
+                    free(pfPercentGaps); pfPercentGaps = NULL;
688
+                }
689
+            }
690
+#endif /* tried to adapt Mgaps to size of final HMM */
691
+
692
+            // Throw out insert states and keep only match states
693
+            i=0;
694
+            for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';}
695
+            for (l=1; l<=L; l++)
696
+                {
697
+#if TELOMERE_LOGIC
698
+                    float fMgaps = ALWAYS_ACCEPT;
699
+                    if ( (l < iTelomereLength) || (L-l < iTelomereLength) ){
700
+                        /* residue is in telomere, always retain this position */
701
+                        fMgaps = ALWAYS_ACCEPT;
702
+                    }
703
+                    else if (0){
704
+                        /* FIXME: would like to put a transition phase in here,
705
+                           where the Mgap value gradually goes down from 100 to DEFAULT_MGAPS,
706
+                           however, may not be necessary and will make code more clunky */
707
+                    }
708
+                    else {
709
+                        /* position is in centre of sequence,
710
+                           retain position if less than DEFAULT_MGAPS% gaps at this position,
711
+                           for example, if DEFAULT_MGAPS=30 throw out if more than 30% gap.
712
+                           conversely, if DEFAULT_MGAPS=100 throw out if more than 100% gaps,
713
+                           which can never happen, so always retain */
714
+                        fMgaps = dDefaultMgaps;
715
+                    }
716
+                    if (percent_gaps[l] <= fMgaps)
717
+#else /* this was telomere logic */
718
+                    if (percent_gaps[l]<=float(par.Mgaps))
719
+#endif /* this was Soeding default */
720
+                        {
721
+                            if (i>=/*MAXRES*/par.maxResLen-2) {
722
+                                if (v>=1)
723
+                                    printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i);
724
+                                break;
725
+                            }
726
+                            i++;
727
+                            this->l[i]=l;
728
+                            for (k=0; k<N_in; k++)
729
+                                {
730
+                                    if (keep[k])
731
+                                        {
732
+                                            seq[k][h[k]++]=MatchChr(seq[k][l]);
733
+                                            X[k][i]=X[k][l];
734
+                                            I[k][i]=0;
735
+                                        }
736
+                                    else if (k==kss_dssp || k==kss_pred)
737
+                                        {
738
+                                            seq[k][h[k]++]=MatchChr(seq[k][l]);
739
+                                            X[k][i]=ss2i(seq[k][l]);
740
+                                        }
741
+                                    else if (k==ksa_dssp)
742
+                                        {
743
+                                            seq[k][h[k]++]=MatchChr(seq[k][l]);
744
+                                            X[k][i]=sa2i(seq[k][l]);
745
+                                        }
746
+                                    else if (k==kss_conf)
747
+                                        {
748
+                                            seq[k][h[k]++]=seq[k][l];
749
+                                            X[k][i]=cf2i(seq[k][l]);
750
+                                        }
751
+                                }
752
+                        }
753
+                    else
754
+                        {
755
+                            for (k=0; k<N_in; k++)
756
+                                if (keep[k] && X[k][l]<GAP)
757
+                                    {
758
+                                        I[k][i]++;
759
+                                        seq[k][h[k]++]=InsertChr(seq[k][l]);
760
+                                    }
761
+                        }
762
+                }
763
+            for (k=0; k<N_in; k++) seq[k][h[k]]='\0';
764
+
765
+            //printf("%d\t%d\t%d\tN/L/M\n", N_in, L, i); /* -------- FIXME  */
766
+
767
+            if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n";
768
+            L = i; //Number of match states
769
+
770
+            delete[] percent_gaps; percent_gaps = NULL;
771
+            break;
772
+
773
+
774
+            ////////////////////////////////////////////////////////////////////////
775
+            // Using residues of first sequence as match states
776
+        case 3:
777
+            /* Note: allocating statically is fine most of the time 
778
+               but when the sequences/profiles get really long 
779
+               we might run out of memory, so must really do it dynamically. 
780
+               had to move declaration of float *percent_gaps out of switch()
781
+            */
782
+            //char match_state[MAXCOL]; //1: column assigned to match state 0: insert state
783
+            match_state = new(char[par.maxColCnt]);
784
+
785
+            // Determine number of columns L in alignment
786
+            L=strlen(seq[0]+1);
787
+            if (v>=3) printf("Length of first seq = %i\n",L);
788
+            // Check for sequences with unequal lengths
789
+            for (k=1; k<N_in; k++)
790
+                if (int(strlen(seq[k]+1))!=L) {unequal_lengths=k; break;}
791
+            if (unequal_lengths) break;
792
+
793
+            // Determine match states: seq kfirst has residue at pos l -> match state
794
+            for (l=1; l<=L; l++)
795
+                if (isalpha(seq[kfirst][l])) match_state[l]=1; else match_state[l]=0;
796
+            // Throw out insert states and keep only match states
797
+            for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';}
798
+            i=0;
799
+            for (l=1; l<=L; l++)
800
+                {
801
+                    if (match_state[l]) // does sequence 0 have residue at position l?
802
+                        {
803
+                            if (i>=/*MAXRES*/par.maxResLen-2) {
804
+                                if (v>=1)
805
+                                    printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i);
806
+                                break;
807
+                            }
808
+                            i++;
809
+                            this->l[i]=l;
810
+                            for (k=0; k<N_in; k++)
811
+                                {
812
+                                    if (keep[k])
813
+                                        {
814
+                                            seq[k][h[k]++]=MatchChr(seq[k][l]);
815
+                                            X[k][i]=aa2i(seq[k][l]);
816
+                                            I[k][i]=0;
817
+                                        }
818
+                                    else if (k==kss_dssp || k==kss_pred)
819
+                                        {
820
+                                            seq[k][h[k]++]=MatchChr(seq[k][l]);
821
+                                            X[k][i]=ss2i(seq[k][l]);
822
+                                        }
823
+                                    else if (k==ksa_dssp)
824
+                                        {
825
+                                            seq[k][h[k]++]=MatchChr(seq[k][l]);
826
+                                            X[k][i]=sa2i(seq[k][l]);
827
+                                        }
828
+                                    else if (k==kss_conf)
829
+                                        {
830
+                                            seq[k][h[k]++]=seq[k][l];
831
+                                            X[k][i]=cf2i(seq[k][l]);
832
+                                        }
833
+                                }
834
+                        }
835
+                    else
836
+                        {
837
+                            for (k=0; k<N_in; k++)
838
+                                if (keep[k] && aa2i(seq[k][l])<GAP)
839
+                                    {
840
+                                        I[k][i]++;
841
+                                        seq[k][h[k]++]=InsertChr(seq[k][l]);
842
+                                    }
843
+                        }
844
+                }
845
+            for (k=0; k<N_in; k++) seq[k][h[k]]='\0';
846
+
847
+            //Replace GAP with ENDGAP for all end gaps /* MR1 */
848
+            for (k=0; k<N_in; ++k)
849
+                {
850
+                    if (!keep[k]) continue;
851
+                    for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1, note i++ <- ++i */
852
+                    for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
853
+                }
854
+
855
+            if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n";
856
+            L = i; //Number of match states
857
+
858
+            delete[] match_state; match_state = NULL;
859
+            break;
860
+
861
+        } //end switch()
862
+    ///////////////////////////////////////////////////////////////////////////
863
+
864
+
865
+    // Error
866
+    if (unequal_lengths)
867
+        {
868
+            strcut(sname[unequal_lengths]);
869
+            cerr<<endl<<"Error: sequences in "<<infile<<" do not all have the same number of columns, \ne.g. first sequence and sequence "<<sname[unequal_lengths]<<".\n";
870
+            if(par.M==1) cerr<<".\nCheck input format for '-M a2m' option and consider using '-M first' or '-M 50'\n";
871
+            throw 1;
872
+        }
873
+
874
+    // Avert user about -cons option?
875
+    if (v>=2 && !par.cons)
876
+        {
877
+            for (i=1; i<=L; i++)
878
+                if (X[kfirst][i]==GAP)
879
+                    {
880
+                        printf("NOTE: Use the '-cons' option to calculate a consensus sequence as first sequence of the alignment.\n");
881
+                        break;
882
+                    }
883
+        }
884
+    /* MR1
885
+    //Replace GAP with ENDGAP for all end gaps
886
+    for (k=0; k<N_in; k++)
887
+    {
888
+    if (!keep[k]) continue;
889
+    for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP;
890
+    for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP;
891
+    }*/
892
+
893
+    // DEBUG
894
+    if (v>=4)
895
+        for (k=0; k<N_in; k++)
896
+            {
897
+                if (!display[k]) continue;
898
+                cout<<">"<<sname[k]<<"\n";
899
+                if (k==kss_dssp || k==kss_pred) {for (i=1; i<=L; i++) cout<<char(i2ss(X[k][i]));}
900
+                else if (k==kss_conf) {for (i=1; i<=L; i++) cout<<char(i2cf(X[k][i]));}
901
+                else if (k==ksa_dssp) {for (i=1; i<=L; i++) cout<<char(i2sa(X[k][i]));}
902
+                else
903
+                    {
904
+                        for (i=1; i<=L; i++) cout<<char(i2aa(X[k][i]));
905
+                        cout<<"\n";
906
+                        for (i=1; i<=L; i++)
907
+                            if (I[k][i]==0) cout<<"-"; else if (I[k][i]>9) cout<<"X"; else cout<<I[k][i];
908
+                    }
909
+                cout<<"\n";
910
+            }
911
+
912
+    delete[](h); h = NULL;
913
+}
914
+
915
+
916
+/**
917
+ * @brief Remove sequences with seq. identity larger than seqid percent
918
+ *(remove the shorter of two) or coverage<cov_thr
919
+ *
920
+ * FIXME: originally max_seqid is a variable that is the cutoff
921
+ *  above which sequences are thrown out. We want to throw out sequences
922
+ *  when building the HMM but not for display, there we want to keep all.
923
+ *  This should be really easy, but there is some hidden stuff going on
924
+ *  in this function, so I did a minimal-invasive change and just stuck
925
+ *  (effectively) a hard-wired 100 instead of the variable.
926
+ *  At a later stage we should get rid of this function alltogether
927
+ *  as it does gobble up some time (and is quadratic in noof sequences, I think)
928
+ *  FS, 2010-10-04
929
+ */
930
+////////////////////////////////////////////////////////////////////////////
931
+/*
932
+ */
933
+inline int 
934
+Alignment::FilterForDisplay(int max_seqid, int coverage, int qid, float qsc, int N)
935
+{
936
+
937
+    /* FIXME
938
+     * by just returning n_display and not doing anything
939
+     * I think we display everything and not do any work for it
940
+     */
941
+    return n_display; /* FS, 2010-10-04*/
942
+
943
+
944
+    if (par.mark) return n_display;
945
+    char *dummy = new(char[N_in+1]);
946
+    int vtmp=v, seqid;
947
+    v=0;
948
+    n_display=0;
949
+    if (kss_dssp>=0) display[kss_dssp]=KEEP_NOT;
950
+    if (ksa_dssp>=0) display[ksa_dssp]=KEEP_NOT;
951
+    if (kss_pred>=0) display[kss_pred]=KEEP_NOT;
952
+    if (kss_conf>=0) display[kss_conf]=KEEP_NOT;
953
+    for (seqid=imin(10,max_seqid); n_display<N && seqid<=max_seqid; seqid++)
954
+        {
955
+            for (int k=0; k<N_in; k++) dummy[k]=display[k];
956
+            n_display = Filter2(dummy,coverage,qid,qsc,20,seqid,0);
957
+            // printf("Seqid=%3i n_display=%4i\n",seqid,n_display);
958
+        }
959
+    if (n_display>N)
960
+        {
961
+            for (int k=0; k<N_in; k++) dummy[k]=display[k];
962
+            n_display = Filter2(dummy,coverage,qid,qsc,20,--(--seqid),0);
963
+        }
964
+    v=vtmp;
965
+    for (int k=0; k<N_in; k++) display[k]=dummy[k];
966
+    if (kss_dssp>=0) {display[kss_dssp]=KEEP_CONDITIONALLY; n_display++;}
967
+    if (ksa_dssp>=0) {display[ksa_dssp]=KEEP_CONDITIONALLY; n_display++;}
968
+    if (kss_pred>=0) {display[kss_pred]=KEEP_CONDITIONALLY; n_display++;}
969
+    if (kss_conf>=0) {display[kss_conf]=KEEP_CONDITIONALLY; n_display++;}
970
+    delete[] dummy; dummy = NULL;
971
+    return n_display;
972
+}
973
+
974
+/////////////////////////////////////////////////////////////////////////////////////
975
+// Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) or coverage<cov_thr
976
+/////////////////////////////////////////////////////////////////////////////////////
977
+inline int Alignment::Filter(int max_seqid, int coverage, int qid, float qsc, int N)
978
+{
979
+    return Filter2(keep,coverage,qid,qsc,20,max_seqid,N);
980
+}
981
+
982
+/////////////////////////////////////////////////////////////////////////////
983
+/*
984
+ * @brief Select set of representative sequences in the multiple sequence alignment
985
+ *
986
+ * Filter criteria:
987
+ * Remove sequences with coverage of query less than "coverage" percent
988
+ * Remove sequences with sequence identity to query of less than "qid" percent
989
+ * If Ndiff==0, remove sequences with seq. identity larger than seqid2(=max_seqid) percent
990
+ * If Ndiff>0, remove sequences with minimum-sequence-identity filter of between seqid1
991
+ * and seqid2 (%), where the minimum seqid threshold is determined such that,
992
+ * in all column blocks of at least WMIN=25 residues, at least Ndiff sequences are left.
993
+ * This ensures that in multi-domain proteins sequences covering one domain are not
994
+ * removed completely because sequences covering other domains are more diverse.
995
+ *
996
+ * Allways the shorter of two compared sequences is removed (=> sort sequences by length first).