src/ClustalOmega/src/squid/alignio.c
dafeef0b
 /*****************************************************************
  * SQUID - a library of functions for biological sequence analysis
  * Copyright (C) 1992-2002 Washington University School of Medicine
  * 
  *     This source code is freely distributed under the terms of the
  *     GNU General Public License. See the files COPYRIGHT and LICENSE
  *     for details.
  *****************************************************************/
 
 /* alignio.c
  * SRE, Mon Jul 12 11:57:37 1993
  * RCS $Id: alignio.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: alignio.c,v 1.11 2002/10/09 14:26:09 eddy Exp)
  * 
  * Input/output of sequence alignments.
  */
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #include "squid.h"
 #include "sre_random.h"
 
 /* Function: AllocAlignment()
  * 
  * Purpose:  Allocate space for an alignment, given the number
  *           of sequences and the alignment length in columns.
  *           
  * Args:     nseq     - number of sequences
  *           alen     - width of alignment
  *           ret_aseq - RETURN: alignment itself
  *           ainfo    - RETURN: other info associated with alignment
  *           
  * Return:   (void)
  *           aseq, ainfo free'd by caller: FreeAlignment(aseq, &ainfo).
  *           note that ainfo itself is alloc'ed in caller, usually
  *           just by a "AINFO ainfo" definition.
  */
 void
 AllocAlignment(int nseq, int alen, char ***ret_aseq, AINFO *ainfo)
 {
   char **aseq;
   int idx;
 
   InitAinfo(ainfo);
 
   aseq = (char **) MallocOrDie (sizeof(char *) * nseq);
   for (idx = 0; idx < nseq; idx++)
     aseq[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
 
   ainfo->alen  = alen;
   ainfo->nseq  = nseq;
 
   ainfo->wgt   = (float *) MallocOrDie (sizeof(float) * nseq);
   FSet(ainfo->wgt, nseq, 1.0);
 
   ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
   for (idx = 0; idx < nseq; idx++)
     ainfo->sqinfo[idx].flags = 0;
 
   *ret_aseq = aseq;
 }
  
 
 /* Function: InitAinfo()
  * Date:     SRE, Tue Jan 19 10:16:02 1999 [St. Louis]
  *
  * Purpose:  Initialize the fields in ainfo structure to
  *           default (null) values. Does nothing with 
  *           fields that are dependent on nseq or alen.
  *
  * Args:     ainfo  - optional info structure for an alignment
  *
  * Returns:  (void). ainfo is modified.
  */
 void
 InitAinfo(AINFO *ainfo)
 {
   ainfo->name  = NULL;
   ainfo->desc  = NULL;
   ainfo->cs    = NULL;
   ainfo->rf    = NULL;
   ainfo->acc   = NULL;
   ainfo->au    = NULL;
   ainfo->flags = 0;
 
   ainfo->tc1  = ainfo->tc2 = 0.0;
   ainfo->nc1  = ainfo->nc2 = 0.0;
   ainfo->ga1  = ainfo->ga2 = 0.0;
 }
 
 
 /* Function: FreeAlignment()
  * 
  * Purpose:  Free the space allocated to alignment, names, and optional
  *           information. 
  *           
  * Args:     aseqs - sequence alignment
  *           ainfo - associated alignment data.
  */                  
 void
 FreeAlignment(char **aseqs, AINFO *ainfo)
 {
   int i;
 
   for (i = 0; i < ainfo->nseq; i++)
     {
       if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss);
       if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa);
     }
   if (ainfo->cs   != NULL) free(ainfo->cs);
   if (ainfo->rf   != NULL) free(ainfo->rf);
   if (ainfo->name != NULL) free(ainfo->name);
   if (ainfo->desc != NULL) free(ainfo->desc);
   if (ainfo->acc  != NULL) free(ainfo->acc);
   if (ainfo->au   != NULL) free(ainfo->au);
 
   free(ainfo->sqinfo);
   free(ainfo->wgt);
   Free2DArray((void **) aseqs, ainfo->nseq);
 }
 
 
 
 /* Function: SAMizeAlignment()
  * Date:     SRE, Tue Jun 30 09:49:40 1998 [St. Louis]
  *
  * Purpose:  Make a "best effort" attempt to convert an alignment
  *           to SAM gap format: - in delete col, . in insert col.
  *           Only works if alignment adheres to SAM's upper/lower
  *           case convention, which is true for instance of old
  *           HMMER alignments.
  *
  * Args:     aseq  - alignment to convert
  *           nseq  - number of seqs in alignment
  *           alen  - length of alignment
  *
  * Returns:  (void)
  */
 void
 SAMizeAlignment(char **aseq, int nseq, int alen)
 {
   int col;			/* counter for aligned columns */
   int i;			/* counter for seqs */
   int sawlower, sawupper, sawgap;
   char gapchar;
 
   for (col = 0; col < alen; col++)
     {
       sawlower = sawupper = sawgap = 0;
 				/* pass 1: do we see only upper or lower? */
       for (i = 0; i < nseq; i++)
 	{
 	  if (isgap(aseq[i][col]))         { sawgap   = 1; continue; }
 	  if (isupper((int) aseq[i][col])) { sawupper = 1; continue; }
 	  if (islower((int) aseq[i][col]))   sawlower = 1;
 	}
 				/* select gap character for column */
       gapchar = '-';		/* default */
       if (sawlower && ! sawupper) gapchar = '.';
 
 				/* pass 2: set gap char */
       for (i = 0; i < nseq; i++)
 	if (isgap(aseq[i][col])) aseq[i][col] = gapchar;
     }
 }
 
 
 /* Function: SAMizeAlignmentByGapFrac()
  * Date:     SRE, Tue Jun 30 10:58:38 1998 [St. Louis]
  *
  * Purpose:  Convert an alignment to SAM's gap and case
  *           conventions, using gap fraction in a column
  *           to choose match versus insert columns. In match columns,
  *           residues are upper case and gaps are '-'.
  *           In insert columns, residues are lower case and
  *           gaps are '.'
  *
  * Args:     aseq   - aligned sequences
  *           nseq   - number of sequences
  *           alen   - length of alignment
  *           maxgap - if more gaps than this fraction, column is insert.
  *
  * Returns:  (void) Characters in aseq may be altered.
  */
 void
 SAMizeAlignmentByGapFrac(char **aseq, int nseq, int alen, float maxgap)
 {
   int apos;			/* counter over columns */
   int idx;			/* counter over sequences */
   int ngap;			/* number of gaps seen */
 
   for (apos = 0; apos < alen; apos++)
     {
 				/* count gaps */
       ngap = 0;
       for (idx = 0; idx < nseq; idx++)
 	if (isgap(aseq[idx][apos])) ngap++;
       
 				/* convert to SAM conventions */
       if ((float) ngap / (float) nseq > maxgap)
 	{			/* insert column */
 	  for (idx = 0; idx < nseq; idx++)
 	    if (isgap(aseq[idx][apos])) aseq[idx][apos] = '.';
 	    else aseq[idx][apos] = (char) tolower((int) aseq[idx][apos]);
 	}
       else			
 	{			/* match column */
 	  for (idx = 0; idx < nseq; idx++)
 	    if (isgap(aseq[idx][apos])) aseq[idx][apos] = '-';
 	    else aseq[idx][apos] = (char) toupper((int) aseq[idx][apos]);
 	}
     }
 }
 
 
 
 
 /* Function: MakeAlignedString()
  * 
  * Purpose:  Given a raw string of some type (secondary structure, say),
  *           align it to a given aseq by putting gaps wherever the
  *           aseq has gaps. 
  *           
  * Args:     aseq:  template for alignment
  *           alen:  length of aseq
  *           ss:    raw string to align to aseq
  *           ret_s: RETURN: aligned ss
  *           
  * Return:   1 on success, 0 on failure (and squid_errno is set.)
  *           ret_ss is malloc'ed here and must be free'd by caller.
  */
 int
 MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
 {
   char *new; 
   int   apos, rpos;
 
   new = (char *) MallocOrDie ((alen+1) * sizeof(char));
   for (apos = rpos = 0; apos < alen; apos++)
     if (! isgap(aseq[apos]))
       {
 	new[apos] = ss[rpos];
 	rpos++;
       }
     else
       new[apos] = '.';
   new[apos] = '\0';
 
   if (rpos != strlen(ss))
     { squid_errno = SQERR_PARAMETER; free(new); return 0; }
   *ret_s = new;
   return 1;
 }
 
 
 /* Function: MakeDealignedString()
  * 
  * Purpose:  Given an aligned string of some type (either sequence or 
  *           secondary structure, for instance), dealign it relative
  *           to a given aseq. Return a ptr to the new string.
  *           
  * Args:     aseq  : template alignment 
  *           alen  : length of aseq
  *           ss:   : string to make dealigned copy of; same length as aseq
  *           ret_s : RETURN: dealigned copy of ss
  *           
  * Return:   1 on success, 0 on failure (and squid_errno is set)
  *           ret_s is alloc'ed here and must be freed by caller
  */
 int
 MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
 {
   char *new; 
   int   apos, rpos;
 
   new = (char *) MallocOrDie ((alen+1) * sizeof(char));
   for (apos = rpos = 0; apos < alen; apos++)
     if (! isgap(aseq[apos]))
       {
 	new[rpos] = ss[apos];
 	rpos++;
       }
   new[rpos] = '\0';
   if (alen != strlen(ss))
     { squid_errno = SQERR_PARAMETER; free(new); return 0; }
   *ret_s = new;
   return 1;
 }
 
 
 /* Function: DealignedLength()
  *
  * Purpose:  Count the number of non-gap symbols in seq.
  *           (i.e. find the length of the unaligned sequence)
  * 
  * Args:     aseq - aligned sequence to count symbols in, \0 terminated
  * 
  * Return:   raw length of seq.
  */
 int
 DealignedLength(char *aseq)
 {
   int rlen; 
   for (rlen = 0; *aseq; aseq++)
     if (! isgap(*aseq)) rlen++;
   return rlen;
 }
 
 
 /* Function: WritePairwiseAlignment()
  * 
  * Purpose:  Write a nice formatted pairwise alignment out,
  *           with a BLAST-style middle line showing identities
  *           as themselves (single letter) and conservative
  *           changes as '+'.
  *           
  * Args:     ofp          - open fp to write to (stdout, perhaps)
  *           aseq1, aseq2 - alignments to write (not necessarily 
  *                          flushed right with gaps)
  *           name1, name2 - names of sequences
  *           spos1, spos2 - starting position in each (raw) sequence
  *           pam          - PAM matrix; positive values define
  *                          conservative changes
  *           indent       - how many extra spaces to print on left
  *           
  * Return:  1 on success, 0 on failure
  */
 int
 WritePairwiseAlignment(FILE *ofp,
 		       char *aseq1, char *name1, int spos1,
 		       char *aseq2, char *name2, int spos2,
 		       int **pam, int indent)
 {
   char sname1[11];              /* shortened name               */
   char sname2[11];             
   int  still_going;		/* True if writing another block */
   char buf1[61];		/* buffer for writing seq1; CPL+1*/
   char bufmid[61];              /* buffer for writing consensus  */
   char buf2[61];
   char *s1, *s2;                /* ptrs into each sequence          */
   int  count1, count2;		/* number of symbols we're writing  */
   int  rpos1, rpos2;		/* position in raw seqs             */
   int  rawcount1, rawcount2;	/* number of nongap symbols written */
   int  apos;
 
   strncpy(sname1, name1, 10);
   sname1[10] = '\0';
   strtok(sname1, WHITESPACE);
 
   strncpy(sname2, name2, 10);
   sname2[10] = '\0';
   strtok(sname2, WHITESPACE);
 
   s1 = aseq1; 
   s2 = aseq2;
   rpos1 = spos1;
   rpos2 = spos2;
 
   still_going = TRUE;
   while (still_going)
     {
       still_going = FALSE;
       
 				/* get next line's worth from both */
       strncpy(buf1, s1, 60); buf1[60] = '\0';
       strncpy(buf2, s2, 60); buf2[60] = '\0';
       count1 = strlen(buf1);
       count2 = strlen(buf2);
 
 				/* is there still more to go? */
       if ((count1 == 60 && s1[60] != '\0') ||
 	  (count2 == 60 && s2[60] != '\0'))
 	still_going = TRUE;
 
 				/* shift seq ptrs by a line */
       s1 += count1;
       s2 += count2;
 
 				/* assemble the consensus line */
       for (apos = 0; apos < count1 && apos < count2; apos++)
 	{
 	  if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
 	    {
 	      if (buf1[apos] == buf2[apos])
 		bufmid[apos] = buf1[apos];
 	      else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
 		bufmid[apos] = '+';
 	      else
 		bufmid[apos] = ' ';
 	    }
 	  else
 	    bufmid[apos] = ' ';
 	}
       bufmid[apos] = '\0';
 
       rawcount1 = 0;
       for (apos = 0; apos < count1; apos++)
 	if (!isgap(buf1[apos])) rawcount1++;
       
       rawcount2 = 0;
       for (apos = 0; apos < count2; apos++)
 	if (!isgap(buf2[apos])) rawcount2++;
 
       (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", 
 		     sname1, rpos1, buf1, rpos1 + rawcount1 -1);
       (void) fprintf(ofp, "%*s                 %s\n", indent, "",
 		     bufmid);
       (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", 
 		     sname2, rpos2, buf2, rpos2 + rawcount2 -1);
       (void) fprintf(ofp, "\n");
 
       rpos1 += rawcount1;
       rpos2 += rawcount2;
     }
 
   return 1;
 }
 
 
 /* Function: MingapAlignment()
  * 
  * Purpose:  Remove all-gap columns from a multiple sequence alignment
  *           and its associated data. The alignment is assumed to be
  *           flushed (all aseqs the same length).
  */
 int
 MingapAlignment(char **aseqs, AINFO *ainfo)
 {
   int apos;			/* position in original alignment */
   int mpos;			/* position in new alignment      */
   int idx;
 
   /* We overwrite aseqs, using its allocated memory.
    */
   for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
     {
 				/* check for all-gap in column */
       for (idx = 0; idx < ainfo->nseq; idx++)
 	if (! isgap(aseqs[idx][apos]))
 	  break;
       if (idx == ainfo->nseq) continue;
 	
 				/* shift alignment and ainfo */
       if (mpos != apos)
 	{
 	  for (idx = 0; idx < ainfo->nseq; idx++)
 	    aseqs[idx][mpos] = aseqs[idx][apos];
 	  
 	  if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos];
 	  if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos];
 	}
       mpos++;
     }
 				/* null terminate everything */
   for (idx = 0; idx < ainfo->nseq; idx++)
     aseqs[idx][mpos] = '\0';
   ainfo->alen = mpos;	/* set new length */
   if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0';
   if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0';
   return 1;
 }
 
 
 
 /* Function: RandomAlignment()
  * 
  * Purpose:  Create a random alignment from raw sequences.
  * 
  *           Ideally, we would like to sample an alignment from the
  *           space of possible alignments according to its probability,
  *           given a prior probability distribution for alignments.
  *           I don't see how to describe such a distribution, let alone
  *           sample it.
  *           
  *           This is a rough approximation that tries to capture some
  *           desired properties. We assume the alignment is generated
  *           by a simple HMM composed of match and insert states.
  *           Given parameters (pop, pex) for the probability of opening
  *           and extending an insertion, we can find the expected number
  *           of match states, M, in the underlying model for each sequence.
  *           We use an average M taken over all the sequences (this is
  *           an approximation. The expectation of M given all the sequence
  *           lengths is a nasty-looking summation.)
  *           
  *           M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) ) 
  *           
  *           Then, we assign positions in each raw sequence onto the M match
  *           states and M+1 insert states of this "HMM", by rolling random
  *           numbers and inserting the (rlen-M) inserted positions randomly
  *           into the insert slots, taking into account the relative probability
  *           of open vs. extend.
  *           
  *           The resulting alignment has two desired properties: insertions
  *           tend to follow the HMM-like exponential distribution, and
  *           the "sparseness" of the alignment is controllable through
  *           pop and pex.
  *           
  * Args:     rseqs     - raw sequences to "align", 0..nseq-1
  *           sqinfo    - array of 0..nseq-1 info structures for the sequences
  *           nseq      - number of sequences   
  *           pop       - probability to open insertion (0<pop<1)
  *           pex       - probability to extend insertion (0<pex<1)
  *           ret_aseqs - RETURN: alignment (flushed)
  *           ainfo     - fill in: alignment info
  * 
  * Return:   1 on success, 0 on failure. Sets squid_errno to indicate cause
  *           of failure.
  */                      
 int
 RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex,
 		char ***ret_aseqs, AINFO *ainfo)
 {
   char **aseqs;                 /* RETURN: alignment   */
   int    alen;			/* length of alignment */
   int   *rlen;                  /* lengths of each raw sequence */
   int    M;			/* length of "model"   */
   int  **ins;                   /* insertion counts, 0..nseq-1 by 0..M */
   int   *master_ins;            /* max insertion counts, 0..M */
   int    apos, rpos, idx;
   int    statepos;
   int    count;
   int    minlen;
 
   /* calculate expected length of model, M
    */
   rlen = (int *) MallocOrDie (sizeof(int) * nseq);
   M = 0;
   minlen = 9999999;
   for (idx = 0; idx < nseq; idx++)
     {
       rlen[idx] = strlen(rseqs[idx]);
       M += rlen[idx];
       minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
     }
   M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
   M /= nseq;
   if (M > minlen) M = minlen;
 
   /* make arrays that count insertions in M+1 possible insert states
    */
   ins = (int **) MallocOrDie (sizeof(int *) * nseq);
   master_ins = (int *) MallocOrDie (sizeof(int) * (M+1));
   for (idx = 0; idx < nseq; idx++)
     {
       ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1));
       for (rpos = 0; rpos <= M; rpos++)
 	ins[idx][rpos] = 0;
     }
 				/* normalize */
   pop = pop / (pop+pex);
   pex = 1.0 - pop;
 				/* make insertions for individual sequences */
   for (idx = 0; idx < nseq; idx++)
     {
       apos = -1;
       for (rpos = 0; rpos < rlen[idx]-M; rpos++)
 	{
 	  if (sre_random() < pop || apos == -1)	/* open insertion */
 	    apos = CHOOSE(M+1);        /* choose 0..M */
 	  ins[idx][apos]++;
 	}
     }
 				/* calculate master_ins, max inserts */
   alen = M;
   for (apos = 0; apos <= M; apos++)
     {
       master_ins[apos] = 0;
       for (idx = 0; idx < nseq; idx++)
 	if (ins[idx][apos] > master_ins[apos])
 	  master_ins[apos] = ins[idx][apos];
       alen += master_ins[apos];
     }
 
 
   /* Now, construct alignment
    */
   aseqs = (char **) MallocOrDie (sizeof (char *) * nseq);
   for (idx = 0; idx < nseq; idx++)
     aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1));
   for (idx = 0; idx < nseq; idx++)
     {
       apos = rpos = 0;
 
       for (statepos = 0; statepos <= M; statepos++)
 	{
 	  for (count = 0; count < ins[idx][statepos]; count++)
 	    aseqs[idx][apos++] = rseqs[idx][rpos++];
 	  for (; count < master_ins[statepos]; count++)
 	    aseqs[idx][apos++] = ' ';
 
 	  if (statepos != M)
 	    aseqs[idx][apos++] = rseqs[idx][rpos++];
 	}
       aseqs[idx][alen] = '\0';
     }
   ainfo->flags = 0;
   ainfo->alen  = alen; 
   ainfo->nseq  = nseq;
   ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq);
   for (idx = 0; idx < nseq; idx++)
     SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
 
   free(rlen);
   free(master_ins);
   Free2DArray((void **) ins, nseq);
   *ret_aseqs = aseqs;
   return 1;
 }
 
 /* Function: AlignmentHomogenousGapsym()
  * Date:     SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis]
  *
  * Purpose:  Sometimes we've got to convert alignments to 
  *           a lowest common denominator, and we need
  *           a single specific gap character -- for example,
  *           PSI-BLAST blastpgp -B takes a very simplistic
  *           alignment input format which appears to only
  *           allow '-' as a gap symbol.
  *           
  *           Anything matching the isgap() macro is
  *           converted.
  *
  * Args:     aseq   - aligned character strings, [0..nseq-1][0..alen-1]
  *           nseq   - number of aligned strings
  *           alen   - length of alignment
  *           gapsym - character to use for gaps.         
  *
  * Returns:  void ("never fails")
  */
 void
 AlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym)
 {
   int i, apos;
 
   for (i = 0; i < nseq; i++)
     for (apos = 0; apos < alen; apos++)
       if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym;
 }