/***************************************************************** * SQUID - a library of functions for biological sequence analysis * Copyright (C) 1992-2002 Washington University School of Medicine * * This source code is freely distributed under the terms of the * GNU General Public License. See the files COPYRIGHT and LICENSE * for details. *****************************************************************/ /* clustal.c * SRE, Sun Jun 6 17:50:45 1999 [bus from Madison, 1999 worm mtg] * * Import/export of ClustalV/W multiple sequence alignment * formatted files. Derivative of msf.c; MSF is a pretty * generic interleaved format. * * RCS $Id: clustal.c 277 2013-05-16 15:42:49Z fabian $ (Original squid RCS Id: clustal.c,v 1.1 1999/07/15 22:26:53 eddy Exp) */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> #include "squid.h" #include "msa.h" #include "../clustal/log.h" #ifdef CLUSTALO /* needed for PACKAGE_VERSION */ #include "../config.h" /* DD: isnumber is in BSD/OSX but not GCC/Linux */ #ifndef isnumber #define isnumber(c) ( (c>='0') && (c<='9')) #endif #ifndef min #define min( a, b ) ( ((a) < (b)) ? (a) : (b) ) #endif /*These are all the positively scoring groups that occur in the Gonnet Pam250 matrix. There are strong and weak groups, defined as strong score >0.5 and weak score =<0.5. Strong matching columns to be assigned ':' and weak matches assigned '.' in the clustal output format. amino_strong = res_cat1 amino_weak = res_cat2 */ char *amino_strong[] = {"STA", "NEQK", "NHQK", "NDEQ", "QHRK", "MILV", "MILF", "HY", "FYW", NULL}; char *amino_weak[] = {"CSA", "ATV", "SAG", "STNK", "STPA", "SGND", "SNDEQK", "NDEQHK", "NEQHRK", "FVLIM", "HFY", NULL}; #endif #ifdef TESTDRIVE_CLUSTAL /***************************************************************** * msf.c test driver: * cc -DTESTDRIVE_CLUSTAL -g -O2 -Wall -o test clustal.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c -lm * */ int main(int argc, char **argv) { MSAFILE *afp; MSA *msa; char *file; file = argv[1]; if ((afp = MSAFileOpen(file, MSAFILE_CLUSTAL, NULL)) == NULL) Die("Couldn't open %s\n", file); while ((msa = ReadClustal(afp)) != NULL) { #ifdef CLUSTALO #define LINE_WRAP 60 WriteClustal(stdout, msa, LINE_WRAP); #else WriteClustal(stdout, msa); #endif MSAFree(msa); } MSAFileClose(afp); throw(ClustalOmegaException, "0"); } /******************************************************************/ #endif /* testdrive_clustal */ /* Function: ReadClustal() * Date: SRE, Sun Jun 6 17:53:49 1999 [bus from Madison, 1999 worm mtg] * * Purpose: Parse an alignment read from an open Clustal format * alignment file. Clustal is a single-alignment format. * Return the alignment, or NULL if we have no data. * * Args: afp - open alignment file * * Returns: MSA * - an alignment object * caller responsible for an MSAFree() * NULL if no more alignments * * Diagnostics: * Will Die() here with a (potentially) useful message * if a parsing error occurs. */ MSA * ReadClustal(MSAFILE *afp) { MSA *msa; char *s; int slen; int sqidx; char *name; char *seq; char *s2; if (feof(afp->f)) return NULL; /* Skip until we see the CLUSTAL header */ while ((s = MSAFileGetLine(afp)) != NULL) { if (strncmp(s, "CLUSTAL", 7) == 0 && strstr(s, "multiple sequence alignment") != NULL) break; } if (s == NULL) return NULL; msa = MSAAlloc(10, 0); /* Now we're in the sequence section. * As discussed above, if we haven't seen a sequence name, then we * don't include the sequence in the alignment. * Watch out for conservation markup lines that contain *.: chars */ while ((s = MSAFileGetLine(afp)) != NULL) { if ((name = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; if ((seq = sre_strtok(&s, WHITESPACE, &slen)) == NULL) continue; s2 = sre_strtok(&s, "\n", NULL); /* The test for a conservation markup line */ if (strpbrk(name, ".*:") != NULL && strpbrk(seq, ".*:") != NULL) continue; #ifdef CLUSTALO /* extra bit at the end of a line might be the unaligned residue count */ if (s2 != NULL) { int i; for (i=0; i<strlen(s2); i++) { if (! isnumber(s2[i])) { Die("Parse failed at line %d, file %s: possibly using spaces as gaps", afp->linenumber, afp->fname); } } } #else if (s2 != NULL) Die("Parse failed at line %d, file %s: possibly using spaces as gaps", afp->linenumber, afp->fname); #endif /* It's not blank, and it's not a coord line: must be sequence */ sqidx = MSAGetSeqidx(msa, name, msa->lastidx+1); msa->lastidx = sqidx; msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen); } MSAVerifyParse(msa); /* verifies, and also sets alen and wgt. */ return msa; } /* Function: WriteClustal() * Date: SRE, Sun Jun 6 18:12:47 1999 [bus from Madison, worm mtg 1999] * * Purpose: Write an alignment in Clustal format to an open file. * * Args: fp - file that's open for writing. * msa - alignment to write. * * Returns: (void) */ void #ifdef CLUSTALO WriteClustal(FILE *fp, MSA *msa, int iWrap, int bResno) #else WriteClustal(FILE *fp, MSA *msa) #endif { int idx; /* counter for sequences */ int len; /* tmp variable for name lengths */ int namelen; /* maximum name length used */ int pos; /* position counter */ #ifdef CLUSTALO char *buf; /* buffer for writing seq */ int cpl = msa->alen<iWrap ? msa->alen+10 : (iWrap > 0 ? iWrap : 60); /* char per line (< 64) */ #else char buf[80]; /* buffer for writing seq */ int cpl = 60; /* char per line (< 64) */ #endif /* consensus line stuff */ int subpos; char first; int bail; int strong_bins[9]; int weak_bins[11]; int cons; int bin; #ifdef CLUSTALO int *piResCnt = NULL; #endif #ifdef CLUSTALO if (1 == bResno){ if (NULL == (piResCnt = (int *)malloc(msa->nseq * sizeof(int)))){ Die("%s:%s:%d: could not malloc %d int for residue count", __FUNCTION__, __FILE__, __LINE__, msa->nseq); } else { memset(piResCnt, 0, msa->nseq * sizeof(int)); } } /* do print residue numbers */ if (NULL == (buf = (char *)malloc(cpl+20))){ Die("%s:%s:%d: could not malloc %d char for buffer", __FUNCTION__, __FILE__, __LINE__, cpl+20); } else { memset(buf, 0, cpl+20); } #endif /* calculate max namelen used */ namelen = 0; for (idx = 0; idx < msa->nseq; idx++) if ((len = strlen(msa->sqname[idx])) > namelen) namelen = len; #ifdef CLUSTALO fprintf(fp, "ClustalOmega %s \n", PACKAGE_VERSION); #else fprintf(fp, "CLUSTAL W(1.5) multiple sequence alignment\n"); #endif /***************************************************** * Write the sequences *****************************************************/ #ifdef CLUSTALO fprintf(fp, "\n"); /* original had two blank lines */ #endif for (pos = 0; pos < msa->alen; pos += cpl) { fprintf(fp, "\n"); /* Blank line between sequence blocks */ for (idx = 0; idx < msa->nseq; idx++) { strncpy(buf, msa->aseq[idx] + pos, cpl); buf[cpl] = '\0'; #ifdef CLUSTALO if (1 == bResno){ char *pc = NULL; for (pc = buf; *pc != '\0'; pc++){ if ( ( (*pc >= 'a') && (*pc <= 'z') ) || ( (*pc >= 'A') && (*pc <= 'Z') ) ){ piResCnt[idx]++; } } fprintf(fp, "%-*s %s\t%d\n", namelen+5, msa->sqname[idx], buf, piResCnt[idx]); } else { fprintf(fp, "%-*s %s\n", namelen+5, msa->sqname[idx], buf); } #else fprintf(fp, "%*s %s\n", namelen, msa->sqname[idx], buf); #endif } #ifdef CLUSTALO /* do consensus dots */ /* print namelen+5 spaces */ for(subpos = 0; subpos <= namelen+5; subpos++) fprintf(fp, " "); for(subpos = pos; subpos < min(pos + cpl, msa->alen); subpos++) { /* see if 100% conservation */ first = msa->aseq[0][subpos]; bail = 0; for (idx = 1; idx < msa->nseq; idx++) { if(msa->aseq[idx][subpos] != first) { bail = 1; break; } } if(!bail) fprintf(fp, "*"); else { /* if not then check strong */ for(bin = 0; bin < 9; bin++) strong_bins[bin] = 0; /* clear the bins */ for(idx = 0; idx < msa->nseq; idx++) { switch(msa->aseq[idx][subpos]) { case 'S': strong_bins[0]++; break; case 'T': strong_bins[0]++; break; case 'A': strong_bins[0]++; break; case 'N': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; break; case 'E': strong_bins[1]++; strong_bins[3]++; break; case 'Q': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; strong_bins[4]++; break; case 'K': strong_bins[1]++; strong_bins[2]++; strong_bins[4]++; break; case 'D': strong_bins[3]++; break; case 'R': strong_bins[4]++; break; case 'H': strong_bins[4]++; strong_bins[7]++; break; case 'M': strong_bins[5]++; strong_bins[6]++; break; case 'I': strong_bins[5]++; strong_bins[6]++; break; case 'L': strong_bins[5]++; strong_bins[6]++; break; case 'V': strong_bins[5]++; break; case 'F': strong_bins[6]++; strong_bins[8]++; break; case 'Y': strong_bins[7]++; strong_bins[8]++; break; case 'W': strong_bins[8]++; break; } } bail = 0; for(bin = 0; bin < 9; bin++) if(strong_bins[bin] == msa->nseq) { bail = 1; break; } if(bail) fprintf(fp, ":"); else { /* check weak */ for(bin = 0; bin < 11; bin++) weak_bins[bin] = 0; /* clear the bins */ for(idx = 0; idx < msa->nseq; idx++) { switch(msa->aseq[idx][subpos]) { case 'C': weak_bins[0]++; break; case 'S': weak_bins[0]++; weak_bins[2]++; weak_bins[3]++; weak_bins[4]++; weak_bins[5]++; weak_bins[6]++; break; case 'A': weak_bins[0]++; weak_bins[1]++; weak_bins[2]++; weak_bins[4]++; break; case 'T': weak_bins[1]++; weak_bins[3]++; weak_bins[4]++; break; case 'V': weak_bins[1]++; weak_bins[9]++; break; case 'G': weak_bins[2]++; break; case 'N': weak_bins[3]++; weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; case 'K': weak_bins[3]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; case 'D': weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; break; case 'E': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; case 'Q': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; case 'H': weak_bins[7]++; weak_bins[8]++; weak_bins[10]++; break; case 'R': weak_bins[8]++; break; case 'F': weak_bins[9]++; weak_bins[10]++; break; case 'L': weak_bins[9]++; break; case 'I': weak_bins[9]++; break; case 'M': weak_bins[9]++; break; case 'Y': weak_bins[10]++; break; } } bail = 0; for(bin = 0; bin < 11; bin++) if(weak_bins[bin] == msa->nseq) { bail = 1; break; } if(bail) fprintf(fp, "."); else fprintf(fp, " "); } } } fprintf(fp,"\n"); #endif } #ifdef CLUSTALO free(piResCnt); piResCnt = NULL; #endif return; } /* Function: WriteClustalForR() * Date: SRE, Sun Jun 6 18:12:47 1999 [bus from Madison, worm mtg 1999] * * Purpose: Write an alignment in Clustal format to an open file. * * Args: msa - alignment to write. * * Returns: void */ void #ifdef CLUSTALO WriteClustalForR(MSA *msa, int iWrap, int bResno, int *msa_c, char ***msa_v) #else WriteClustalForR(MSA *msa, int *msa_c, char ***msa_v) #endif { //Log(&rLog, LOG_INFO, "WriteClustalForR start"); int idx; /* counter for sequences */ int len; /* tmp variable for name lengths */ int namelen; /* maximum name length used */ int pos; /* position counter */ #ifdef CLUSTALO char *buf; /* buffer for writing seq */ int cpl = msa->alen<iWrap ? msa->alen+10 : (iWrap > 0 ? iWrap : 60); /* char per line (< 64) */ #else char buf[80]; /* buffer for writing seq */ int cpl = 60; /* char per line (< 64) */ #endif /* consensus line stuff */ int subpos; char first; int bail; int strong_bins[9]; int weak_bins[11]; int cons; int bin; #ifdef CLUSTALO int *piResCnt = NULL; #endif #ifdef CLUSTALO if (1 == bResno) { if (NULL == (piResCnt = (int *)malloc(msa->nseq * sizeof(int)))) { Die("%s:%s:%d: could not malloc %d int for residue count", __FUNCTION__, __FILE__, __LINE__, msa->nseq); } else { memset(piResCnt, 0, msa->nseq * sizeof(int)); } } /* do print residue numbers */ if (NULL == (buf = (char *)malloc(cpl+20))) { Die("%s:%s:%d: could not malloc %d char for buffer", __FUNCTION__, __FILE__, __LINE__, cpl+20); } else { memset(buf, 0, cpl+20); } #endif /* calculate max namelen used */ namelen = 0; for (idx = 0; idx < msa->nseq; idx++) if ((len = strlen(msa->sqname[idx])) > namelen) namelen = len; int vecSize = ((int) (msa->alen / cpl) + 1) * (msa->nseq + 2) + 3; *msa_c = vecSize; *msa_v = malloc(sizeof(char*) * vecSize); int a_idx = 0; (*msa_v)[a_idx] = malloc((512) * sizeof(char*)); #ifdef CLUSTALO //fprintf(fp, "CLUSTAL O(%s) multiple sequence alignment\n", PACKAGE_VERSION); sprintf((*msa_v)[a_idx++], "ClustalOmega %s", PACKAGE_VERSION); #else //fprintf(fp, "CLUSTAL W(1.5) multiple sequence alignment\n"); sprintf((*msa_v)[a_idx++], "CLUSTAL W(1.5) multiple sequence alignment"); #endif /***************************************************** * Write the sequences *****************************************************/ #ifdef CLUSTALO //fprintf(fp, "\n"); /* Blank line between sequence blocks */ (*msa_v)[a_idx] = malloc((2) * sizeof(char*)); sprintf((*msa_v)[a_idx++], " "); //(*msa_v)[a_idx++] = '\0'; #endif for (pos = 0; pos < msa->alen; pos += cpl) { //fprintf(fp, "\n"); /* Blank line between sequence blocks */ (*msa_v)[a_idx] = malloc((2) * sizeof(char*)); sprintf((*msa_v)[a_idx++], " "); //(*msa_v)[a_idx++] = '\0'; for (idx = 0; idx < msa->nseq; idx++) { //char cmd[512]; (*msa_v)[a_idx] = malloc((512) * sizeof(char*)); strncpy(buf, msa->aseq[idx] + pos, cpl); buf[cpl] = '\0'; #ifdef CLUSTALO if (1 == bResno) { char *pc = NULL; for (pc = buf; *pc != '\0'; pc++) { if ( ( (*pc >= 'a') && (*pc <= 'z') ) || ( (*pc >= 'A') && (*pc <= 'Z') ) ) { piResCnt[idx]++; } } //fprintf(fp, "%-*s %s\t%d\n", namelen+5, msa->sqname[idx], buf, piResCnt[idx]); sprintf((*msa_v)[a_idx++], "%-*s %s\t%d", namelen+5, msa->sqname[idx], buf, piResCnt[idx]); } else { //fprintf(fp, "%-*s %s\n", namelen+5, msa->sqname[idx], buf); sprintf((*msa_v)[a_idx++], "%-*s %s", namelen+5, msa->sqname[idx], buf); } #else //fprintf(fp, "%*s %s\n", namelen, msa->sqname[idx], buf); sprintf((*msa_v)[a_idx++], "%*s %s", namelen, msa->sqname[idx], buf); #endif //Log(&rLog, LOG_INFO, "%s", (*msa_v)[(a_idx - 1)]); } #ifdef CLUSTALO /* do consensus dots */ //char cmd[256] = ""; (*msa_v)[a_idx] = malloc((512) * sizeof(char*)); char *cur = (*msa_v)[a_idx]; int conpos = 0; /* print namelen+5 spaces */ for(subpos = 0; subpos <= namelen+5; subpos++) { //fprintf(fp, " "); //snprintf(cmd + strlen(cmd), (sizeof cmd) - strlen(cmd), " "); conpos += sprintf(cur + conpos, " "); } for(subpos = pos; subpos < min(pos + cpl, msa->alen); subpos++) { /* see if 100% conservation */ first = msa->aseq[0][subpos]; bail = 0; for (idx = 1; idx < msa->nseq; idx++) { if(msa->aseq[idx][subpos] != first) { bail = 1; break; } } if(!bail) //fprintf(fp, "*"); conpos += sprintf(cur + conpos, "*"); else { /* if not then check strong */ for(bin = 0; bin < 9; bin++) strong_bins[bin] = 0; /* clear the bins */ for(idx = 0; idx < msa->nseq; idx++) { switch(msa->aseq[idx][subpos]) { case 'S': strong_bins[0]++; break; case 'T': strong_bins[0]++; break; case 'A': strong_bins[0]++; break; case 'N': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; break; case 'E': strong_bins[1]++; strong_bins[3]++; break; case 'Q': strong_bins[1]++; strong_bins[2]++; strong_bins[3]++; strong_bins[4]++; break; case 'K': strong_bins[1]++; strong_bins[2]++; strong_bins[4]++; break; case 'D': strong_bins[3]++; break; case 'R': strong_bins[4]++; break; case 'H': strong_bins[4]++; strong_bins[7]++; break; case 'M': strong_bins[5]++; strong_bins[6]++; break; case 'I': strong_bins[5]++; strong_bins[6]++; break; case 'L': strong_bins[5]++; strong_bins[6]++; break; case 'V': strong_bins[5]++; break; case 'F': strong_bins[6]++; strong_bins[8]++; break; case 'Y': strong_bins[7]++; strong_bins[8]++; break; case 'W': strong_bins[8]++; break; } } bail = 0; for(bin = 0; bin < 9; bin++) if(strong_bins[bin] == msa->nseq) { bail = 1; break; } if(bail) //fprintf(fp, ":"); conpos += sprintf(cur + conpos, ":"); else { /* check weak */ for(bin = 0; bin < 11; bin++) weak_bins[bin] = 0; /* clear the bins */ for(idx = 0; idx < msa->nseq; idx++) { switch(msa->aseq[idx][subpos]) { case 'C': weak_bins[0]++; break; case 'S': weak_bins[0]++; weak_bins[2]++; weak_bins[3]++; weak_bins[4]++; weak_bins[5]++; weak_bins[6]++; break; case 'A': weak_bins[0]++; weak_bins[1]++; weak_bins[2]++; weak_bins[4]++; break; case 'T': weak_bins[1]++; weak_bins[3]++; weak_bins[4]++; break; case 'V': weak_bins[1]++; weak_bins[9]++; break; case 'G': weak_bins[2]++; break; case 'N': weak_bins[3]++; weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; case 'K': weak_bins[3]++; weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; case 'D': weak_bins[5]++; weak_bins[6]++; weak_bins[7]++; break; case 'E': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; case 'Q': weak_bins[6]++; weak_bins[7]++; weak_bins[8]++; break; case 'H': weak_bins[7]++; weak_bins[8]++; weak_bins[10]++; break; case 'R': weak_bins[8]++; break; case 'F': weak_bins[9]++; weak_bins[10]++; break; case 'L': weak_bins[9]++; break; case 'I': weak_bins[9]++; break; case 'M': weak_bins[9]++; break; case 'Y': weak_bins[10]++; break; } } bail = 0; for(bin = 0; bin < 11; bin++) if(weak_bins[bin] == msa->nseq) { bail = 1; break; } if(bail) //fprintf(fp, "."); conpos += sprintf(cur + conpos, "."); else //fprintf(fp, " "); conpos += sprintf(cur + conpos, " "); } } } //fprintf(fp,"\n"); //(*msa_v)[a_idx++] = cmd; a_idx++; //Log(&rLog, LOG_INFO, "%s", (*msa_v)[(a_idx-1)]); #endif } //Log(&rLog, LOG_INFO, "index %i", a_idx); while (a_idx < vecSize) { (*msa_v)[a_idx] = malloc((2) * sizeof(char*)); sprintf((*msa_v)[a_idx++], " "); //(*msa_v)[a_idx++] = '\0'; } #ifdef CLUSTALO free(piResCnt); piResCnt = NULL; #endif free(buf); return; }