Browse code

add package to the repository

msa


git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358

Sonali Arora authored on 10/04/2015 00:12:33
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,174 @@
1
+/*****************************************************************
2
+ * SQUID - a library of functions for biological sequence analysis
3
+ * Copyright (C) 1992-2002 Washington University School of Medicine
4
+ * 
5
+ *     This source code is freely distributed under the terms of the
6
+ *     GNU General Public License. See the files COPYRIGHT and LICENSE
7
+ *     for details.
8
+ *****************************************************************/
9
+
10
+/* dayhoff.c
11
+ * 
12
+ * Routines for dealing with PAM matrices.
13
+ * 
14
+ * Includes:
15
+ *    ParsePAMFile()  -- read a PAM matrix from disk.
16
+ *    
17
+ *    
18
+ * SRE - Fri Apr  2 11:23:45 1993   
19
+ * RCS $Id: dayhoff.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: dayhoff.c,v 1.5 2002/07/03 15:03:39 eddy Exp)
20
+ */
21
+
22
+
23
+#include <stdlib.h>
24
+#include <stdio.h>
25
+#include <string.h>
26
+#include <math.h>
27
+#include <ctype.h>
28
+#include "squid.h"
29
+
30
+/* Function: ParsePAMFile()
31
+ * 
32
+ * Purpose:  Given a pointer to an open file containing a PAM matrix,
33
+ *           parse the file and allocate and fill a 2D array of
34
+ *           floats containing the matrix. The PAM file is
35
+ *           assumed to be in the format that NCBI distributes
36
+ *           with BLAST. BLOSUM matrices also work fine, as
37
+ *           produced by Henikoff's program "MATBLAS".
38
+ *          
39
+ *           Parses both old format and new format BLAST matrices.
40
+ *           Old format just had rows of integers.
41
+ *           New format includes a leading character on each row.
42
+ *
43
+ *           The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*.
44
+ *           Note that it's not a 20x20 matrix as you might expect;
45
+ *           this is for speed of indexing as well as the ability
46
+ *           to deal with ambiguous characters.
47
+ *           
48
+ * Args:     fp        - open PAM file
49
+ *           ret_pam   - RETURN: pam matrix, integers                   
50
+ *           ret_scale - RETURN: scale factor for converting
51
+ *                       to real Sij. For instance, PAM120 is
52
+ *                       given in units of ln(2)/2. This may
53
+ *                       be passed as NULL if the caller
54
+ *                       doesn't care.
55
+ * 
56
+ * Returns:  1 on success; 0 on failure and sets squid_errno to
57
+ *           indicate the cause. ret_pam is allocated here and
58
+ *           must be freed by the caller (use FreePAM).
59
+ */
60
+int
61
+ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale)
62
+{
63
+  int    **pam;
64
+  char     buffer[512];		/* input buffer from fp                  */
65
+  int      order[27];		/* order of fields, obtained from header */
66
+  int      nsymbols;		/* total number of symbols in matrix     */
67
+  char    *sptr;
68
+  int      idx;
69
+  int      row, col;
70
+  float    scale;
71
+  int      gotscale = FALSE;
72
+  
73
+  scale = 0.0;		/* just to silence gcc uninit warnings */
74
+  if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; }
75
+  
76
+  /* Look at the first non-blank, non-comment line in the file.
77
+   * It gives single-letter codes in the order the PAM matrix
78
+   * is arrayed in the file. 
79
+   */
80
+  do {
81
+    if (fgets(buffer, 512, fp) == NULL) 
82
+      { squid_errno = SQERR_NODATA; return 0; }
83
+
84
+    /* Get the scale factor from the header.
85
+     * For BLOSUM files, we assume the line looks like:
86
+     *     BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
87
+     * and we assume that the fraction is always 1/x;
88
+     * 
89
+     * For PAM files, we assume the line looks like:
90
+     *     PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574
91
+     * and we assume that the number following the final '=' is our scale
92
+     */
93
+    if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL &&
94
+	(sptr = strchr(buffer, '/')) != NULL)
95
+      {
96
+	sptr++;
97
+	if (! isdigit((int) (*sptr))) { squid_errno = SQERR_FORMAT; return 0; }
98
+	scale = (float) (log(2.0) / atof(sptr));
99
+	gotscale = TRUE;
100
+      }
101
+    else if (strstr(buffer, "substitution matrix,") != NULL)
102
+      {
103
+	while ((sptr = strrchr(buffer, '=')) != NULL) {
104
+	  sptr += 2;
105
+	  if (IsReal(sptr)) {
106
+	    scale = atof(sptr);
107
+	    gotscale = TRUE;
108
+	    break;
109
+	  }
110
+	}
111
+      }
112
+  } while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#');
113
+
114
+  idx = 0;
115
+  do {
116
+    order[idx] = (int) *sptr - (int) 'A';
117
+    if (order[idx] < 0 || order[idx] > 25) order[idx] = 26;
118
+    idx++;
119
+  } while ((sptr = strtok(NULL, " \t\n")) != NULL);
120
+  nsymbols = idx;
121
+  
122
+  /* Allocate a pam matrix. For speed of indexing, we use
123
+   * a 27x27 matrix so we can do lookups using the ASCII codes
124
+   * of amino acid single-letter representations, plus one
125
+   * extra field to deal with the "*" (terminators).
126
+   */
127
+  if ((pam = (int **) calloc (27, sizeof(int *))) == NULL)
128
+    Die("calloc failed");
129
+  for (idx = 0; idx < 27; idx++)
130
+    if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL)
131
+      Die("calloc failed");
132
+
133
+  /* Parse the rest of the file.
134
+   */
135
+  for (row = 0; row < nsymbols; row++)
136
+    {
137
+      if (fgets(buffer, 512, fp) == NULL) 
138
+	{ squid_errno = SQERR_NODATA; return 0; }
139
+
140
+      if ((sptr = strtok(buffer, " \t\n")) == NULL)
141
+	{ squid_errno = SQERR_NODATA; return 0; }
142
+      for (col = 0; col < nsymbols; col++)
143
+	{
144
+	  if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; }
145
+
146
+	  /* Watch out for new BLAST format, with leading characters
147
+	   */
148
+	  if (*sptr == '*' || isalpha((int) *sptr))
149
+	    col--;  /* hack hack */
150
+	  else
151
+	    pam [order[row]] [order[col]] = atoi(sptr);
152
+
153
+	  sptr = strtok(NULL, " \t\n");
154
+	}
155
+    }
156
+  
157
+  /* Return
158
+   */
159
+  if (ret_scale != NULL)
160
+    {
161
+      if (gotscale) *ret_scale = scale;
162
+      else
163
+	{
164
+#ifdef CLUSTALO
165
+	  Warning("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
166
+#else
167
+	  Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!");
168
+#endif
169
+	  *ret_scale = log(2.0) / 2.0;
170
+	}
171
+    }
172
+  *ret_pam = pam;
173
+  return 1;
174
+}