msa
git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/msa@102253 bc3139a8-67e5-0310-9ffc-ced21a209358
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,174 @@ |
1 |
+/***************************************************************** |
|
2 |
+ * SQUID - a library of functions for biological sequence analysis |
|
3 |
+ * Copyright (C) 1992-2002 Washington University School of Medicine |
|
4 |
+ * |
|
5 |
+ * This source code is freely distributed under the terms of the |
|
6 |
+ * GNU General Public License. See the files COPYRIGHT and LICENSE |
|
7 |
+ * for details. |
|
8 |
+ *****************************************************************/ |
|
9 |
+ |
|
10 |
+/* dayhoff.c |
|
11 |
+ * |
|
12 |
+ * Routines for dealing with PAM matrices. |
|
13 |
+ * |
|
14 |
+ * Includes: |
|
15 |
+ * ParsePAMFile() -- read a PAM matrix from disk. |
|
16 |
+ * |
|
17 |
+ * |
|
18 |
+ * SRE - Fri Apr 2 11:23:45 1993 |
|
19 |
+ * RCS $Id: dayhoff.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: dayhoff.c,v 1.5 2002/07/03 15:03:39 eddy Exp) |
|
20 |
+ */ |
|
21 |
+ |
|
22 |
+ |
|
23 |
+#include <stdlib.h> |
|
24 |
+#include <stdio.h> |
|
25 |
+#include <string.h> |
|
26 |
+#include <math.h> |
|
27 |
+#include <ctype.h> |
|
28 |
+#include "squid.h" |
|
29 |
+ |
|
30 |
+/* Function: ParsePAMFile() |
|
31 |
+ * |
|
32 |
+ * Purpose: Given a pointer to an open file containing a PAM matrix, |
|
33 |
+ * parse the file and allocate and fill a 2D array of |
|
34 |
+ * floats containing the matrix. The PAM file is |
|
35 |
+ * assumed to be in the format that NCBI distributes |
|
36 |
+ * with BLAST. BLOSUM matrices also work fine, as |
|
37 |
+ * produced by Henikoff's program "MATBLAS". |
|
38 |
+ * |
|
39 |
+ * Parses both old format and new format BLAST matrices. |
|
40 |
+ * Old format just had rows of integers. |
|
41 |
+ * New format includes a leading character on each row. |
|
42 |
+ * |
|
43 |
+ * The PAM matrix is a 27x27 matrix, 0=A..25=Z,26=*. |
|
44 |
+ * Note that it's not a 20x20 matrix as you might expect; |
|
45 |
+ * this is for speed of indexing as well as the ability |
|
46 |
+ * to deal with ambiguous characters. |
|
47 |
+ * |
|
48 |
+ * Args: fp - open PAM file |
|
49 |
+ * ret_pam - RETURN: pam matrix, integers |
|
50 |
+ * ret_scale - RETURN: scale factor for converting |
|
51 |
+ * to real Sij. For instance, PAM120 is |
|
52 |
+ * given in units of ln(2)/2. This may |
|
53 |
+ * be passed as NULL if the caller |
|
54 |
+ * doesn't care. |
|
55 |
+ * |
|
56 |
+ * Returns: 1 on success; 0 on failure and sets squid_errno to |
|
57 |
+ * indicate the cause. ret_pam is allocated here and |
|
58 |
+ * must be freed by the caller (use FreePAM). |
|
59 |
+ */ |
|
60 |
+int |
|
61 |
+ParsePAMFile(FILE *fp, int ***ret_pam, float *ret_scale) |
|
62 |
+{ |
|
63 |
+ int **pam; |
|
64 |
+ char buffer[512]; /* input buffer from fp */ |
|
65 |
+ int order[27]; /* order of fields, obtained from header */ |
|
66 |
+ int nsymbols; /* total number of symbols in matrix */ |
|
67 |
+ char *sptr; |
|
68 |
+ int idx; |
|
69 |
+ int row, col; |
|
70 |
+ float scale; |
|
71 |
+ int gotscale = FALSE; |
|
72 |
+ |
|
73 |
+ scale = 0.0; /* just to silence gcc uninit warnings */ |
|
74 |
+ if (fp == NULL) { squid_errno = SQERR_NODATA; return 0; } |
|
75 |
+ |
|
76 |
+ /* Look at the first non-blank, non-comment line in the file. |
|
77 |
+ * It gives single-letter codes in the order the PAM matrix |
|
78 |
+ * is arrayed in the file. |
|
79 |
+ */ |
|
80 |
+ do { |
|
81 |
+ if (fgets(buffer, 512, fp) == NULL) |
|
82 |
+ { squid_errno = SQERR_NODATA; return 0; } |
|
83 |
+ |
|
84 |
+ /* Get the scale factor from the header. |
|
85 |
+ * For BLOSUM files, we assume the line looks like: |
|
86 |
+ * BLOSUM Clustered Scoring Matrix in 1/2 Bit Units |
|
87 |
+ * and we assume that the fraction is always 1/x; |
|
88 |
+ * |
|
89 |
+ * For PAM files, we assume the line looks like: |
|
90 |
+ * PAM 120 substitution matrix, scale = ln(2)/2 = 0.346574 |
|
91 |
+ * and we assume that the number following the final '=' is our scale |
|
92 |
+ */ |
|
93 |
+ if (strstr(buffer, "BLOSUM Clustered Scoring Matrix") != NULL && |
|
94 |
+ (sptr = strchr(buffer, '/')) != NULL) |
|
95 |
+ { |
|
96 |
+ sptr++; |
|
97 |
+ if (! isdigit((int) (*sptr))) { squid_errno = SQERR_FORMAT; return 0; } |
|
98 |
+ scale = (float) (log(2.0) / atof(sptr)); |
|
99 |
+ gotscale = TRUE; |
|
100 |
+ } |
|
101 |
+ else if (strstr(buffer, "substitution matrix,") != NULL) |
|
102 |
+ { |
|
103 |
+ while ((sptr = strrchr(buffer, '=')) != NULL) { |
|
104 |
+ sptr += 2; |
|
105 |
+ if (IsReal(sptr)) { |
|
106 |
+ scale = atof(sptr); |
|
107 |
+ gotscale = TRUE; |
|
108 |
+ break; |
|
109 |
+ } |
|
110 |
+ } |
|
111 |
+ } |
|
112 |
+ } while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#'); |
|
113 |
+ |
|
114 |
+ idx = 0; |
|
115 |
+ do { |
|
116 |
+ order[idx] = (int) *sptr - (int) 'A'; |
|
117 |
+ if (order[idx] < 0 || order[idx] > 25) order[idx] = 26; |
|
118 |
+ idx++; |
|
119 |
+ } while ((sptr = strtok(NULL, " \t\n")) != NULL); |
|
120 |
+ nsymbols = idx; |
|
121 |
+ |
|
122 |
+ /* Allocate a pam matrix. For speed of indexing, we use |
|
123 |
+ * a 27x27 matrix so we can do lookups using the ASCII codes |
|
124 |
+ * of amino acid single-letter representations, plus one |
|
125 |
+ * extra field to deal with the "*" (terminators). |
|
126 |
+ */ |
|
127 |
+ if ((pam = (int **) calloc (27, sizeof(int *))) == NULL) |
|
128 |
+ Die("calloc failed"); |
|
129 |
+ for (idx = 0; idx < 27; idx++) |
|
130 |
+ if ((pam[idx] = (int *) calloc (27, sizeof(int))) == NULL) |
|
131 |
+ Die("calloc failed"); |
|
132 |
+ |
|
133 |
+ /* Parse the rest of the file. |
|
134 |
+ */ |
|
135 |
+ for (row = 0; row < nsymbols; row++) |
|
136 |
+ { |
|
137 |
+ if (fgets(buffer, 512, fp) == NULL) |
|
138 |
+ { squid_errno = SQERR_NODATA; return 0; } |
|
139 |
+ |
|
140 |
+ if ((sptr = strtok(buffer, " \t\n")) == NULL) |
|
141 |
+ { squid_errno = SQERR_NODATA; return 0; } |
|
142 |
+ for (col = 0; col < nsymbols; col++) |
|
143 |
+ { |
|
144 |
+ if (sptr == NULL) { squid_errno = SQERR_NODATA; return 0; } |
|
145 |
+ |
|
146 |
+ /* Watch out for new BLAST format, with leading characters |
|
147 |
+ */ |
|
148 |
+ if (*sptr == '*' || isalpha((int) *sptr)) |
|
149 |
+ col--; /* hack hack */ |
|
150 |
+ else |
|
151 |
+ pam [order[row]] [order[col]] = atoi(sptr); |
|
152 |
+ |
|
153 |
+ sptr = strtok(NULL, " \t\n"); |
|
154 |
+ } |
|
155 |
+ } |
|
156 |
+ |
|
157 |
+ /* Return |
|
158 |
+ */ |
|
159 |
+ if (ret_scale != NULL) |
|
160 |
+ { |
|
161 |
+ if (gotscale) *ret_scale = scale; |
|
162 |
+ else |
|
163 |
+ { |
|
164 |
+#ifdef CLUSTALO |
|
165 |
+ Warning("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!"); |
|
166 |
+#else |
|
167 |
+ Warn("Failed to parse PAM matrix scale factor. Defaulting to ln(2)/2!"); |
|
168 |
+#endif |
|
169 |
+ *ret_scale = log(2.0) / 2.0; |
|
170 |
+ } |
|
171 |
+ } |
|
172 |
+ *ret_pam = pam; |
|
173 |
+ return 1; |
|
174 |
+} |