Browse code

seqan header files

aguang authored on 15/02/2018 18:04:57
Showing1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,418 @@
1
+// ==========================================================================
2
+//                 SeqAn - The Library for Sequence Analysis
3
+// ==========================================================================
4
+// Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5
+// All rights reserved.
6
+//
7
+// Redistribution and use in source and binary forms, with or without
8
+// modification, are permitted provided that the following conditions are met:
9
+//
10
+//     * Redistributions of source code must retain the above copyright
11
+//       notice, this list of conditions and the following disclaimer.
12
+//     * Redistributions in binary form must reproduce the above copyright
13
+//       notice, this list of conditions and the following disclaimer in the
14
+//       documentation and/or other materials provided with the distribution.
15
+//     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16
+//       its contributors may be used to endorse or promote products derived
17
+//       from this software without specific prior written permission.
18
+//
19
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
+// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28
+// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29
+// DAMAGE.
30
+//
31
+// ==========================================================================
32
+// Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33
+// ==========================================================================
34
+
35
+// TODO(holtgrew): Allow the storage of additional fields.
36
+/* This could look as follows:
37
+
38
+   The values of the additional fields are stored as strings.  Either we store name/value pairs or, more efficiently,
39
+   store a stringset for each match, with the same number of entries.  An additional String/Map maps from field name to
40
+   index in this string sets.
41
+ */
42
+
43
+#ifndef INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
44
+#define INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
45
+
46
+namespace seqan {
47
+
48
+// ============================================================================
49
+// Forwards
50
+// ============================================================================
51
+
52
+// ============================================================================
53
+// Tags, Classes, Enums
54
+// ============================================================================
55
+
56
+/*!
57
+ * @class LocalMatch
58
+ * @headerfile <seqan/parse_lm.h>
59
+ * @brief Stores information about local matches.
60
+ *
61
+ * @signature template <typename TId, typename TPosition>
62
+ *            class LocalMatch;
63
+ *
64
+ * @tparam TId       Type to use for subject/query id.
65
+ * @tparam TPosition Type to use for storing positions.
66
+ *
67
+ * Matches on the reverse-complement are encoded by the begin position being greater than the end position.
68
+ *
69
+ * Sequence names are not stored in LocalMatch objects but in the @link LocalMatchStore @endlink they belong to.
70
+ *
71
+ * @see LocalMatchStore
72
+ *
73
+ *
74
+ * @var TId LocalMatch::queryId
75
+ * @brief The id of the query.
76
+ *
77
+ * @var TPosition LocalMatch::queryBeginPos
78
+ * @brief Begin position of local match in the query.
79
+ *
80
+ * @var TPosition LocalMatch::queryEndPos
81
+ * @brief End position of local match in the query.
82
+ *
83
+ * @var TPosition LocalMatch::subjectBeginPos
84
+ * @brief Begin position of local match in the subject.
85
+ *
86
+ * @var TPosition LocalMatch::subjectEndPos
87
+ * @brief End position of local match in the subject.
88
+ *
89
+ * @var TId LocalMatch::subjectId
90
+ * @brief The id of the subject.
91
+ */
92
+
93
+template <typename TId, typename TPosition>
94
+class LocalMatch
95
+{
96
+public:
97
+    TId id;
98
+    TId subjectId;
99
+    TPosition subjectBeginPos;
100
+    TPosition subjectEndPos;
101
+    TId queryId;
102
+    TPosition queryBeginPos;
103
+    TPosition queryEndPos;
104
+
105
+    LocalMatch() :
106
+            id(std::numeric_limits<TId>::max()),
107
+            subjectId(std::numeric_limits<TId>::max()),
108
+            subjectBeginPos(std::numeric_limits<TPosition>::max()),
109
+            subjectEndPos(std::numeric_limits<TPosition>::max()),
110
+            queryId(std::numeric_limits<TId>::max()),
111
+            queryBeginPos(std::numeric_limits<TPosition>::max()),
112
+            queryEndPos(std::numeric_limits<TPosition>::max())
113
+    {}
114
+
115
+    LocalMatch(TId id_,
116
+               TId subjectId_,
117
+               TPosition subjectBeginPos_,
118
+               TPosition subjectEndPos_,
119
+               TId queryId_,
120
+               TPosition queryBeginPos_,
121
+               TPosition queryEndPos_) :
122
+            id(id_),
123
+            subjectId(subjectId_),
124
+            subjectBeginPos(subjectBeginPos_),
125
+            subjectEndPos(subjectEndPos_),
126
+            queryId(queryId_),
127
+            queryBeginPos(queryBeginPos_),
128
+            queryEndPos(queryEndPos_)
129
+    {}
130
+
131
+    bool operator==(LocalMatch const & other) const
132
+    {
133
+        return id == other.id && subjectId == other.subjectId && subjectBeginPos == other.subjectBeginPos &&
134
+                subjectEndPos == other.subjectEndPos && queryId == other.queryId &&
135
+                queryBeginPos == other.queryBeginPos && queryEndPos == other.queryEndPos;
136
+    }
137
+};
138
+
139
+/*!
140
+ * @class LocalMatchStoreConfig
141
+ * @headerfile <seqan/parse_lm.h>
142
+ * @brief Stores information about local matches.
143
+ *
144
+ * @signature template <typename TSpec>
145
+ *            struct LocalMatchStoreConfig;
146
+ *
147
+ * @tparam TSpec Specializing type.
148
+ *
149
+ *
150
+ * @typedef LocalMatchStoreConfig::TId;
151
+ * @brief Type to use for ids.
152
+ * @signature typedef unsigned LocalMatchStoreConfig::TId;
153
+ *
154
+ * @typedef LocalMatchStoreConfig::TPosition;
155
+ * @brief The type to use for positions.
156
+ * @signature typedef (...) LocalMatchStoreConfig::TPosition;
157
+ */
158
+
159
+template <typename TSpec>
160
+struct LocalMatchStoreConfig
161
+{
162
+    typedef unsigned TId;
163
+    typedef typename Position<Dna5String>::Type TPosition;
164
+};
165
+
166
+/*!
167
+ * @class LocalMatchStore
168
+ * @headerfile <seqan/parse_lm.h>
169
+ * @brief Stores information about local matches.
170
+ *
171
+ * @signature template <typename TSpec = void, TConfig = LocalMatchStoreConfig<TSpec> >
172
+ *            struct LocalMatchStore;
173
+ *
174
+ * @tparam TSpec   Specialization tag.
175
+ * @tparam TConfig Configuration class.
176
+ *
177
+ * The LocalMatchStore provides information about matches.  Similar to the @link FragmentStore @endlink, the
178
+ * information is split into multiple sub stores.  Each sub store stores a part of the information.
179
+ *
180
+ * The LocalMatchStore#matchStore stores the information about a match.  The LocalMatchStore#sequenceNameStore stores
181
+ * the sequence names.  These both sub stores are "core stores", they are filled with consistent information, i.e. for
182
+ * each sequence id in the matchStore, there has to be a valid entry in the sequenceNameStore.
183
+ *
184
+ * The LocalMatchStore#cigarStore optionally stores CIGAR strings for the matches.  Its entries are referenced by
185
+ * <tt>matchStore[i].id</tt>.
186
+ *
187
+ * @section Examples
188
+ *
189
+ * Read Lastz matches from a link RecordReader and then print them to stdout.
190
+ *
191
+ * @code{.cpp}
192
+ * // Read local alignments from record reader.  Note that in real-world code,
193
+ * // you should have error handling.
194
+ * LocalMatchStore<> lmStore;
195
+ * while (!atEnd(recordReader))
196
+ *     readRecord(lmStore, recordReader, StellarGff());
197
+ *
198
+ * // Print local alignment information to stdout.
199
+ * std::cout << "# Reverse strandness is indicated by end < begin\n"
200
+ *           << "#db\tdb_beg\tdb_end\tquery\tq_beg\tq_end\n";
201
+ * for (unsigned i = 0; i < length(lmStore.matchStore); ++i)
202
+ *     std::cout << lmStore.sequenceNameStore[lmStore.matchStore[i].queryId] << "\t"
203
+ *               << lmStore.matchStore[i].queryBeginPos << "\t"
204
+ *               << lmStore.matchStore[i].queryEndPos << "\t"
205
+ *               << lmStore.sequenceNameStore[lmStore.matchStore[i].subjectId] << "\t"
206
+ *               << lmStore.matchStore[i].subjectBeginPos << "\t"
207
+ *               << lmStore.matchStore[i].subjectEndPos << "\n";
208
+ * @endcode
209
+ * @see LocalMatch
210
+ *
211
+ * @var TMatchStore LocalMatchStore::matchStore
212
+ * @brief String storing the LocalMatch local matches.
213
+ *
214
+ * @var TStringSet LocalMatchStore::sequenceNameStore
215
+ * @brief StringSet storing the sequence names.
216
+ *
217
+ * @var TCigarString LocalMatchStore::cigarStore
218
+ * @brief String storing the CIGAR strings.
219
+ */
220
+
221
+/*!
222
+ * @fn LocalMatchStore#readRecord
223
+ *
224
+ * @headerfile <seqan/parse_lm.h>
225
+ *
226
+ * @brief Read Lastz "general" format record.
227
+ *
228
+ * @signature int readRecord(store, reader, tag);
229
+ *
230
+ * @param[in,out] store  LocalMatchStore object to read into.
231
+ * @param[in,out] stream SinglePassRecordReader to read from.
232
+ * @param[in]     tag    The tag for selecting the format, one of BlastnTabular, LastzGeneral, and StellarGff.
233
+ *
234
+ * @return int 0 on success, non-0 on errors and EOF
235
+ */
236
+
237
+template <typename TSpec=void, typename TConfig=LocalMatchStoreConfig<TSpec> >
238
+class LocalMatchStore
239
+{
240
+public:
241
+    // ----------------------------------------------------------------------------
242
+    // Typedefs
243
+    // ----------------------------------------------------------------------------
244
+
245
+    typedef typename TConfig::TId TId;
246
+    typedef typename TConfig::TPosition TPosition;
247
+
248
+    typedef LocalMatch<TId, TPosition> TLocalMatch;
249
+    typedef String<TLocalMatch> TMatchStore;
250
+
251
+    typedef String<CigarElement<> > TCigarString;
252
+    typedef String<TCigarString> TCigarStore;
253
+
254
+    typedef StringSet<CharString> TNameStore;
255
+    typedef NameStoreCache<TNameStore> TNameStoreCache_;
256
+
257
+    // ----------------------------------------------------------------------------
258
+    // Member Variables
259
+    // ----------------------------------------------------------------------------
260
+
261
+    TNameStore       sequenceNameStore;
262
+    TNameStoreCache_ _sequenceNameStoreCache;
263
+
264
+    TMatchStore       matchStore;
265
+    TCigarStore       cigarStore;
266
+
267
+    LocalMatchStore() :
268
+            _sequenceNameStoreCache(sequenceNameStore)
269
+    {}
270
+};
271
+
272
+// ============================================================================
273
+// Metafunctions
274
+// ============================================================================
275
+
276
+// ============================================================================
277
+// Functions
278
+// ============================================================================
279
+
280
+// ----------------------------------------------------------------------------
281
+// Function registerSequenceName
282
+// ----------------------------------------------------------------------------
283
+
284
+template <typename TLocalMatchStore>
285
+inline void
286
+registerSequenceName(TLocalMatchStore & store,
287
+                     CharString const & sequenceName)
288
+{
289
+    unsigned id = 0;
290
+    if (!getIdByName(store.sequenceNameStore, sequenceName, id, store._sequenceNameStoreCache))
291
+    {
292
+        id = length(store.sequenceNameStore);
293
+        appendName(store.sequenceNameStore, sequenceName, store._sequenceNameStoreCache);
294
+    }
295
+}
296
+
297
+// ----------------------------------------------------------------------------
298
+// Function appendLocalMatch
299
+// ----------------------------------------------------------------------------
300
+
301
+/*!
302
+ * @fn LocalMatchStore#appendLocalMatch
303
+ * @brief Append a new local match to a @link LocalMatchStore @endlink
304
+ *
305
+ * @signature void appendLocalMatchStore(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos);
306
+ * @signature void appendLocalMatchStore(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos, cigarStringBuffer);
307
+ *
308
+ * @param[in,out] store             The LocalMatchStore to add the local match to.
309
+ * @param[in]     subjectId         Numeric subject sequence identifier, @link IntegerConcept @endlink.
310
+ * @param[in]     subjectName       The textual name of the query sequence, @link CharString @endlink.
311
+ * @param[in]     subjectBegin      The begin position of the match in the subject, @link IntegerConcept @endlink.
312
+ * @param[in]     subjectEnd        The end position of the match in the subject, @link IntegerConcept @endlink.
313
+ * @param[in]     queryId           Numeric query sequence identifier, @link IntegerConcept @endlink.
314
+ * @param[in]     queryName         The textual name of the query sequence, @link CharString @endlink.
315
+ * @param[in]     queryBegin        The begin position of the match in the query, @link IntegerConcept @endlink.
316
+ * @param[in]     queryEnd          The end position of the match in the query, @link IntegerConcept @endlink.
317
+ * @param[in]     cigarStringBuffer Buffer with the cigar string of the local alignment,  @link CharString @endlink.
318
+ *
319
+ * Matches on the reverse-complement are encoded by the begin position being greater than the begin position.
320
+ */
321
+
322
+template <typename TLocalMatchStore, typename TPosition>
323
+inline void
324
+appendLocalMatch(TLocalMatchStore & store,
325
+                 unsigned const & subjectId,
326
+                 TPosition const subjectBeginPos,
327
+                 TPosition const subjectEndPos,
328
+                 unsigned const & queryId,
329
+                 TPosition const queryBeginPos,
330
+                 TPosition const queryEndPos)
331
+{
332
+    typedef typename TLocalMatchStore::TLocalMatch TLocalMatch;
333
+
334
+    SEQAN_ASSERT_LT(subjectId, length(store.sequenceNameStore));
335
+    SEQAN_ASSERT_LT(queryId, length(store.sequenceNameStore));
336
+
337
+    TLocalMatch localMatch(length(store.matchStore),
338
+                           subjectId,
339
+                           subjectBeginPos,
340
+                           subjectEndPos,
341
+                           queryId,
342
+                           queryBeginPos,
343
+                           queryEndPos);
344
+    appendValue(store.matchStore, localMatch);
345
+}
346
+
347
+template <typename TLocalMatchStore, typename TPosition>
348
+inline void
349
+appendLocalMatch(TLocalMatchStore & store,
350
+                 CharString const & subjectName,
351
+                 TPosition const subjectBeginPos,
352
+                 TPosition const subjectEndPos,
353
+                 CharString const & queryName,
354
+                 TPosition const queryBeginPos,
355
+                 TPosition const queryEndPos)
356
+{
357
+    typedef typename TLocalMatchStore::TId         TId;
358
+
359
+    // Get id for subject and query sequences;  Insert sequences into name stores/caches if not already there.
360
+    TId subjectId = 0;
361
+    if (!getIdByName(subjectId, store.sequenceNameStore, subjectName))
362
+    {
363
+        subjectId = length(store.sequenceNameStore);
364
+        appendName(store._sequenceNameStoreCache, subjectName);
365
+    }
366
+    TId queryId = 0;
367
+    if (!getIdByName(queryId, store.sequenceNameStore, queryName))
368
+    {
369
+        queryId = length(store.sequenceNameStore);
370
+        appendName(store._sequenceNameStoreCache, queryName);
371
+    }
372
+
373
+    appendLocalMatch(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos);
374
+}
375
+
376
+template <typename TLocalMatchStore, typename TPosition>
377
+inline void
378
+appendLocalMatch(TLocalMatchStore & store,
379
+                 CharString const & subjectName,
380
+                 TPosition const subjectBeginPos,
381
+                 TPosition const subjectEndPos,
382
+                 CharString const & queryName,
383
+                 TPosition const queryBeginPos,
384
+                 TPosition const queryEndPos,
385
+                 CharString const & cigarStringBuffer)
386
+{
387
+    // Append local match.
388
+    appendLocalMatch(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos);
389
+    // Make space for CIGAR string.
390
+    resize(store.cigarStore, back(store.matchStore).id + 1);
391
+    // TODO(holtgrew): Something can go wrong when parsing CIGAR string, need return value?
392
+    // Parse out cigar string.
393
+    /*
394
+    typedef Stream<CharArray<char const *> > TCharArrayStream;
395
+    TCharArrayStream cigarStream(&cigarStringBuffer[0], &cigarStringBuffer[0] + length(cigarStringBuffer));
396
+    RecordReader<TCharArrayStream, SinglePass<> > recordReader(cigarStream);
397
+    */
398
+    DirectionIterator<CharString const, Input>::Type iter = begin(cigarStringBuffer);
399
+    CharString numBuf;
400
+    while (!atEnd(iter))
401
+    {
402
+        // Get number string into buffer.
403
+        clear(numBuf);
404
+        readUntil(numBuf, iter, NotFunctor<IsDigit>());
405
+        unsigned num = 0;
406
+        lexicalCast<unsigned>(num, numBuf);
407
+
408
+        // Read operation char and advance record reader.
409
+        char op;
410
+        readOne(op, iter);
411
+        // Append CIGAR element to CIGAR string.
412
+        appendValue(back(store.cigarStore), CigarElement<>(op, num));
413
+    }
414
+}
415
+
416
+}  // namespace seqan
417
+
418
+#endif  // INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_