1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,418 @@ |
1 |
+// ========================================================================== |
|
2 |
+// SeqAn - The Library for Sequence Analysis |
|
3 |
+// ========================================================================== |
|
4 |
+// Copyright (c) 2006-2018, Knut Reinert, FU Berlin |
|
5 |
+// All rights reserved. |
|
6 |
+// |
|
7 |
+// Redistribution and use in source and binary forms, with or without |
|
8 |
+// modification, are permitted provided that the following conditions are met: |
|
9 |
+// |
|
10 |
+// * Redistributions of source code must retain the above copyright |
|
11 |
+// notice, this list of conditions and the following disclaimer. |
|
12 |
+// * Redistributions in binary form must reproduce the above copyright |
|
13 |
+// notice, this list of conditions and the following disclaimer in the |
|
14 |
+// documentation and/or other materials provided with the distribution. |
|
15 |
+// * Neither the name of Knut Reinert or the FU Berlin nor the names of |
|
16 |
+// its contributors may be used to endorse or promote products derived |
|
17 |
+// from this software without specific prior written permission. |
|
18 |
+// |
|
19 |
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|
20 |
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
21 |
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
22 |
+// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE |
|
23 |
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
24 |
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
|
25 |
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
|
26 |
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
27 |
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
28 |
+// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
|
29 |
+// DAMAGE. |
|
30 |
+// |
|
31 |
+// ========================================================================== |
|
32 |
+// Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de> |
|
33 |
+// ========================================================================== |
|
34 |
+ |
|
35 |
+// TODO(holtgrew): Allow the storage of additional fields. |
|
36 |
+/* This could look as follows: |
|
37 |
+ |
|
38 |
+ The values of the additional fields are stored as strings. Either we store name/value pairs or, more efficiently, |
|
39 |
+ store a stringset for each match, with the same number of entries. An additional String/Map maps from field name to |
|
40 |
+ index in this string sets. |
|
41 |
+ */ |
|
42 |
+ |
|
43 |
+#ifndef INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_ |
|
44 |
+#define INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_ |
|
45 |
+ |
|
46 |
+namespace seqan { |
|
47 |
+ |
|
48 |
+// ============================================================================ |
|
49 |
+// Forwards |
|
50 |
+// ============================================================================ |
|
51 |
+ |
|
52 |
+// ============================================================================ |
|
53 |
+// Tags, Classes, Enums |
|
54 |
+// ============================================================================ |
|
55 |
+ |
|
56 |
+/*! |
|
57 |
+ * @class LocalMatch |
|
58 |
+ * @headerfile <seqan/parse_lm.h> |
|
59 |
+ * @brief Stores information about local matches. |
|
60 |
+ * |
|
61 |
+ * @signature template <typename TId, typename TPosition> |
|
62 |
+ * class LocalMatch; |
|
63 |
+ * |
|
64 |
+ * @tparam TId Type to use for subject/query id. |
|
65 |
+ * @tparam TPosition Type to use for storing positions. |
|
66 |
+ * |
|
67 |
+ * Matches on the reverse-complement are encoded by the begin position being greater than the end position. |
|
68 |
+ * |
|
69 |
+ * Sequence names are not stored in LocalMatch objects but in the @link LocalMatchStore @endlink they belong to. |
|
70 |
+ * |
|
71 |
+ * @see LocalMatchStore |
|
72 |
+ * |
|
73 |
+ * |
|
74 |
+ * @var TId LocalMatch::queryId |
|
75 |
+ * @brief The id of the query. |
|
76 |
+ * |
|
77 |
+ * @var TPosition LocalMatch::queryBeginPos |
|
78 |
+ * @brief Begin position of local match in the query. |
|
79 |
+ * |
|
80 |
+ * @var TPosition LocalMatch::queryEndPos |
|
81 |
+ * @brief End position of local match in the query. |
|
82 |
+ * |
|
83 |
+ * @var TPosition LocalMatch::subjectBeginPos |
|
84 |
+ * @brief Begin position of local match in the subject. |
|
85 |
+ * |
|
86 |
+ * @var TPosition LocalMatch::subjectEndPos |
|
87 |
+ * @brief End position of local match in the subject. |
|
88 |
+ * |
|
89 |
+ * @var TId LocalMatch::subjectId |
|
90 |
+ * @brief The id of the subject. |
|
91 |
+ */ |
|
92 |
+ |
|
93 |
+template <typename TId, typename TPosition> |
|
94 |
+class LocalMatch |
|
95 |
+{ |
|
96 |
+public: |
|
97 |
+ TId id; |
|
98 |
+ TId subjectId; |
|
99 |
+ TPosition subjectBeginPos; |
|
100 |
+ TPosition subjectEndPos; |
|
101 |
+ TId queryId; |
|
102 |
+ TPosition queryBeginPos; |
|
103 |
+ TPosition queryEndPos; |
|
104 |
+ |
|
105 |
+ LocalMatch() : |
|
106 |
+ id(std::numeric_limits<TId>::max()), |
|
107 |
+ subjectId(std::numeric_limits<TId>::max()), |
|
108 |
+ subjectBeginPos(std::numeric_limits<TPosition>::max()), |
|
109 |
+ subjectEndPos(std::numeric_limits<TPosition>::max()), |
|
110 |
+ queryId(std::numeric_limits<TId>::max()), |
|
111 |
+ queryBeginPos(std::numeric_limits<TPosition>::max()), |
|
112 |
+ queryEndPos(std::numeric_limits<TPosition>::max()) |
|
113 |
+ {} |
|
114 |
+ |
|
115 |
+ LocalMatch(TId id_, |
|
116 |
+ TId subjectId_, |
|
117 |
+ TPosition subjectBeginPos_, |
|
118 |
+ TPosition subjectEndPos_, |
|
119 |
+ TId queryId_, |
|
120 |
+ TPosition queryBeginPos_, |
|
121 |
+ TPosition queryEndPos_) : |
|
122 |
+ id(id_), |
|
123 |
+ subjectId(subjectId_), |
|
124 |
+ subjectBeginPos(subjectBeginPos_), |
|
125 |
+ subjectEndPos(subjectEndPos_), |
|
126 |
+ queryId(queryId_), |
|
127 |
+ queryBeginPos(queryBeginPos_), |
|
128 |
+ queryEndPos(queryEndPos_) |
|
129 |
+ {} |
|
130 |
+ |
|
131 |
+ bool operator==(LocalMatch const & other) const |
|
132 |
+ { |
|
133 |
+ return id == other.id && subjectId == other.subjectId && subjectBeginPos == other.subjectBeginPos && |
|
134 |
+ subjectEndPos == other.subjectEndPos && queryId == other.queryId && |
|
135 |
+ queryBeginPos == other.queryBeginPos && queryEndPos == other.queryEndPos; |
|
136 |
+ } |
|
137 |
+}; |
|
138 |
+ |
|
139 |
+/*! |
|
140 |
+ * @class LocalMatchStoreConfig |
|
141 |
+ * @headerfile <seqan/parse_lm.h> |
|
142 |
+ * @brief Stores information about local matches. |
|
143 |
+ * |
|
144 |
+ * @signature template <typename TSpec> |
|
145 |
+ * struct LocalMatchStoreConfig; |
|
146 |
+ * |
|
147 |
+ * @tparam TSpec Specializing type. |
|
148 |
+ * |
|
149 |
+ * |
|
150 |
+ * @typedef LocalMatchStoreConfig::TId; |
|
151 |
+ * @brief Type to use for ids. |
|
152 |
+ * @signature typedef unsigned LocalMatchStoreConfig::TId; |
|
153 |
+ * |
|
154 |
+ * @typedef LocalMatchStoreConfig::TPosition; |
|
155 |
+ * @brief The type to use for positions. |
|
156 |
+ * @signature typedef (...) LocalMatchStoreConfig::TPosition; |
|
157 |
+ */ |
|
158 |
+ |
|
159 |
+template <typename TSpec> |
|
160 |
+struct LocalMatchStoreConfig |
|
161 |
+{ |
|
162 |
+ typedef unsigned TId; |
|
163 |
+ typedef typename Position<Dna5String>::Type TPosition; |
|
164 |
+}; |
|
165 |
+ |
|
166 |
+/*! |
|
167 |
+ * @class LocalMatchStore |
|
168 |
+ * @headerfile <seqan/parse_lm.h> |
|
169 |
+ * @brief Stores information about local matches. |
|
170 |
+ * |
|
171 |
+ * @signature template <typename TSpec = void, TConfig = LocalMatchStoreConfig<TSpec> > |
|
172 |
+ * struct LocalMatchStore; |
|
173 |
+ * |
|
174 |
+ * @tparam TSpec Specialization tag. |
|
175 |
+ * @tparam TConfig Configuration class. |
|
176 |
+ * |
|
177 |
+ * The LocalMatchStore provides information about matches. Similar to the @link FragmentStore @endlink, the |
|
178 |
+ * information is split into multiple sub stores. Each sub store stores a part of the information. |
|
179 |
+ * |
|
180 |
+ * The LocalMatchStore#matchStore stores the information about a match. The LocalMatchStore#sequenceNameStore stores |
|
181 |
+ * the sequence names. These both sub stores are "core stores", they are filled with consistent information, i.e. for |
|
182 |
+ * each sequence id in the matchStore, there has to be a valid entry in the sequenceNameStore. |
|
183 |
+ * |
|
184 |
+ * The LocalMatchStore#cigarStore optionally stores CIGAR strings for the matches. Its entries are referenced by |
|
185 |
+ * <tt>matchStore[i].id</tt>. |
|
186 |
+ * |
|
187 |
+ * @section Examples |
|
188 |
+ * |
|
189 |
+ * Read Lastz matches from a link RecordReader and then print them to stdout. |
|
190 |
+ * |
|
191 |
+ * @code{.cpp} |
|
192 |
+ * // Read local alignments from record reader. Note that in real-world code, |
|
193 |
+ * // you should have error handling. |
|
194 |
+ * LocalMatchStore<> lmStore; |
|
195 |
+ * while (!atEnd(recordReader)) |
|
196 |
+ * readRecord(lmStore, recordReader, StellarGff()); |
|
197 |
+ * |
|
198 |
+ * // Print local alignment information to stdout. |
|
199 |
+ * std::cout << "# Reverse strandness is indicated by end < begin\n" |
|
200 |
+ * << "#db\tdb_beg\tdb_end\tquery\tq_beg\tq_end\n"; |
|
201 |
+ * for (unsigned i = 0; i < length(lmStore.matchStore); ++i) |
|
202 |
+ * std::cout << lmStore.sequenceNameStore[lmStore.matchStore[i].queryId] << "\t" |
|
203 |
+ * << lmStore.matchStore[i].queryBeginPos << "\t" |
|
204 |
+ * << lmStore.matchStore[i].queryEndPos << "\t" |
|
205 |
+ * << lmStore.sequenceNameStore[lmStore.matchStore[i].subjectId] << "\t" |
|
206 |
+ * << lmStore.matchStore[i].subjectBeginPos << "\t" |
|
207 |
+ * << lmStore.matchStore[i].subjectEndPos << "\n"; |
|
208 |
+ * @endcode |
|
209 |
+ * @see LocalMatch |
|
210 |
+ * |
|
211 |
+ * @var TMatchStore LocalMatchStore::matchStore |
|
212 |
+ * @brief String storing the LocalMatch local matches. |
|
213 |
+ * |
|
214 |
+ * @var TStringSet LocalMatchStore::sequenceNameStore |
|
215 |
+ * @brief StringSet storing the sequence names. |
|
216 |
+ * |
|
217 |
+ * @var TCigarString LocalMatchStore::cigarStore |
|
218 |
+ * @brief String storing the CIGAR strings. |
|
219 |
+ */ |
|
220 |
+ |
|
221 |
+/*! |
|
222 |
+ * @fn LocalMatchStore#readRecord |
|
223 |
+ * |
|
224 |
+ * @headerfile <seqan/parse_lm.h> |
|
225 |
+ * |
|
226 |
+ * @brief Read Lastz "general" format record. |
|
227 |
+ * |
|
228 |
+ * @signature int readRecord(store, reader, tag); |
|
229 |
+ * |
|
230 |
+ * @param[in,out] store LocalMatchStore object to read into. |
|
231 |
+ * @param[in,out] stream SinglePassRecordReader to read from. |
|
232 |
+ * @param[in] tag The tag for selecting the format, one of BlastnTabular, LastzGeneral, and StellarGff. |
|
233 |
+ * |
|
234 |
+ * @return int 0 on success, non-0 on errors and EOF |
|
235 |
+ */ |
|
236 |
+ |
|
237 |
+template <typename TSpec=void, typename TConfig=LocalMatchStoreConfig<TSpec> > |
|
238 |
+class LocalMatchStore |
|
239 |
+{ |
|
240 |
+public: |
|
241 |
+ // ---------------------------------------------------------------------------- |
|
242 |
+ // Typedefs |
|
243 |
+ // ---------------------------------------------------------------------------- |
|
244 |
+ |
|
245 |
+ typedef typename TConfig::TId TId; |
|
246 |
+ typedef typename TConfig::TPosition TPosition; |
|
247 |
+ |
|
248 |
+ typedef LocalMatch<TId, TPosition> TLocalMatch; |
|
249 |
+ typedef String<TLocalMatch> TMatchStore; |
|
250 |
+ |
|
251 |
+ typedef String<CigarElement<> > TCigarString; |
|
252 |
+ typedef String<TCigarString> TCigarStore; |
|
253 |
+ |
|
254 |
+ typedef StringSet<CharString> TNameStore; |
|
255 |
+ typedef NameStoreCache<TNameStore> TNameStoreCache_; |
|
256 |
+ |
|
257 |
+ // ---------------------------------------------------------------------------- |
|
258 |
+ // Member Variables |
|
259 |
+ // ---------------------------------------------------------------------------- |
|
260 |
+ |
|
261 |
+ TNameStore sequenceNameStore; |
|
262 |
+ TNameStoreCache_ _sequenceNameStoreCache; |
|
263 |
+ |
|
264 |
+ TMatchStore matchStore; |
|
265 |
+ TCigarStore cigarStore; |
|
266 |
+ |
|
267 |
+ LocalMatchStore() : |
|
268 |
+ _sequenceNameStoreCache(sequenceNameStore) |
|
269 |
+ {} |
|
270 |
+}; |
|
271 |
+ |
|
272 |
+// ============================================================================ |
|
273 |
+// Metafunctions |
|
274 |
+// ============================================================================ |
|
275 |
+ |
|
276 |
+// ============================================================================ |
|
277 |
+// Functions |
|
278 |
+// ============================================================================ |
|
279 |
+ |
|
280 |
+// ---------------------------------------------------------------------------- |
|
281 |
+// Function registerSequenceName |
|
282 |
+// ---------------------------------------------------------------------------- |
|
283 |
+ |
|
284 |
+template <typename TLocalMatchStore> |
|
285 |
+inline void |
|
286 |
+registerSequenceName(TLocalMatchStore & store, |
|
287 |
+ CharString const & sequenceName) |
|
288 |
+{ |
|
289 |
+ unsigned id = 0; |
|
290 |
+ if (!getIdByName(store.sequenceNameStore, sequenceName, id, store._sequenceNameStoreCache)) |
|
291 |
+ { |
|
292 |
+ id = length(store.sequenceNameStore); |
|
293 |
+ appendName(store.sequenceNameStore, sequenceName, store._sequenceNameStoreCache); |
|
294 |
+ } |
|
295 |
+} |
|
296 |
+ |
|
297 |
+// ---------------------------------------------------------------------------- |
|
298 |
+// Function appendLocalMatch |
|
299 |
+// ---------------------------------------------------------------------------- |
|
300 |
+ |
|
301 |
+/*! |
|
302 |
+ * @fn LocalMatchStore#appendLocalMatch |
|
303 |
+ * @brief Append a new local match to a @link LocalMatchStore @endlink |
|
304 |
+ * |
|
305 |
+ * @signature void appendLocalMatchStore(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos); |
|
306 |
+ * @signature void appendLocalMatchStore(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos, cigarStringBuffer); |
|
307 |
+ * |
|
308 |
+ * @param[in,out] store The LocalMatchStore to add the local match to. |
|
309 |
+ * @param[in] subjectId Numeric subject sequence identifier, @link IntegerConcept @endlink. |
|
310 |
+ * @param[in] subjectName The textual name of the query sequence, @link CharString @endlink. |
|
311 |
+ * @param[in] subjectBegin The begin position of the match in the subject, @link IntegerConcept @endlink. |
|
312 |
+ * @param[in] subjectEnd The end position of the match in the subject, @link IntegerConcept @endlink. |
|
313 |
+ * @param[in] queryId Numeric query sequence identifier, @link IntegerConcept @endlink. |
|
314 |
+ * @param[in] queryName The textual name of the query sequence, @link CharString @endlink. |
|
315 |
+ * @param[in] queryBegin The begin position of the match in the query, @link IntegerConcept @endlink. |
|
316 |
+ * @param[in] queryEnd The end position of the match in the query, @link IntegerConcept @endlink. |
|
317 |
+ * @param[in] cigarStringBuffer Buffer with the cigar string of the local alignment, @link CharString @endlink. |
|
318 |
+ * |
|
319 |
+ * Matches on the reverse-complement are encoded by the begin position being greater than the begin position. |
|
320 |
+ */ |
|
321 |
+ |
|
322 |
+template <typename TLocalMatchStore, typename TPosition> |
|
323 |
+inline void |
|
324 |
+appendLocalMatch(TLocalMatchStore & store, |
|
325 |
+ unsigned const & subjectId, |
|
326 |
+ TPosition const subjectBeginPos, |
|
327 |
+ TPosition const subjectEndPos, |
|
328 |
+ unsigned const & queryId, |
|
329 |
+ TPosition const queryBeginPos, |
|
330 |
+ TPosition const queryEndPos) |
|
331 |
+{ |
|
332 |
+ typedef typename TLocalMatchStore::TLocalMatch TLocalMatch; |
|
333 |
+ |
|
334 |
+ SEQAN_ASSERT_LT(subjectId, length(store.sequenceNameStore)); |
|
335 |
+ SEQAN_ASSERT_LT(queryId, length(store.sequenceNameStore)); |
|
336 |
+ |
|
337 |
+ TLocalMatch localMatch(length(store.matchStore), |
|
338 |
+ subjectId, |
|
339 |
+ subjectBeginPos, |
|
340 |
+ subjectEndPos, |
|
341 |
+ queryId, |
|
342 |
+ queryBeginPos, |
|
343 |
+ queryEndPos); |
|
344 |
+ appendValue(store.matchStore, localMatch); |
|
345 |
+} |
|
346 |
+ |
|
347 |
+template <typename TLocalMatchStore, typename TPosition> |
|
348 |
+inline void |
|
349 |
+appendLocalMatch(TLocalMatchStore & store, |
|
350 |
+ CharString const & subjectName, |
|
351 |
+ TPosition const subjectBeginPos, |
|
352 |
+ TPosition const subjectEndPos, |
|
353 |
+ CharString const & queryName, |
|
354 |
+ TPosition const queryBeginPos, |
|
355 |
+ TPosition const queryEndPos) |
|
356 |
+{ |
|
357 |
+ typedef typename TLocalMatchStore::TId TId; |
|
358 |
+ |
|
359 |
+ // Get id for subject and query sequences; Insert sequences into name stores/caches if not already there. |
|
360 |
+ TId subjectId = 0; |
|
361 |
+ if (!getIdByName(subjectId, store.sequenceNameStore, subjectName)) |
|
362 |
+ { |
|
363 |
+ subjectId = length(store.sequenceNameStore); |
|
364 |
+ appendName(store._sequenceNameStoreCache, subjectName); |
|
365 |
+ } |
|
366 |
+ TId queryId = 0; |
|
367 |
+ if (!getIdByName(queryId, store.sequenceNameStore, queryName)) |
|
368 |
+ { |
|
369 |
+ queryId = length(store.sequenceNameStore); |
|
370 |
+ appendName(store._sequenceNameStoreCache, queryName); |
|
371 |
+ } |
|
372 |
+ |
|
373 |
+ appendLocalMatch(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos); |
|
374 |
+} |
|
375 |
+ |
|
376 |
+template <typename TLocalMatchStore, typename TPosition> |
|
377 |
+inline void |
|
378 |
+appendLocalMatch(TLocalMatchStore & store, |
|
379 |
+ CharString const & subjectName, |
|
380 |
+ TPosition const subjectBeginPos, |
|
381 |
+ TPosition const subjectEndPos, |
|
382 |
+ CharString const & queryName, |
|
383 |
+ TPosition const queryBeginPos, |
|
384 |
+ TPosition const queryEndPos, |
|
385 |
+ CharString const & cigarStringBuffer) |
|
386 |
+{ |
|
387 |
+ // Append local match. |
|
388 |
+ appendLocalMatch(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos); |
|
389 |
+ // Make space for CIGAR string. |
|
390 |
+ resize(store.cigarStore, back(store.matchStore).id + 1); |
|
391 |
+ // TODO(holtgrew): Something can go wrong when parsing CIGAR string, need return value? |
|
392 |
+ // Parse out cigar string. |
|
393 |
+ /* |
|
394 |
+ typedef Stream<CharArray<char const *> > TCharArrayStream; |
|
395 |
+ TCharArrayStream cigarStream(&cigarStringBuffer[0], &cigarStringBuffer[0] + length(cigarStringBuffer)); |
|
396 |
+ RecordReader<TCharArrayStream, SinglePass<> > recordReader(cigarStream); |
|
397 |
+ */ |
|
398 |
+ DirectionIterator<CharString const, Input>::Type iter = begin(cigarStringBuffer); |
|
399 |
+ CharString numBuf; |
|
400 |
+ while (!atEnd(iter)) |
|
401 |
+ { |
|
402 |
+ // Get number string into buffer. |
|
403 |
+ clear(numBuf); |
|
404 |
+ readUntil(numBuf, iter, NotFunctor<IsDigit>()); |
|
405 |
+ unsigned num = 0; |
|
406 |
+ lexicalCast<unsigned>(num, numBuf); |
|
407 |
+ |
|
408 |
+ // Read operation char and advance record reader. |
|
409 |
+ char op; |
|
410 |
+ readOne(op, iter); |
|
411 |
+ // Append CIGAR element to CIGAR string. |
|
412 |
+ appendValue(back(store.cigarStore), CigarElement<>(op, num)); |
|
413 |
+ } |
|
414 |
+} |
|
415 |
+ |
|
416 |
+} // namespace seqan |
|
417 |
+ |
|
418 |
+#endif // INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_ |