inst/include/seqan/parse_lm/local_match_store.h
d92ab6f0
 // ==========================================================================
 //                 SeqAn - The Library for Sequence Analysis
 // ==========================================================================
 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are met:
 //
 //     * Redistributions of source code must retain the above copyright
 //       notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above copyright
 //       notice, this list of conditions and the following disclaimer in the
 //       documentation and/or other materials provided with the distribution.
 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
 //       its contributors may be used to endorse or promote products derived
 //       from this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 // DAMAGE.
 //
 // ==========================================================================
 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
 // ==========================================================================
 
 // TODO(holtgrew): Allow the storage of additional fields.
 /* This could look as follows:
 
    The values of the additional fields are stored as strings.  Either we store name/value pairs or, more efficiently,
    store a stringset for each match, with the same number of entries.  An additional String/Map maps from field name to
    index in this string sets.
  */
 
 #ifndef INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
 #define INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_
 
 namespace seqan {
 
 // ============================================================================
 // Forwards
 // ============================================================================
 
 // ============================================================================
 // Tags, Classes, Enums
 // ============================================================================
 
 /*!
  * @class LocalMatch
  * @headerfile <seqan/parse_lm.h>
  * @brief Stores information about local matches.
  *
  * @signature template <typename TId, typename TPosition>
  *            class LocalMatch;
  *
  * @tparam TId       Type to use for subject/query id.
  * @tparam TPosition Type to use for storing positions.
  *
  * Matches on the reverse-complement are encoded by the begin position being greater than the end position.
  *
  * Sequence names are not stored in LocalMatch objects but in the @link LocalMatchStore @endlink they belong to.
  *
  * @see LocalMatchStore
  *
  *
  * @var TId LocalMatch::queryId
  * @brief The id of the query.
  *
  * @var TPosition LocalMatch::queryBeginPos
  * @brief Begin position of local match in the query.
  *
  * @var TPosition LocalMatch::queryEndPos
  * @brief End position of local match in the query.
  *
  * @var TPosition LocalMatch::subjectBeginPos
  * @brief Begin position of local match in the subject.
  *
  * @var TPosition LocalMatch::subjectEndPos
  * @brief End position of local match in the subject.
  *
  * @var TId LocalMatch::subjectId
  * @brief The id of the subject.
  */
 
 template <typename TId, typename TPosition>
 class LocalMatch
 {
 public:
     TId id;
     TId subjectId;
     TPosition subjectBeginPos;
     TPosition subjectEndPos;
     TId queryId;
     TPosition queryBeginPos;
     TPosition queryEndPos;
 
     LocalMatch() :
             id(std::numeric_limits<TId>::max()),
             subjectId(std::numeric_limits<TId>::max()),
             subjectBeginPos(std::numeric_limits<TPosition>::max()),
             subjectEndPos(std::numeric_limits<TPosition>::max()),
             queryId(std::numeric_limits<TId>::max()),
             queryBeginPos(std::numeric_limits<TPosition>::max()),
             queryEndPos(std::numeric_limits<TPosition>::max())
     {}
 
     LocalMatch(TId id_,
                TId subjectId_,
                TPosition subjectBeginPos_,
                TPosition subjectEndPos_,
                TId queryId_,
                TPosition queryBeginPos_,
                TPosition queryEndPos_) :
             id(id_),
             subjectId(subjectId_),
             subjectBeginPos(subjectBeginPos_),
             subjectEndPos(subjectEndPos_),
             queryId(queryId_),
             queryBeginPos(queryBeginPos_),
             queryEndPos(queryEndPos_)
     {}
 
     bool operator==(LocalMatch const & other) const
     {
         return id == other.id && subjectId == other.subjectId && subjectBeginPos == other.subjectBeginPos &&
                 subjectEndPos == other.subjectEndPos && queryId == other.queryId &&
                 queryBeginPos == other.queryBeginPos && queryEndPos == other.queryEndPos;
     }
 };
 
 /*!
  * @class LocalMatchStoreConfig
  * @headerfile <seqan/parse_lm.h>
  * @brief Stores information about local matches.
  *
  * @signature template <typename TSpec>
  *            struct LocalMatchStoreConfig;
  *
  * @tparam TSpec Specializing type.
  *
  *
  * @typedef LocalMatchStoreConfig::TId;
  * @brief Type to use for ids.
  * @signature typedef unsigned LocalMatchStoreConfig::TId;
  *
  * @typedef LocalMatchStoreConfig::TPosition;
  * @brief The type to use for positions.
  * @signature typedef (...) LocalMatchStoreConfig::TPosition;
  */
 
 template <typename TSpec>
 struct LocalMatchStoreConfig
 {
     typedef unsigned TId;
     typedef typename Position<Dna5String>::Type TPosition;
 };
 
 /*!
  * @class LocalMatchStore
  * @headerfile <seqan/parse_lm.h>
  * @brief Stores information about local matches.
  *
  * @signature template <typename TSpec = void, TConfig = LocalMatchStoreConfig<TSpec> >
  *            struct LocalMatchStore;
  *
  * @tparam TSpec   Specialization tag.
  * @tparam TConfig Configuration class.
  *
  * The LocalMatchStore provides information about matches.  Similar to the @link FragmentStore @endlink, the
  * information is split into multiple sub stores.  Each sub store stores a part of the information.
  *
  * The LocalMatchStore#matchStore stores the information about a match.  The LocalMatchStore#sequenceNameStore stores
  * the sequence names.  These both sub stores are "core stores", they are filled with consistent information, i.e. for
  * each sequence id in the matchStore, there has to be a valid entry in the sequenceNameStore.
  *
  * The LocalMatchStore#cigarStore optionally stores CIGAR strings for the matches.  Its entries are referenced by
  * <tt>matchStore[i].id</tt>.
  *
  * @section Examples
  *
  * Read Lastz matches from a link RecordReader and then print them to stdout.
  *
  * @code{.cpp}
  * // Read local alignments from record reader.  Note that in real-world code,
  * // you should have error handling.
  * LocalMatchStore<> lmStore;
  * while (!atEnd(recordReader))
  *     readRecord(lmStore, recordReader, StellarGff());
  *
  * // Print local alignment information to stdout.
  * std::cout << "# Reverse strandness is indicated by end < begin\n"
  *           << "#db\tdb_beg\tdb_end\tquery\tq_beg\tq_end\n";
  * for (unsigned i = 0; i < length(lmStore.matchStore); ++i)
  *     std::cout << lmStore.sequenceNameStore[lmStore.matchStore[i].queryId] << "\t"
  *               << lmStore.matchStore[i].queryBeginPos << "\t"
  *               << lmStore.matchStore[i].queryEndPos << "\t"
  *               << lmStore.sequenceNameStore[lmStore.matchStore[i].subjectId] << "\t"
  *               << lmStore.matchStore[i].subjectBeginPos << "\t"
  *               << lmStore.matchStore[i].subjectEndPos << "\n";
  * @endcode
  * @see LocalMatch
  *
  * @var TMatchStore LocalMatchStore::matchStore
  * @brief String storing the LocalMatch local matches.
  *
  * @var TStringSet LocalMatchStore::sequenceNameStore
  * @brief StringSet storing the sequence names.
  *
  * @var TCigarString LocalMatchStore::cigarStore
  * @brief String storing the CIGAR strings.
  */
 
 /*!
  * @fn LocalMatchStore#readRecord
  *
  * @headerfile <seqan/parse_lm.h>
  *
  * @brief Read Lastz "general" format record.
  *
  * @signature int readRecord(store, reader, tag);
  *
  * @param[in,out] store  LocalMatchStore object to read into.
  * @param[in,out] stream SinglePassRecordReader to read from.
  * @param[in]     tag    The tag for selecting the format, one of BlastnTabular, LastzGeneral, and StellarGff.
  *
  * @return int 0 on success, non-0 on errors and EOF
  */
 
 template <typename TSpec=void, typename TConfig=LocalMatchStoreConfig<TSpec> >
 class LocalMatchStore
 {
 public:
     // ----------------------------------------------------------------------------
     // Typedefs
     // ----------------------------------------------------------------------------
 
     typedef typename TConfig::TId TId;
     typedef typename TConfig::TPosition TPosition;
 
     typedef LocalMatch<TId, TPosition> TLocalMatch;
     typedef String<TLocalMatch> TMatchStore;
 
     typedef String<CigarElement<> > TCigarString;
     typedef String<TCigarString> TCigarStore;
 
     typedef StringSet<CharString> TNameStore;
     typedef NameStoreCache<TNameStore> TNameStoreCache_;
 
     // ----------------------------------------------------------------------------
     // Member Variables
     // ----------------------------------------------------------------------------
 
     TNameStore       sequenceNameStore;
     TNameStoreCache_ _sequenceNameStoreCache;
 
     TMatchStore       matchStore;
     TCigarStore       cigarStore;
 
     LocalMatchStore() :
             _sequenceNameStoreCache(sequenceNameStore)
     {}
 };
 
 // ============================================================================
 // Metafunctions
 // ============================================================================
 
 // ============================================================================
 // Functions
 // ============================================================================
 
 // ----------------------------------------------------------------------------
 // Function registerSequenceName
 // ----------------------------------------------------------------------------
 
 template <typename TLocalMatchStore>
 inline void
 registerSequenceName(TLocalMatchStore & store,
                      CharString const & sequenceName)
 {
     unsigned id = 0;
     if (!getIdByName(store.sequenceNameStore, sequenceName, id, store._sequenceNameStoreCache))
     {
         id = length(store.sequenceNameStore);
         appendName(store.sequenceNameStore, sequenceName, store._sequenceNameStoreCache);
     }
 }
 
 // ----------------------------------------------------------------------------
 // Function appendLocalMatch
 // ----------------------------------------------------------------------------
 
 /*!
  * @fn LocalMatchStore#appendLocalMatch
  * @brief Append a new local match to a @link LocalMatchStore @endlink
  *
  * @signature void appendLocalMatchStore(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos);
  * @signature void appendLocalMatchStore(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos, cigarStringBuffer);
  *
  * @param[in,out] store             The LocalMatchStore to add the local match to.
  * @param[in]     subjectId         Numeric subject sequence identifier, @link IntegerConcept @endlink.
  * @param[in]     subjectName       The textual name of the query sequence, @link CharString @endlink.
  * @param[in]     subjectBegin      The begin position of the match in the subject, @link IntegerConcept @endlink.
  * @param[in]     subjectEnd        The end position of the match in the subject, @link IntegerConcept @endlink.
  * @param[in]     queryId           Numeric query sequence identifier, @link IntegerConcept @endlink.
  * @param[in]     queryName         The textual name of the query sequence, @link CharString @endlink.
  * @param[in]     queryBegin        The begin position of the match in the query, @link IntegerConcept @endlink.
  * @param[in]     queryEnd          The end position of the match in the query, @link IntegerConcept @endlink.
  * @param[in]     cigarStringBuffer Buffer with the cigar string of the local alignment,  @link CharString @endlink.
  *
  * Matches on the reverse-complement are encoded by the begin position being greater than the begin position.
  */
 
 template <typename TLocalMatchStore, typename TPosition>
 inline void
 appendLocalMatch(TLocalMatchStore & store,
                  unsigned const & subjectId,
                  TPosition const subjectBeginPos,
                  TPosition const subjectEndPos,
                  unsigned const & queryId,
                  TPosition const queryBeginPos,
                  TPosition const queryEndPos)
 {
     typedef typename TLocalMatchStore::TLocalMatch TLocalMatch;
 
     SEQAN_ASSERT_LT(subjectId, length(store.sequenceNameStore));
     SEQAN_ASSERT_LT(queryId, length(store.sequenceNameStore));
 
     TLocalMatch localMatch(length(store.matchStore),
                            subjectId,
                            subjectBeginPos,
                            subjectEndPos,
                            queryId,
                            queryBeginPos,
                            queryEndPos);
     appendValue(store.matchStore, localMatch);
 }
 
 template <typename TLocalMatchStore, typename TPosition>
 inline void
 appendLocalMatch(TLocalMatchStore & store,
                  CharString const & subjectName,
                  TPosition const subjectBeginPos,
                  TPosition const subjectEndPos,
                  CharString const & queryName,
                  TPosition const queryBeginPos,
                  TPosition const queryEndPos)
 {
     typedef typename TLocalMatchStore::TId         TId;
 
     // Get id for subject and query sequences;  Insert sequences into name stores/caches if not already there.
     TId subjectId = 0;
     if (!getIdByName(subjectId, store.sequenceNameStore, subjectName))
     {
         subjectId = length(store.sequenceNameStore);
         appendName(store._sequenceNameStoreCache, subjectName);
     }
     TId queryId = 0;
     if (!getIdByName(queryId, store.sequenceNameStore, queryName))
     {
         queryId = length(store.sequenceNameStore);
         appendName(store._sequenceNameStoreCache, queryName);
     }
 
     appendLocalMatch(store, subjectId, subjectBeginPos, subjectEndPos, queryId, queryBeginPos, queryEndPos);
 }
 
 template <typename TLocalMatchStore, typename TPosition>
 inline void
 appendLocalMatch(TLocalMatchStore & store,
                  CharString const & subjectName,
                  TPosition const subjectBeginPos,
                  TPosition const subjectEndPos,
                  CharString const & queryName,
                  TPosition const queryBeginPos,
                  TPosition const queryEndPos,
                  CharString const & cigarStringBuffer)
 {
     // Append local match.
     appendLocalMatch(store, subjectName, subjectBeginPos, subjectEndPos, queryName, queryBeginPos, queryEndPos);
     // Make space for CIGAR string.
     resize(store.cigarStore, back(store.matchStore).id + 1);
     // TODO(holtgrew): Something can go wrong when parsing CIGAR string, need return value?
     // Parse out cigar string.
     /*
     typedef Stream<CharArray<char const *> > TCharArrayStream;
     TCharArrayStream cigarStream(&cigarStringBuffer[0], &cigarStringBuffer[0] + length(cigarStringBuffer));
     RecordReader<TCharArrayStream, SinglePass<> > recordReader(cigarStream);
     */
     DirectionIterator<CharString const, Input>::Type iter = begin(cigarStringBuffer);
     CharString numBuf;
     while (!atEnd(iter))
     {
         // Get number string into buffer.
         clear(numBuf);
         readUntil(numBuf, iter, NotFunctor<IsDigit>());
         unsigned num = 0;
         lexicalCast<unsigned>(num, numBuf);
 
         // Read operation char and advance record reader.
         char op;
         readOne(op, iter);
         // Append CIGAR element to CIGAR string.
         appendValue(back(store.cigarStore), CigarElement<>(op, num));
     }
 }
 
 }  // namespace seqan
 
 #endif  // INCLUDE_SEQAN_PARSE_LM_LOCAL_MATCH_STORE_H_