// ========================================================================== // SeqAn - The Library for Sequence Analysis // ========================================================================== // Copyright (c) 2006-2018, Knut Reinert, FU Berlin // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of Knut Reinert or the FU Berlin nor the names of // its contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH // DAMAGE. // // ========================================================================== // Author: Stephan Aiche <stephan.aiche@fu-berlin.de> // ========================================================================== #ifndef SEQAN_INCLUDE_SEQAN_ALIGN_GLOBAL_ALIGNMENT_MYERS_HIRSCHBERG_IMPL_H_ #define SEQAN_INCLUDE_SEQAN_ALIGN_GLOBAL_ALIGNMENT_MYERS_HIRSCHBERG_IMPL_H_ #include <bitset> namespace seqan { // ============================================================================ // Forwards // ============================================================================ // ============================================================================ // Tags, Classes, Enums // ============================================================================ // ============================================================================ // Metafunctions // ============================================================================ // ============================================================================ // Functions // ============================================================================ // ---------------------------------------------------------------------------- // Function _writeDebugMatrix() // ---------------------------------------------------------------------------- #ifdef MYERS_HIRSCHBERG_VERBOSE template<typename TSource> void _writeDebugMatrix(TSource s1,TSource s2) { //IOREV _notio_ not relevant for iorev int l1 = length(s1); int l2 = length(s2); int i,j,sg,sd; String<String<int> > fMatrix,rMatrix,tMatrix; resize(fMatrix,l1 + 1); resize(rMatrix,l1 + 1); resize(tMatrix,l1 + 1); for(i = 0;i <= l1;++i) { resize(fMatrix[i],l2 + 1); resize(rMatrix[i],l2 + 1); resize(tMatrix[i],l2 + 1); } for(i = 0;i <= l1;++i) fMatrix[i][0] = i * (-1); for(i = l1;i >= 0;--i) rMatrix[i][l2] = (l1 - i) * (-1); // calculate forward matrix for(j = 1;j <= l2;++j) { fMatrix[0][j] = j*(-1); for(i = 1;i <= l1;++i) { sg = -1 + ((fMatrix[i-1][j] > fMatrix[i][j-1]) ? fMatrix[i-1][j] : fMatrix[i][j-1]); sd = fMatrix[i-1][j-1] + ((s1[i - 1] == s2[j-1]) ? 0 : -1 ); fMatrix[i][j] = ((sg > sd) ? sg : sd); } } // calculate reverse matrix for(j = l2 - 1;j >= 0;--j) { rMatrix[l1][j] = (l2 - j)*(-1); for(i = l1 - 1;i >= 0;--i) { sg = -1 + ((rMatrix[i+1][j] > rMatrix[i][j+1]) ? rMatrix[i+1][j] : rMatrix[i][j+1]); sd = rMatrix[i+1][j+1] + ((s1[i] == s2[j]) ? 0 : -1 ); rMatrix[i][j] = ((sg > sd) ? sg : sd); } } // print fMatrix std::cout << ";-;"; for(i = 0;i < l1;++i) std::cout << s1[i] << ";"; std::cout << std::endl << "-;"; for(j = 0;j <= l2;++j) { if(j != 0) std::cout << s2[j-1] << ";"; for(i = 0;i <= l1;++i) { std::cout << fMatrix[i][j] << ";"; } std::cout << std::endl; } // print rMatrix std::cout << ";"; for(i = 0;i < l1;++i) std::cout << s1[i] << ";"; std::cout << "-;" << std::endl; for(j = 0;j <= l2;++j) { if(j != l2) std::cout << s2[j] << ";"; else std::cout << "-;"; for(i = 0;i <= l1;++i) { std::cout << rMatrix[i][j] << ";"; } std::cout << std::endl; } // fill and print target matrix std::cout << ";-;"; for(i = 0;i < l1;++i) std::cout << s1[i] << ";"; std::cout << std::endl << "-;"; for(j = 0;j <= l2;++j) { if(j != 0) std::cout << s2[j-1] << ";"; for(i = 0;i <= l1;++i) { tMatrix[i][j] = fMatrix[i][j] + rMatrix[i][j]; std::cout << tMatrix[i][j] << ";"; } std::cout << std::endl; } } #endif // ---------------------------------------------------------------------------- // Function _printBinary() // ---------------------------------------------------------------------------- // Debug integer types in binary format. template <typename TStream, typename T> inline void _printBinary(TStream & stream, T const n) { std::bitset<BitsPerValue<T>::VALUE> bits(n); stream << bits; stream << '\n'; } // ---------------------------------------------------------------------------- // Function globalAlignment() // ---------------------------------------------------------------------------- // When using different alphabets, we will internally use the pattern alphabet // for the comparison. This means that the text character is converted to the // pattern alphabet. Here, the "pattern" is the shorter source sequence. template <typename TSequenceH, typename TGapsSpecH, typename TSequenceV, typename TGapsSpecV> int _globalAlignment(Gaps<TSequenceH, TGapsSpecH> & gapsH, Gaps<TSequenceV, TGapsSpecV> & gapsV, MyersHirschberg const & algorithmTag) { // Switch horizontal and vertical gap roles, gapsV should be the shorter one // to fit into less words. if (length(source(gapsH)) < length(source(gapsV))) return _globalAlignment(gapsV, gapsH, algorithmTag); clearGaps(gapsH); clearGaps(gapsV); clearClipping(gapsH); clearClipping(gapsV); typedef int TScoreValue; // use size of unsigned int as blocksize for bit-vectors const unsigned int BLOCK_SIZE = BitsPerValue<unsigned int>::VALUE; // saves the score value that will be returned TScoreValue score,total_score = 0; typedef typename Value<TSequenceV>::Type TPatternAlphabet; typedef typename Size<TSequenceH>::Type TStringSize; typedef typename Iterator<TSequenceH const, Rooted>::Type TSequenceHIterator; typedef typename Iterator<TSequenceV const, Rooted>::Type TSequenceVIterator; typedef Gaps<TSequenceH, TGapsSpecH> TGapsH; typedef Gaps<TSequenceV, TGapsSpecV> TGapsV; typedef typename Iterator<TGapsH, Rooted>::Type TGapsHIterator; typedef typename Iterator<TGapsV, Rooted>::Type TGapsVIterator; typedef typename Iterator<Matrix<TScoreValue>, Rooted>::Type TMatrixIterator; TGapsHIterator target_0 = begin(gapsH); TGapsVIterator target_1 = begin(gapsV); TSequenceH const & x = source(gapsH); TSequenceV const & y = source(gapsV); TStringSize len_x = length(x); TStringSize len_y = length(y); // string to store the score values for the currently active cell String<TScoreValue> c_score; resize(c_score, len_x + 1, 0); // scoring-scheme specific score values TScoreValue score_match = 0; TScoreValue score_mismatch = -1; TScoreValue score_gap = -1; // additional vars unsigned i; // stack with parts of matrix that have to be processed std::stack<HirschbergSet_> to_process; HirschbergSet_ target; // myers specific vars and preprocessing unsigned int patternAlphabetSize = ValueSize<TPatternAlphabet>::VALUE; unsigned int blockCount = (len_y + BLOCK_SIZE - 1) / BLOCK_SIZE; // maximal count of blocks String<unsigned> VP; String<unsigned> VN; String<unsigned> forwardBitMask; String<unsigned> reverseBitMask; resize(VP, blockCount, std::numeric_limits<unsigned>::max()); resize(VN, blockCount, 0); // first bitMask will be constructed from the shorter sequence resize(forwardBitMask, patternAlphabetSize * blockCount, 0); resize(reverseBitMask, patternAlphabetSize * blockCount, 0); // encoding the letters as bit-vectors for (unsigned int j = 0; j < len_y; j++){ forwardBitMask[blockCount * ordValue(getValue(y,j)) + j/BLOCK_SIZE] = forwardBitMask[blockCount * ordValue(getValue(y,j)) + j/BLOCK_SIZE] | 1 << (j%BLOCK_SIZE); reverseBitMask[blockCount * ordValue(getValue(y,len_y - j - 1)) + j/BLOCK_SIZE] = reverseBitMask[blockCount * ordValue(getValue(y,len_y - j - 1)) + j/BLOCK_SIZE] | 1 << (j%BLOCK_SIZE); } HirschbergSet_ hs_complete{0, static_cast<unsigned>(len_x), 0, static_cast<unsigned>(len_y), 1}; to_process.push(hs_complete); while(!to_process.empty()) { target = to_process.top(); to_process.pop(); /* if score is zero, the whole part of the sequence can be simply skipped */ if(_score(target) == 0) { /* coukd work faster */ for(i = 0;i < (_end1(target) - _begin1(target));++i) { ++target_0; ++target_1; } #ifdef MYERS_HIRSCHBERG_VERBOSE printf("skipped %i to %i in first sequence\n",_begin1(target),_end1(target)); #endif } else if(_begin1(target) == _end1(target)) { #ifdef MYERS_HIRSCHBERG_VERBOSE std::cout << "align y " << _begin2(target) << " to " << _end2(target) << std::endl; std::cout << "align " << infix(y,_begin2(target),_end2(target)) << std::endl << std::endl; #endif for(i = 0;i < (_end2(target) - _begin2(target));++i) { insertGap(target_0); ++target_0; ++target_1; } } else if(_begin2(target) + 1 == _end2(target)) { /* ALIGN */ #ifdef MYERS_HIRSCHBERG_VERBOSE std::cout << "align x " << _begin1(target) << " to " << _end1(target) << " and y " << _begin2(target) << " to " << _end2(target) << std::endl; std::cout << "align " << infix(x,_begin1(target),_end1(target)) << " and " << infix(y,_begin2(target),_end2(target)) << std::endl << std::endl; #endif TStringSize len_1 = _end1(target) - _begin1(target); TStringSize len_2 = _end2(target) - _begin2(target); Matrix<TScoreValue> matrix_; setDimension(matrix_, 2); setLength(matrix_, 0, len_1 + 1); setLength(matrix_, 1, len_2 + 1); resize(matrix_); /* init matrix */ TSequenceHIterator xs_begin = iter(x,_begin1(target)) - 1; TSequenceHIterator xs_end = iter(x,_end1(target)) - 1; TSequenceVIterator ys_begin = iter(y,_begin2(target)) - 1; TSequenceVIterator ys_end = iter(y,_end2(target)) - 1; TSequenceHIterator xs = xs_end; TSequenceVIterator ys; TMatrixIterator col_ = end(matrix_) - 1; TMatrixIterator finger1; TMatrixIterator finger2; TScoreValue h = 0; TScoreValue border_ = score_gap; TScoreValue v = border_; //------------------------------------------------------------------------- // init finger1 = col_; *finger1 = 0; for (xs = xs_end; xs != xs_begin; --xs) { goPrevious(finger1, 0); *finger1 = border_; border_ += score_gap; } //------------------------------------------------------------------------- //fill matrix border_ = 0; for (ys = ys_end; ys != ys_begin; --ys) { TPatternAlphabet cy = *ys; h = border_; border_ += score_gap; v = border_; finger2 = col_; goPrevious(col_, 1); finger1 = col_; *finger1 = v; for (xs = xs_end; xs != xs_begin; --xs) { goPrevious(finger1, 0); goPrevious(finger2, 0); if (*xs == cy) { v = h + score_match; h = *finger2; } else { TScoreValue s1 = h + score_mismatch; h = *finger2; TScoreValue s2 = score_gap + ((h > v) ? h : v); v = (s1 > s2) ? s1 : s2; } *finger1 = v; } } // if computed the whole matrix last value of v = alignment score if(target == hs_complete) total_score = v; /* TRACE BACK */ finger1 = begin(matrix_); xs = iter(x,_begin1(target)); ys = iter(y,_begin2(target)); xs_end = iter(x,_end1(target)); ys_end = iter(y,_end2(target)); while ((xs != xs_end) && (ys != ys_end)) { bool gv; bool gh; if (*xs == *ys) { gv = gh = true; } else { TMatrixIterator it_ = finger1; goNext(it_, 0); TScoreValue v = *it_; goNext(it_, 1); TScoreValue d = *it_; it_ = finger1; goNext(it_, 1); TScoreValue h = *it_; gv = (v >= h) | (d >= h); gh = (h >= v) | (d >= v); } if (gv) { ++xs; goNext(finger1, 0); } else { insertGap(target_0); } if (gh) { ++ys; goNext(finger1, 1); } else { insertGap(target_1); } ++target_0; ++target_1; } // if x or y did not reached there end position, fill the rest with gaps while(xs != xs_end) { insertGap(target_1); ++target_0; ++target_1; ++xs; } while(ys != ys_end) { insertGap(target_0); ++target_0; ++target_1; ++ys; } /* END ALIGN */ #ifdef MYERS_HIRSCHBERG_VERBOSE std::cout << std::endl << align_ << std::endl << std::endl; #endif } else { /* --------------------------------------------------------------- Calculate cut position using extended Myers-Bitvector-Algorithm --------------------------------------------------------------- */ /* declare variables */ unsigned int X, D0, HN, HP; /* compute cut position */ unsigned mid = static_cast<int>(floor( static_cast<double>((_begin2(target) + _end2(target))/2) )); /* debug infos */ #ifdef MYERS_HIRSCHBERG_VERBOSE std::cout << "calculate cut for x " << _begin1(target) << " to " << _end1(target) << " and y " << _begin2(target) << " to " << _end2(target) << std::endl; std::cout << "calculate cut for " << infix(x,_begin1(target),_end1(target)) << " and " << infix(y,_begin2(target),_end2(target)) << std::endl; std::cout << "cut is in row " << mid << " symbol is " << getValue(x,mid-1) << std::endl << std::endl; std::cout << std::endl; _writeDebugMatrix(infix(x,_begin1(target),_end1(target)),infix(y,_begin2(target),_end2(target))); std::cout << std::endl; #endif /* compute blocks and score masks */ unsigned fStartBlock = _begin2(target) / BLOCK_SIZE; unsigned fEndBlock = (mid - 1) / BLOCK_SIZE; unsigned fSpannedBlocks = (fEndBlock - fStartBlock) + 1; unsigned int fScoreMask = 1 << ((mid - 1) % BLOCK_SIZE); unsigned int fOffSet = _begin2(target) % BLOCK_SIZE; unsigned int fSilencer = ~0; fSilencer <<= fOffSet; /* reset v-bitvectors */ std::fill(begin(VP, Standard()) + fStartBlock, begin(VP, Standard()) + fEndBlock + 1, std::numeric_limits<unsigned>::max()); std::fill(begin(VN, Standard()) + fStartBlock, begin(VN, Standard()) + fEndBlock + 1, 0); /* determine start-position and start-score */ auto pos = _begin1(target); score = (mid - _begin2(target)) * score_gap; c_score[pos] = score; /* compute with myers - forward - begin */ if(fSpannedBlocks == 1) { while (pos < _end1(target)) { X = (fSilencer & forwardBitMask[(blockCount * ordValue(static_cast<TPatternAlphabet>(getValue(x,pos)))) + fStartBlock]) | VN[fStartBlock]; D0 = ((VP[fStartBlock] + (X & VP[fStartBlock])) ^ VP[fStartBlock]) | X; HN = VP[fStartBlock] & D0; HP = VN[fStartBlock] | ~(VP[fStartBlock] | D0); X = (HP << 1) | (1 << fOffSet); VN[fStartBlock] = X & D0; VP[fStartBlock] = (HN << 1) | ~(X | D0); if (HP & fScoreMask) score--; else if (HN & fScoreMask) score++; c_score[pos + 1] = score; ++pos; } } /* end - short patten */ else { unsigned shift, currentBlock; unsigned int temp, carryD0, carryHP, carryHN; while (pos < _end1(target)) { carryD0 = carryHP = carryHN = 0; shift = blockCount * ordValue(static_cast<TPatternAlphabet>(getValue(x,pos))); // computing first the top most block X = (fSilencer & forwardBitMask[shift + fStartBlock]) | VN[fStartBlock]; temp = VP[fStartBlock] + (X & VP[fStartBlock]); carryD0 = temp < VP[fStartBlock]; D0 = (temp ^ VP[fStartBlock]) | X; HN = VP[fStartBlock] & D0; HP = VN[fStartBlock] | ~(VP[fStartBlock] | D0); X = (HP << 1) | (1 << fOffSet); carryHP = HP >> (BLOCK_SIZE - 1); VN[fStartBlock] = X & D0; temp = (HN << 1); carryHN = HN >> (BLOCK_SIZE - 1); VP[fStartBlock] = temp | ~(X | D0); // compute the remaining blocks for (currentBlock = fStartBlock + 1; currentBlock <= fEndBlock; currentBlock++) { X = forwardBitMask[shift + currentBlock] | VN[currentBlock]; temp = VP[currentBlock] + (X & VP[currentBlock]) + carryD0; carryD0 = ((carryD0) ? temp <= VP[currentBlock] : temp < VP[currentBlock]); D0 = (temp ^ VP[currentBlock]) | X; HN = VP[currentBlock] & D0; HP = VN[currentBlock] | ~(VP[currentBlock] | D0); X = (HP << 1) | carryHP; carryHP = HP >> (BLOCK_SIZE-1); VN[currentBlock] = X & D0; temp = (HN << 1) | carryHN; carryHN = HN >> (BLOCK_SIZE - 1); VP[currentBlock] = temp | ~(X | D0); } /* update score */ if (HP & fScoreMask) score--; else if (HN & fScoreMask) score++; c_score[pos + 1] = score; ++pos; } } /* end - long patten */ /* compute with myers - forward - end */ /* compute blocks and score masks */ unsigned rStartBlock = (len_y - _end2(target)) / BLOCK_SIZE; unsigned rEndBlock = (len_y - mid - 1) / BLOCK_SIZE; unsigned rSpannedBlocks = (rEndBlock - rStartBlock) + 1; unsigned int rScoreMask = 1 << ((len_y - mid - 1) % BLOCK_SIZE); unsigned int rOffSet = (len_y - _end2(target)) % BLOCK_SIZE; unsigned int rSilencer = ~0; rSilencer <<= rOffSet; /* reset v-bitvectors */ std::fill(begin(VP, Standard()) + rStartBlock, begin(VP, Standard()) + rEndBlock + 1, std::numeric_limits<unsigned>::max()); std::fill(begin(VN, Standard()) + rStartBlock, begin(VN, Standard()) + rEndBlock + 1, 0); /* determine start-position and start-score */ pos = _end1(target); score = (_end2(target) - mid) * score_gap; /* set start score */ c_score[_end1(target)] += score; /* determine optimal cut position -- score extension */ TScoreValue max = c_score[_end1(target)]; TScoreValue rmax = score; unsigned int pos_max = _end1(target); /* compute with myers - reverse - begin */ if(rSpannedBlocks == 1) { while (pos-- > _begin1(target)) { X = (rSilencer & reverseBitMask[(blockCount * ordValue(static_cast<TPatternAlphabet>(getValue(x,pos)))) + rStartBlock]) | VN[rStartBlock]; D0 = ((VP[rStartBlock] + (X & VP[rStartBlock])) ^ VP[rStartBlock]) | X; HN = VP[rStartBlock] & D0; HP = VN[rStartBlock] | ~(VP[rStartBlock] | D0); X = (HP << 1) | (1 << rOffSet); VN[rStartBlock] = X & D0; VP[rStartBlock] = (HN << 1) | ~(X | D0); if (HP & rScoreMask) --score; else if (HN & rScoreMask) ++score; c_score[pos] += score; /* check for optimality -- score extension */ if(c_score[pos]> max) { pos_max = pos; max = c_score[pos]; rmax = score; } } } /* end - short pattern */ else { unsigned shift, currentBlock; unsigned int temp, carryD0, carryHP, carryHN; while (pos-- > _begin1(target)) { carryD0 = carryHP = carryHN = 0; shift = blockCount * ordValue(static_cast<TPatternAlphabet>(getValue(x,pos))); // compute first the top most block X = (rSilencer & reverseBitMask[shift + rStartBlock]) | VN[rStartBlock]; temp = VP[rStartBlock] + (X & VP[rStartBlock]); carryD0 = temp < VP[rStartBlock]; D0 = (temp ^ VP[rStartBlock]) | X; HN = VP[rStartBlock] & D0; HP = VN[rStartBlock] | ~(VP[rStartBlock] | D0); X = (HP << 1) | (1 << rOffSet); carryHP = HP >> (BLOCK_SIZE - 1); VN[rStartBlock] = X & D0; temp = (HN << 1); carryHN = HN >> (BLOCK_SIZE - 1); VP[rStartBlock] = temp | ~(X | D0); // compute the remaining blocks for (currentBlock = rStartBlock + 1; currentBlock <= rEndBlock; currentBlock++) { X = reverseBitMask[shift + currentBlock] | VN[currentBlock]; temp = VP[currentBlock] + (X & VP[currentBlock]) + carryD0; carryD0 = ((carryD0) ? temp <= VP[currentBlock] : temp < VP[currentBlock]); D0 = (temp ^ VP[currentBlock]) | X; HN = VP[currentBlock] & D0; HP = VN[currentBlock] | ~(VP[currentBlock] | D0); X = (HP << 1) | carryHP; carryHP = HP >> (BLOCK_SIZE-1); VN[currentBlock] = X & D0; temp = (HN << 1) | carryHN; carryHN = HN >> (BLOCK_SIZE - 1); VP[currentBlock] = temp | ~(X | D0); } if (HP & rScoreMask) --score; else if (HN & rScoreMask) ++score; c_score[pos] += score; /* check for optimality -- score extension*/ if(c_score[pos] > max) { pos_max = pos; max = c_score[pos]; rmax = score; } } } /* end - long pattern */ /* compute with myers - reverse - end */ // if computed the whole matrix max = alignment score if(target == hs_complete) total_score = max; #ifdef MYERS_HIRSCHBERG_VERBOSE printf("Optimal cut is at %i and %i with forward score %i and reverse score %i\n\n",mid,pos_max,(max - rmax),rmax); #endif /* push the two computed parts of the dp-matrix on process stack */ to_process.push(HirschbergSet_{pos_max, _end1(target), mid, _end2(target), rmax}); to_process.push(HirschbergSet_{_begin1(target), pos_max, _begin2(target), mid, max - rmax}); } /* END CUT */ } return total_score; } } // namespace seqan #endif // #ifndef SEQAN_INCLUDE_SEQAN_ALIGN_GLOBAL_ALIGNMENT_MYERS_HIRSCHBERG_IMPL_H_