seqan icon indicating copy to clipboard operation
seqan copied to clipboard

MyersBitVector (Banded) returns wrong Score in some cases.

Open svnbgnk opened this issue 4 years ago • 4 comments

The issue only seems to occure when length(reference) == length(pattern)

#include <seqan/basic.h>
#include <seqan/sequence.h>
#include <seqan/align.h>
#include <seqan/index.h>
#include <seqan/seq_io.h>

#include <iostream>
#include <seqan/align.h>

using namespace seqan;
using namespace std;

template<typename TString, typename TNeedle, typename TError>
inline bool verification(TString & text, TNeedle & needle, TError maxErrors)
{
    typedef Finder<TString>                                     TMyersFinder;
    typedef PatternState_<TNeedle, Myers<AlignTextBanded<FindInfix, NMatchesNone_, NMatchesNone_>, True, void> > TPatternState;
    TPatternState patternState;


    typedef PatternState_<TNeedle, Myers<AlignTextBanded<FindPrefix, NMatchesNone_, NMatchesNone_>, True, void> > TPatternStatePrefix;
    TPatternStatePrefix patternPrefixState;

    uint8_t minErrors = maxErrors + 1;

    std::cout << "Seqs: \n" << "reference: " << text << "\nneedle:    " << needle << "\n";

    TMyersFinder finder(text);
    uint8_t mErrors = maxErrors * 4;
    int startPos = 99;
    while (find(finder, needle, patternPrefixState, -static_cast<int>(maxErrors * 4)))
    {
        int currentEnd = position(finder) + 1;
        int currentErrors = -getScore(patternPrefixState);
        std::cout << "Loop errors: " << currentErrors << "\tselected end: " << currentEnd << "\n";
        if (currentErrors <= mErrors)
        {
            mErrors = currentErrors;
            startPos = currentEnd;
        }
    }
    std::cout << "BEST error: " << (int)mErrors << "\n";

    int score = globalAlignmentScore(needle, text, MyersBitVector());
    std::cout << "score global Alignment function :" << score << "\n";

}

int main(int argc, char const * argv[])
{

    // The bug only occurres when length(reference) == length(needle);

    Dna5String reference = "TAAATATTATT";
    Dna5String needle =    "TAAAATATTAT";
    verification(reference, needle, 3);

// This works
//     Dna5String needle =      "TATTATAAAAT";
//     Dna5String reference =   "TTATTATAAAT";

    return 0;
}

Cout:

Seqs: 
reference: TAAATATTATT
needle:    TAAAATATTAT
Loop errors: 4	selected end: 11
BEST error: 4
score global Alignment function :-2

svnbgnk avatar Aug 26 '19 14:08 svnbgnk

One more thing I noticed, where always the wrong score is reported. Dna5String reference = "TAAATATTAT"; Dna5String needle __= "TAAATAATTAT"; TAAATA--TTAT |||||||||||||||...|||||||| TAAATAATTAT BEST error: 4 score global Alignment function :- 1 Needle aligns to an infix of the reference (the whole reference in this case). I am not if is possible for myers to report the correct score, since never the needle is longer than the reference.

svnbgnk avatar Aug 30 '19 16:08 svnbgnk

Here is an even shorter example, it seem to be connect to band and having more Insertions than deletion in the needle: Dna5String reference = "ACGTA"; Dna5String needle = "ACCGT"; verification(reference, needle, 3);

Dna5String reference = "ACGTA";
Dna5String needle =    "ACCGTA";
verification(reference, needle, 3);

svnbgnk avatar Sep 10 '19 13:09 svnbgnk

The non-banded version works flawless:

#include <seqan/basic.h>
#include <seqan/sequence.h>
#include <seqan/align.h>
#include <seqan/index.h>
#include <seqan/seq_io.h>

#include <iostream>
#include <seqan/align.h>

using namespace seqan;
using namespace std;

template<typename TString, typename TNeedle, typename TError>
inline bool verification(TString & text, TNeedle & needle, TError maxErrors)
{
    typedef Finder<TString>                                     TMyersFinder;
    typedef Pattern<TNeedle const, Myers<FindInfix> >      TMyersPattern;
    typedef typename PatternState<TMyersPattern>::Type          TPatternState;

    typedef Pattern<TNeedle const, Myers<FindPrefix> >          TMyersPrefixPattern;
    typedef typename PatternState<TMyersPrefixPattern>::Type          TPatternPrefixState;

    TPatternState patternState;
    TPatternPrefixState patternPrefixState;

    uint8_t minErrors = maxErrors + 1;

    std::cout << "Seqs: \n" << "reference: " << text << "\nneedle:    " << needle << "\n";

    TMyersFinder finder(text);
    TMyersPrefixPattern pattern = needle;
    uint8_t mErrors = maxErrors * 4;
    int startPos = 99;
//     find(finder, pattern, patternState, 5);
    while (find(finder, pattern, patternPrefixState, 5))
    {
        int currentEnd = position(finder) + 1;
        int currentErrors = -getScore(patternPrefixState);
        std::cout << "Loop errors: " << currentErrors << "\tselected end: " << currentEnd << "\n";
        if (currentErrors <= mErrors)
        {
            mErrors = currentErrors;
            startPos = currentEnd;
        }
    }
    std::cout << "BEST error: " << (int)mErrors << "\n";

    int score = globalAlignmentScore(needle, text, MyersBitVector());
    std::cout << "score global Alignment function :" << score << "\n";

}

int main(int argc, char const * argv[])
{
    {
    Dna5String reference = "ACGTA";
    Dna5String needle =    "ACGGGGTA";
    verification(reference, needle, 3);
    }

    return 0;
}

svnbgnk avatar Sep 10 '19 14:09 svnbgnk

I guess this should be fixed in SeqAn3.

rrahn avatar Feb 09 '20 18:02 rrahn