aho_corasick
aho_corasick copied to clipboard
remove_partial_matches only considers alphabetic characters, so a digit adjacent to a keyword is not treated as part of the word and the match is still reported?
[code ]
// Reproduction: whole-word, case-insensitive matching on a text in which
// digits sit directly in front of a keyword.
aho_corasick::trie trie;
trie.remove_overlaps().only_whole_words().case_insensitive();

// Keywords to search for.
trie.insert("great question");
trie.insert("forty-two");
trie.insert("deep thought");

// Split the input into fragments; matched fragments are italicised below.
auto tokens = trie.tokenise("123great question");

std::stringstream html;
html << "<html><body><p>";
for (const auto& token : tokens) {
    // Query the match state once and pick the surrounding tags from it.
    const bool matched = token.is_match();
    const char* open_tag = matched ? "<i>" : "";
    const char* close_tag = matched ? "</i>" : "";
    html << open_tag << token.get_fragment() << close_tag;
}
html << "</p></body></html>";
std::cout << html.str();
[output]
<html><body><p>great question</p></body></html>
That is not a whole-word match.
If remove_partial_matches is changed as shown below (isalpha ---> isalnum), it works correctly:
/**
 * Removes from collected_emits every emit that is not a whole-word match.
 *
 * An emit is kept only when both of the following hold:
 *   - it starts at offset 0, or the character just before its start is not
 *     alphanumeric according to std::isalnum;
 *   - the offset after its end equals the text size, or the character at that
 *     offset is not alphanumeric according to std::isalnum.
 * All other emits are erased in place; the relative order of the kept emits
 * is unchanged.
 *
 * @param search_text     text the emit offsets refer to.
 * @param collected_emits emits to filter; modified in place.
 */
void remove_partial_matches(string_ref_type search_text, emit_collection &collected_emits) const
{
    const size_t size = search_text.size();

    // NOTE(review): std::isalnum is only defined for values representable as
    // unsigned char (or EOF). If the character type is a signed char, bytes
    // >= 0x80 arrive here as negative values -- confirm the character type
    // and add a cast if appropriate.
    const auto is_partial = [&search_text, size](const typename emit_collection::value_type &e) {
        // Left boundary first; the right boundary is only inspected when the
        // left one passes, matching the short-circuit order of a single &&.
        if (!(e.get_start() == 0 || !std::isalnum(search_text.at(e.get_start() - 1)))) {
            return true;
        }
        return !(e.get_end() + 1 == size || !std::isalnum(search_text.at(e.get_end() + 1)));
    };

    // Single-pass erase/remove: no temporary copy of the rejected emits and
    // no repeated linear search per removed element.
    collected_emits.erase(
        std::remove_if(collected_emits.begin(), collected_emits.end(), is_partial),
        collected_emits.end());
}