tantivy icon indicating copy to clipboard operation
tantivy copied to clipboard

Searcher is being overly strict

Open kakserpom opened this issue 7 months ago • 3 comments

Describe the bug

  • What did you do? I am building an ngram index with a single field to search names.
  • What happened? Searcher seems to be overly strict.
  • What was expected? I expected both tests to pass.

Which version of tantivy are you using? 0.24.1

To Reproduce

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{Index, IndexReader, IndexWriter};

/// Name matcher backed by a tantivy index over a fixed list of strings.
///
/// Built by `TativyMatcher::new`, queried with `TativyMatcher::find_one`.
pub struct TativyMatcher {
    /// The candidate strings, in the order they were supplied to `new`.
    /// Indexed documents refer back to entries here by position.
    haystack: Vec<String>,
    /// Parser that turns a needle into a query against the `name` field.
    query_parser: QueryParser,
    /// Stored field holding each document's position in `haystack`.
    id: Field,
    /// Reader used to obtain a searcher for each lookup.
    reader: IndexReader,
}
impl TativyMatcher {
    /// Builds an in-RAM index over `haystack` and returns a matcher for it.
    ///
    /// Every haystack entry becomes one document with two fields:
    /// * `id` — the entry's position in `haystack` (stored only), and
    /// * `name` — the upper-cased entry, tokenized into ngrams of exactly
    ///   `ngram_length` characters (min and max length are both
    ///   `ngram_length`).
    ///
    /// # Errors
    ///
    /// Propagates any error from constructing the ngram tokenizer, creating
    /// the index writer, adding a document, committing, or opening the reader.
    pub fn new<T>(
        haystack: impl IntoIterator<Item = T>,
        ngram_length: usize,
    ) -> tantivy::Result<Self>
    where
        T: Into<String>,
    {
        let mut schema_builder = Schema::builder();
        // Documents write this field with `add_u64` and `find_one` reads it
        // back with `as_u64`, so the schema declares it as u64 as well.
        let id = schema_builder.add_u64_field("id", STORED);
        let name = schema_builder.add_text_field(
            "name",
            TextOptions::default()
                .set_indexing_options(
                    TextFieldIndexing::default()
                        .set_tokenizer("ngram3")
                        .set_index_option(IndexRecordOption::WithFreqsAndPositions),
                )
                .set_stored(),
        );
        let index = Index::create_in_ram(schema_builder.build());
        // The tokenizer is registered under the name the `name` field refers
        // to; the name stays "ngram3" regardless of `ngram_length`.
        index.tokenizers().register(
            "ngram3",
            NgramTokenizer::new(ngram_length, ngram_length, false)?,
        );
        let mut index_writer: IndexWriter = index.writer(50_000_000)?;
        let haystack = haystack
            .into_iter()
            .map(|x| x.into())
            .collect::<Vec<String>>();
        // Pair every entry with its position so the id can be mapped back to
        // `haystack` when a document is returned by a search.
        for (position, item) in (0u64..).zip(&haystack) {
            let mut document = TantivyDocument::default();
            document.add_u64(id, position);
            document.add_text(name, item.to_uppercase());
            index_writer.add_document(document)?;
        }
        index_writer.commit()?;
        let reader = index.reader()?;
        let query_parser = QueryParser::for_index(&index, vec![name]);
        Ok(Self {
            haystack,
            reader,
            query_parser,
            id,
        })
    }

    /// Searches the index for `needle` and returns up to `top_k` matches.
    ///
    /// The needle is upper-cased (matching how names were indexed) and
    /// stripped of apostrophes, curly quotes and hyphens before being handed
    /// to the query parser. Hits whose stored id cannot be read or mapped
    /// back into the haystack are silently skipped.
    ///
    /// NOTE(review): per the issue discussion, a word containing an ngram
    /// that is absent from the index (e.g. `IKK`) does not match at all, so
    /// this is not a fuzzy search — confirm against the query parser's
    /// handling of multi-token words.
    ///
    /// # Errors
    ///
    /// Propagates query-parsing and search errors.
    pub fn find_one<'a>(&'a self, needle: &'a str, top_k: usize) -> tantivy::Result<Needle<'a>> {
        let query = needle
            .to_uppercase()
            .replace(['\'', '’', '“', '”', '-'], "");

        let query = self.query_parser.parse_query(query.as_str())?;
        let searcher = self.reader.searcher();
        let matches = searcher
            .search(&query, &TopDocs::with_limit(top_k))?
            .into_iter()
            .filter_map(|(score, doc_address)| {
                // The stored id was produced from a haystack position in
                // `new`, so converting it back to `usize` does not truncate.
                let haystack_idx = searcher
                    .doc::<TantivyDocument>(doc_address)
                    .ok()?
                    .get_first(self.id)?
                    .as_u64()? as usize;
                Some(MatchEntry {
                    confidence: f64::from(score),
                    haystack: self.haystack.get(haystack_idx)?,
                    haystack_idx,
                })
            })
            .collect();
        Ok(Needle { needle, matches })
    }
}
/// Result of a single `TativyMatcher::find_one` lookup.
#[derive(Debug)]
pub struct Needle<'a> {
    /// The search string exactly as the caller passed it in.
    pub needle: &'a str,
    /// Matching haystack entries, in the order the search returned them;
    /// empty when nothing matched.
    pub matches: Vec<MatchEntry<'a>>,
}
/// One haystack entry returned for a needle.
#[derive(Debug)]
pub struct MatchEntry<'a> {
    /// The matched haystack string, borrowed from the matcher.
    pub haystack: &'a str,
    /// The search score of the hit, widened to `f64`.
    pub confidence: f64,
    /// Position of the matched string in the matcher's haystack.
    pub haystack_idx: usize,
}

// Compiled only for test builds so the helper, constants and import below do
// not produce unused/dead-code warnings in regular builds.
#[cfg(test)]
mod tests {
    use crate::tativy_matcher::TativyMatcher;

    /// The single name that gets indexed.
    const HAYSTACK: &str = "VIKTORIJA NIKIFOROVA";
    /// `HAYSTACK` with one letter dropped from the surname.
    const NEEDLE_1: &str = "VIKTORIJA NIKFOROVA";
    /// `NEEDLE_1` with an extra `K` in the first name as well.
    const NEEDLE_2: &str = "VIKKTORIJA NIKFOROVA";

    /// Builds a matcher over `HAYSTACK` using 3-character ngrams.
    fn get_matcher() -> TativyMatcher {
        TativyMatcher::new([HAYSTACK], 3).unwrap()
    }

    /// A needle with a typo in only one of its two words must still match.
    #[test]
    fn test_tativy_1() {
        let matcher = get_matcher();
        let results = matcher.find_one(NEEDLE_1, 5).unwrap();
        println!("results: {:?}", results);
        assert!(!results.matches.is_empty());
    }

    /// A needle with a typo in both words is expected to match as well; this
    /// is the case reported as failing.
    #[test]
    fn test_tativy_2() {
        let matcher = get_matcher();
        let results = matcher.find_one(NEEDLE_2, 5).unwrap();
        println!("results: {:?}", results);
        assert!(!results.matches.is_empty());
    }
}

Output:

running 2 tests
test tativy_matcher::tests::test_tativy_1 ... ok
test tativy_matcher::tests::test_tativy_2 ... FAILED

failures:

---- tativy_matcher::tests::test_tativy_2 stdout ----
results: Needle { needle: "VIKKTORIJA NIKFOROVA", matches: [] }

thread 'tativy_matcher::tests::test_tativy_2' panicked at src/tativy_matcher.rs:121:9:
assertion failed: !results.matches.is_empty()
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace

kakserpom avatar Jul 04 '25 16:07 kakserpom

Seems to work like expected to me.

IKK is an ngram that doesn't match.

If you want fuzzy search, this is the recommended way https://github.com/quickwit-oss/tantivy/blob/main/examples/fuzzy_search.rs. It's also possible with ngram, but you'd have to do it yourself.

PSeitz-dd avatar Jul 14 '25 05:07 PSeitz-dd

@PSeitz-dd but other ngrams match, don't they? FuzzyTermQuery is too limited :-(

kakserpom avatar Jul 14 '25 12:07 kakserpom

NgramTokenizer::new(ngram_length, ngram_length, false)?,

Usually you have different lengths for min and max ngrams.

You can check what the ngram tokenizer outputs to see which tokens you get (there was some weird behavior there)

PSeitz avatar Jul 14 '25 13:07 PSeitz