manticoresearch
manticoresearch copied to clipboard
When there are many text fields, highlighting breaks if a search term appears in "later" text fields
When there are several text fields (39 in my case), highlighting seems to break if a search term is found in the text fields declared towards the end of the config.
MRE Here are two example indexes which are exactly the same, except that in one, the contentid field is declared first and the titleid field last. In the other, it is the opposite.
In the index where the contentid field is declared first, searches including terms found in the contentid field highlight correctly, but searches with terms from the titleid field break. Note that in the broken case highlighting fails in every field, not just in the titleid field.
test.csv: I have attached an example test.csv. The Python script used to run the MRE is below.
Config of the two indexes is also given below.
import csv
import manticoresearch
import requests
# Minimal reproduction: load test.csv into two RT indexes that differ only in
# the declaration order of the contentid/titleid fields, then run the same
# highlighted searches against both and print the raw responses.

# Official client, pointed at the local Manticore HTTP endpoint.
config = manticoresearch.Configuration(
    host="http://127.0.0.1:9308"
)
client = manticoresearch.ApiClient(config)
indexApi = manticoresearch.IndexApi(client)
searchApi = manticoresearch.SearchApi(client)
utilsApi = manticoresearch.UtilsApi(client)

# Both indexes receive exactly the same documents and the same queries.
index_names = ('titleidfirst', 'contentidfirst')

# newline='' is what the csv module expects so that it can handle line
# endings (including newlines embedded in quoted fields) itself.
with open('test.csv', 'r', encoding='utf-8', errors='ignore', newline='') as f:
    reader = csv.DictReader(f)
    data = list(reader)
print(data)

# Insert every CSV row into both indexes, titleidfirst before contentidfirst.
for row in data:
    for ixname in index_names:
        resp = indexApi.insert({"index": ixname, "doc": row})
        print(resp)

# One plain query, plus one query restricted to each of the two id fields.
queries = ['medieval', 'medieval (@contentid engi1)', 'medieval (@titleid engt1)']
url = 'http://localhost:9308/search'

# The context manager closes the session's pooled connections when done.
with requests.session() as session:
    for ixname in index_names:
        print(ixname)
        for squery in queries:
            print(squery)
            payload = {
                "index": ixname,
                "query": {"query_string": squery},
                "highlight": {"limit": 0, "encoder": "default"},
            }
            resp = session.post(url, json=payload)
            # strict=False is forwarded to the JSON decoder so that control
            # characters inside string values do not abort parsing.
            print(resp.json(strict=False))
Config of the indexes
# RT index with 39 full-text fields: contentid is declared first, titleid last.
index contentidfirst {
charset_table = non_cjk
type = rt
path = D:\Code\manticore\indexdir\contentidfirst\contentidfirst
# Full-text fields, in declaration order (the order is what matters for this report).
rt_field = contentid
rt_field = content
rt_field = title
rt_field = collection
rt_field = sgcontent
rt_field = access
rt_field = jira
rt_field = type
rt_field = info
rt_field = transcript
rt_field = eventtype
rt_field = published
rt_field = spublished
rt_field = language
rt_field = event
rt_field = status
rt_field = child
rt_field = parent
rt_field = ppublished
rt_field = cpublished
rt_field = tag
rt_field = summary
rt_field = speaker
rt_field = evententities
rt_field = aboutentities
rt_field = markup
rt_field = attachment
rt_field = tkeyword
rt_field = mkeyword
rt_field = venue
rt_field = location
rt_field = city
rt_field = state
rt_field = country
rt_field = area
rt_field = mstatus
rt_field = markuper
rt_field = userperm
rt_field = titleid
# Remaining index-level settings.
rt_mem_limit = 128M
preopen = 1
min_infix_len = 2
html_strip = 1
index_sp = 1
bigram_index = both_freq
bigram_freq_words = a, am, an, and, are, as, at, be, but, by, can, did, do, for, i, if, in, is, it, its, no, not, of, on, or, so, to, was
}
# RT index with 39 full-text fields: titleid is declared first, contentid last.
index titleidfirst {
charset_table = non_cjk
type = rt
path = D:\Code\manticore\indexdir\titleidfirst\titleidfirst
# Full-text fields, in declaration order (the order is what matters for this report).
rt_field = titleid
rt_field = content
rt_field = title
rt_field = collection
rt_field = sgcontent
rt_field = access
rt_field = jira
rt_field = type
rt_field = info
rt_field = transcript
rt_field = eventtype
rt_field = published
rt_field = spublished
rt_field = language
rt_field = event
rt_field = status
rt_field = child
rt_field = parent
rt_field = ppublished
rt_field = cpublished
rt_field = tag
rt_field = summary
rt_field = speaker
rt_field = evententities
rt_field = aboutentities
rt_field = markup
rt_field = attachment
rt_field = tkeyword
rt_field = mkeyword
rt_field = venue
rt_field = location
rt_field = city
rt_field = state
rt_field = country
rt_field = area
rt_field = mstatus
rt_field = markuper
rt_field = userperm
rt_field = contentid
# Remaining index-level settings.
rt_mem_limit = 128M
preopen = 1
min_infix_len = 2
html_strip = 1
index_sp = 1
bigram_index = both_freq
bigram_freq_words = a, am, an, and, are, as, at, be, but, by, can, did, do, for, i, if, in, is, it, its, no, not, of, on, or, so, to, was
}
➤ Dmitrii Kuzmenkov commented:
MRE
mysql> drop table if exists sample;
mysql>
mysql> create table sample (content text, something1 text, something2 text, something3 text, something4 text, something5 text, something6 text, something7 text, something8 text, something9 text, something10 text, something11 text, something12 text, something13 text, something14 text, something15 text, something16 text, something17 text, something18 text, something19 text, something20 text, something21 text, something22 text, something23 text, something24 text, something25 text, something26 text, something27 text, something28 text, something29 text, something30 text, something31 text, content_id text);
mysql>
mysql> insert into sample (content_id, something1, something2, something3, something4, something5, something6, something7, something8, something9, something10, something11, something12, something13, something14, something15, something16, something17, something18, something19, something20, something21, something22, something23, something24, something25, something26, something27, something28, something29, something30, something31, content) values ( 'content', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'NULL', 'This is word that we will search for');
mysql>
mysql> select highlight() as h from sample where match('word');
+---------------------------------------------+
| h |
+---------------------------------------------+
| This is <b>word</b> that we will search for |
+---------------------------------------------+
mysql>
mysql> select highlight() as h from sample where match('word (@content_id content)');
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| h |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| This is word that we will search for | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | content |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
mysql>
Conclusion
The issue is reproducible when there is a gap of 31 or more fields after the field containing the first match. With fewer than 31 fields, it is not reproducible. With just one text field containing lengthy text, it also works fine.
If we swap the positions of content_id and content and repeat the test, the previously failing sample query produces the expected result.