hello-ltr icon indicating copy to clipboard operation
hello-ltr copied to clipboard

TermStatQuery in OpenSearch notebook not working

Open wrigleyDan opened this issue 2 years ago • 0 comments

This is more likely a bug in the OpenSearch plugin than in this repository, so I'm posting it here mainly for visibility.

In step two of the notebook notebooks/opensearch/tmdb/term-stat-query.ipynb, the feature tsq_expr_title_tfidf cannot be logged and the following exception is thrown:

---------------------------------------------------------------------------
RequestError                              Traceback (most recent call last)
Input In [3], in <cell line: 6>()
      6 with judgments_open('data/title_judgments.txt') as judgment_list:
      7     for qid, query_judgments in groupby(judgment_list, key=lambda j: j.qid):
----> 8         ftr_logger.log_for_qid(judgments=query_judgments, 
      9                                qid=qid,
     10                                keywords=judgment_list.keywords(qid))
     12 df = judgments_to_dataframe(ftr_logger.logged)
     13 df

File ~/IdeaProjects/OpenSearchWork/hello-ltr/ltr/log.py:56, in FeatureLogger.log_for_qid(self, qid, judgments, keywords)
     48 keywords = re.sub('([^\s\w]|_)+', '', keywords)
     50 params = {
     51     "keywords": keywords,
     52     "fuzzy_keywords": ' '.join([x + '~' for x in keywords.split(' ')]),
     53     "keywordsList": [keywords] # Needed by TSQ for the time being
     54 }
---> 56 res = self.client.log_query(self.index, self.feature_set, ids, params)
     58 # Add feature back to each judgment
     59 for doc in res:

File ~/IdeaProjects/OpenSearchWork/hello-ltr/ltr/client/opensearch_client.py:145, in OpenSearchClient.log_query(self, index, featureset, ids, params)
    142 if ids is not None:
    143     params["query"]["bool"]["must"] = terms_query
--> 145 resp = self.es.search(index=index, body=params)
    146 # resp_msg(msg="Searching {} - {}".format(index, str(terms_query)[:20]), resp=SearchResp(resp))
    148 matches = []

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/opensearchpy/client/utils.py:177, in query_params.<locals>._wrapper.<locals>._wrapped(*args, **kwargs)
    175     if p in kwargs:
    176         params[p] = kwargs.pop(p)
--> 177 return func(*args, params=params, headers=headers, **kwargs)

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/opensearchpy/client/__init__.py:1544, in OpenSearch.search(self, body, index, params, headers)
   1541 if "from_" in params:
   1542     params["from"] = params.pop("from_")
-> 1544 return self.transport.perform_request(
   1545     "POST",
   1546     _make_path(index, "_search"),
   1547     params=params,
   1548     headers=headers,
   1549     body=body,
   1550 )

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/opensearchpy/transport.py:407, in Transport.perform_request(self, method, url, headers, params, body)
    405             raise e
    406     else:
--> 407         raise e
    409 else:
    410     # connection didn't fail, confirm it's live status
    411     self.connection_pool.mark_live(connection)

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/opensearchpy/transport.py:368, in Transport.perform_request(self, method, url, headers, params, body)
    365 connection = self.get_connection()
    367 try:
--> 368     status, headers_response, data = connection.perform_request(
    369         method,
    370         url,
    371         params,
    372         body,
    373         headers=headers,
    374         ignore=ignore,
    375         timeout=timeout,
    376     )
    378     # Lowercase all the header names for consistency in accessing them.
    379     headers_response = {
    380         header.lower(): value for header, value in headers_response.items()
    381     }

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/opensearchpy/connection/http_urllib3.py:275, in Urllib3HttpConnection.perform_request(self, method, url, params, body, timeout, ignore, headers)
    271 if not (200 <= response.status < 300) and response.status not in ignore:
    272     self.log_request_fail(
    273         method, full_url, url, orig_body, duration, response.status, raw_data
    274     )
--> 275     self._raise_error(
    276         response.status,
    277         raw_data,
    278         self.get_response_headers(response).get("content-type"),
    279     )
    281 self.log_request_success(
    282     method, full_url, url, orig_body, response.status, raw_data, duration
    283 )
    285 return response.status, response.getheaders(), raw_data

File /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/opensearchpy/connection/base.py:300, in Connection._raise_error(self, status_code, raw_data, content_type)
    297 except (ValueError, TypeError) as err:
    298     logger.warning("Undecodable raw error response from server: %s", err)
--> 300 raise HTTP_EXCEPTIONS.get(status_code, TransportError)(
    301     status_code, error_message, additional_info
    302 )

RequestError: RequestError(400, 'search_phase_execution_exception', 'Cannot create query while parsing feature [tsq_expr_title_tfidf]')

Removing the feature and re-running the notebook avoids the error. The feature works in the Elasticsearch (ES) version of the notebook, so this is likely a bug in the OpenSearch (OS) plugin.

wrigleyDan avatar Aug 22 '23 12:08 wrigleyDan