MLServer
MLServer copied to clipboard
Input encoding fails for a string-type column in pandas.DataFrame
Hi all,
I'm using mlflow with mlserver, and my model has a pandas.DataFrame input schema that includes a column of string type.
However, this doesn't appear to be supported by mlserver.codecs.pandas:
[nav] In [24]: client.predict(data)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[24], line 1
----> 1 client.predict(data)
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/kai_client/__init__.py:116, in KaiClient.predict(self, x, input_encoder, output_decoder)
113 try:
114 enc_x = input_encoder(x)
--> 116 r = requests.post(self.infer_url, json=enc_x, timeout=self.timeout)
117 logger.debug(dump.dump_all(r).decode("utf-8")) # for debugging
118 request_output = r.json()
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/requests/api.py:117, in post(url, data, json, **kwargs)
105 def post(url, data=None, json=None, **kwargs):
106 r"""Sends a POST request.
107
108 :param url: URL for the new :class:`Request` object.
(...)
114 :rtype: requests.Response
115 """
--> 117 return request('post', url, data=data, json=json, **kwargs)
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/requests/api.py:61, in request(method, url, **kwargs)
57 # By using the 'with' statement we are sure the session is closed, thus we
58 # avoid leaving sockets open which can trigger a ResourceWarning in some
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/requests/sessions.py:528, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
515 # Create the Request.
516 req = Request(
517 method=method.upper(),
518 url=url,
(...)
526 hooks=hooks,
527 )
--> 528 prep = self.prepare_request(req)
530 proxies = proxies or {}
532 settings = self.merge_environment_settings(
533 prep.url, proxies, stream, verify, cert
534 )
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/requests/sessions.py:456, in Session.prepare_request(self, request)
453 auth = get_netrc_auth(request.url)
455 p = PreparedRequest()
--> 456 p.prepare(
457 method=request.method.upper(),
458 url=request.url,
459 files=request.files,
460 data=request.data,
461 json=request.json,
462 headers=merge_setting(request.headers, self.headers, dict_class=CaseInsensitiveDict),
463 params=merge_setting(request.params, self.params),
464 auth=merge_setting(auth, self.auth),
465 cookies=merged_cookies,
466 hooks=merge_hooks(request.hooks, self.hooks),
467 )
468 return p
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/requests/models.py:319, in PreparedRequest.prepare(self, method, url, headers, files, data, params, auth, cookies, hooks, json)
317 self.prepare_headers(headers)
318 self.prepare_cookies(cookies)
--> 319 self.prepare_body(data, files, json)
320 self.prepare_auth(auth, url)
322 # Note that prepare_auth must be last to enable authentication schemes
323 # such as OAuth to work on a fully prepared request.
324
325 # This MUST go after prepare_auth. Authenticators could add a hook
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/requests/models.py:471, in PreparedRequest.prepare_body(self, data, files, json)
468 content_type = 'application/json'
470 try:
--> 471 body = complexjson.dumps(json, allow_nan=False)
472 except ValueError as ve:
473 raise InvalidJSONError(ve, request=self)
File ~/.pyenv/versions/3.10.12/lib/python3.10/json/__init__.py:238, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
232 if cls is None:
233 cls = JSONEncoder
234 return cls(
235 skipkeys=skipkeys, ensure_ascii=ensure_ascii,
236 check_circular=check_circular, allow_nan=allow_nan, indent=indent,
237 separators=separators, default=default, sort_keys=sort_keys,
--> 238 **kw).encode(obj)
File ~/.pyenv/versions/3.10.12/lib/python3.10/json/encoder.py:199, in JSONEncoder.encode(self, o)
195 return encode_basestring(o)
196 # This doesn't pass the iterator directly to ''.join() because the
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
File ~/.pyenv/versions/3.10.12/lib/python3.10/json/encoder.py:257, in JSONEncoder.iterencode(self, o, _one_shot)
252 else:
253 _iterencode = _make_iterencode(
254 markers, self.default, _encoder, self.indent, floatstr,
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
File ~/.pyenv/versions/3.10.12/lib/python3.10/json/encoder.py:179, in JSONEncoder.default(self, o)
160 def default(self, o):
161 """Implement this method in a subclass such that it returns
162 a serializable object for ``o``, or calls the base implementation
163 (to raise a ``TypeError``).
(...)
177
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
TypeError: Object of type bytes is not JSON serializable
[nav] In [25]: data = pd.read_csv('data.csv', index_col=0).convert_dtypes()
...: data[[col for col in data.columns if col != "machine"]] = data[[col for col in data.columns if col != "machine"]].astype("float32")
...: data.info()
...: display(data)
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 0 to 9
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 machine 10 non-null string
1 dhomo_ebl_eml 10 non-null float32
2 dhomo_eml_hbl 10 non-null float32
3 dhomo_hbl_etl 10 non-null float32
4 dhomo_hil_htl 10 non-null float32
5 dhomo_htl_ebl 10 non-null float32
6 dlumo_ebl_eml 10 non-null float32
7 dlumo_eml_hbl 10 non-null float32
8 dlumo_hbl_etl 10 non-null float32
9 dlumo_hil_htl 10 non-null float32
10 ln_lt 10 non-null float32
11 init 10 non-null float32
dtypes: float32(11), string(1)
memory usage: 600.0 bytes
machine dhomo_ebl_eml dhomo_eml_hbl dhomo_hbl_etl dhomo_hil_htl dhomo_htl_ebl dlumo_ebl_eml dlumo_eml_hbl dlumo_hbl_etl dlumo_hil_htl ln_lt init
0 d2 *** *** 0.0 0.0 0.0 *** *** 0.0 0.0 *** ***
.....................(omitted)........................
[ins] In [26]: from mlserver.codecs import PandasCodec
...: v2_request = PandasCodec.encode_request(data)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[26], line 2
1 from mlserver.codecs import PandasCodec
----> 2 v2_request = PandasCodec.encode_request(data)
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/mlserver/codecs/pandas.py:133, in PandasCodec.encode_request(cls, payload, use_bytes, **kwargs)
129 @classmethod
130 def encode_request(
131 cls, payload: pd.DataFrame, use_bytes: bool = True, **kwargs
132 ) -> InferenceRequest:
--> 133 outputs = cls.encode_outputs(payload, use_bytes=use_bytes)
135 return InferenceRequest(
136 parameters=Parameters(content_type=cls.ContentType),
137 inputs=[
(...)
146 ],
147 )
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/mlserver/codecs/pandas.py:125, in PandasCodec.encode_outputs(cls, payload, use_bytes)
121 @classmethod
122 def encode_outputs(
123 cls, payload: pd.DataFrame, use_bytes: bool = True
124 ) -> List[ResponseOutput]:
--> 125 return [
126 _to_response_output(payload[col], use_bytes=use_bytes) for col in payload
127 ]
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/mlserver/codecs/pandas.py:126, in <listcomp>(.0)
121 @classmethod
122 def encode_outputs(
123 cls, payload: pd.DataFrame, use_bytes: bool = True
124 ) -> List[ResponseOutput]:
125 return [
--> 126 _to_response_output(payload[col], use_bytes=use_bytes) for col in payload
127 ]
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/mlserver/codecs/pandas.py:36, in _to_response_output(series, use_bytes)
35 def _to_response_output(series: pd.Series, use_bytes: bool = True) -> ResponseOutput:
---> 36 datatype = to_datatype(series.dtype)
37 data = series.tolist()
38 content_type = None
File ~/.pyenv/versions/3.10.12/envs/kai/lib/python3.10/site-packages/mlserver/codecs/numpy.py:60, in to_datatype(dtype)
56 if as_str not in _NumpyToDatatype:
57 # If not present, try with kind
58 as_str = getattr(dtype, "kind")
---> 60 datatype = _NumpyToDatatype[as_str]
62 return datatype
KeyError: 'O'
Do you have any plans to support string-type columns in a pandas.DataFrame input schema? Thanks.
Hey @jinserk ,
String columns should be supported by the Pandas codec.
Could you share a minimal example so that we can reproduce it on our side?