
bug: Numpy Array serialization/deserialization is slow

Open judahrand opened this issue 1 year ago • 4 comments

Describe the bug

For large payloads, BentoML's NumPy Protobuf serialization/deserialization is ~1000x slower, and its JSON serialization/deserialization is ~3000x slower, compared to either:

  1. Pickle with protocol 5 and out-of-band buffers (PEP 574)
  2. Converting to a PyArrow Tensor and using Arrow IPC to store/send the array

To reproduce

#%%
from typing import Any

import numpy as np
import numpy.typing as npt
from numpy.testing import assert_array_equal

arr = np.random.random((10000, 3, 4))
# %%
import pyarrow


def pyarrow_serialize_numpy(arr: npt.NDArray[Any]) -> pyarrow.Buffer:
    tensor = pyarrow.Tensor.from_numpy(arr)
    sink = pyarrow.BufferOutputStream()
    with pyarrow.output_stream(sink) as stream:
        pyarrow.ipc.write_tensor(tensor, stream)
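    # getvalue() returns a pyarrow.Buffer (zero-copy view); call .to_pybytes()
    # if raw Python bytes are required.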
    return sink.getvalue()


def pyarrow_deserialize_numpy(data: "bytes | pyarrow.Buffer") -> npt.NDArray[Any]:
    buf = memoryview(data)
    with pyarrow.input_stream(buf) as stream:
        tensor = pyarrow.ipc.read_tensor(stream)
    return tensor.to_numpy()

assert_array_equal(arr, pyarrow_deserialize_numpy(pyarrow_serialize_numpy(arr)))
# %%
%%timeit
data = pyarrow_serialize_numpy(arr)
res = pyarrow_deserialize_numpy(data)

#> 28 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

#%%
import io


def np_serialize_numpy(arr: npt.NDArray[Any]) -> bytes:
    buf = io.BytesIO()
    np.save(buf, arr, allow_pickle=False, fix_imports=False)
    return buf.getvalue()


def np_deserialize_numpy(data: bytes) -> npt.NDArray[Any]:
    return np.load(io.BytesIO(data))

assert_array_equal(arr, np_deserialize_numpy(np_serialize_numpy(arr)))
# %%
%%timeit
data = np_serialize_numpy(arr)
res = np_deserialize_numpy(data)

#> 153 µs ± 9.35 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

# %%
import json


def json_serialize_numpy(arr: npt.NDArray[Any]) -> str:
    return json.dumps(arr.tolist())


def json_deserialize_numpy(data: str) -> npt.NDArray[Any]:
    return np.array(json.loads(data))

assert_array_equal(arr, json_deserialize_numpy(json_serialize_numpy(arr)))

# %%
%%timeit
data = json_serialize_numpy(arr)
res = json_deserialize_numpy(data)

#> 74.9 ms ± 892 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# %%
import pickle


def pickle_serialize_numpy(arr: npt.NDArray[Any]) -> list[bytes]:
    bufs = []

    def callback(buf):
        bufs.append(buf)

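    # Protocol 5 hands the raw data buffer to buffer_callback out-of-band
    # (PEP 574), so the pickle stream itself stays tiny.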
    pickled = pickle.dumps(arr, 5, buffer_callback=callback)
    return [pickled] + [bytes(buf) for buf in bufs]


def pickle_deserialize_numpy(data: list[bytes]) -> npt.NDArray[Any]:
    return pickle.loads(data[0], buffers=data[1:])

assert_array_equal(arr, pickle_deserialize_numpy(pickle_serialize_numpy(arr)))

# %%
%%timeit
data = pickle_serialize_numpy(arr)
res = pickle_deserialize_numpy(data)

#> 24.3 µs ± 794 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

# %%
from bentoml._internal.io_descriptors.numpy import (
    npdtype_to_dtypepb_map,
    npdtype_to_fieldpb_map,
    dtypepb_to_fieldpb_map,
    dtypepb_to_npdtype_map,
    fieldpb_to_npdtype_map,
    pb,
)

def bentoml_serialize_numpy(obj: npt.NDArray[Any]) -> pb.NDArray:
    fieldpb = npdtype_to_fieldpb_map()[obj.dtype]
    dtypepb = npdtype_to_dtypepb_map()[obj.dtype]
    return pb.NDArray(
        dtype=dtypepb,
        shape=tuple(obj.shape),
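        # NOTE: ravel().tolist() boxes every element as a Python object;
        # this per-element conversion is where most of the time goes.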
        **{fieldpb: obj.ravel().tolist()},
    )


def bentoml_deserialize_numpy(field: pb.NDArray) -> npt.NDArray[Any]:
    if field.dtype == pb.NDArray.DTYPE_UNSPECIFIED:
        dtype = None
    else:
        try:
            dtype = dtypepb_to_npdtype_map()[field.dtype]
        except KeyError:
            raise ValueError(f"{field.dtype} is invalid.") from None
    if dtype is not None:
        values_array = getattr(
            field, dtypepb_to_fieldpb_map()[field.dtype]
        )
    else:
        fieldpb = [
            f.name for f, _ in field.ListFields() if f.name.endswith("_values")
        ]
        if len(fieldpb) == 0:
            # input message doesn't have any fields.
            return np.empty(shape=field.shape or 0)
        elif len(fieldpb) > 1:
            # more than one *_values field is set in the proto.
            raise ValueError(
                f"Array contents can only be one of given values key. Use one of '{fieldpb}' instead.",
            ) from None

        dtype = fieldpb_to_npdtype_map()[fieldpb[0]]
        values_array = getattr(field, fieldpb[0])
    try:
        array = np.array(values_array, dtype=dtype)
    except ValueError:
        array = np.array(values_array)

    # We will try to reshape the array if ``shape`` is provided.
    # Note that np.reshape returns a view of the original data whenever
    # possible, so we avoid creating an extra copy of the freshly built array.
    if field.shape:
        array = np.reshape(array, field.shape)

    return array

assert_array_equal(arr, bentoml_deserialize_numpy(bentoml_serialize_numpy(arr)))

# %%
%%timeit
data = bentoml_serialize_numpy(arr)
res = bentoml_deserialize_numpy(data)

#> 20.6 ms ± 82.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

# %%

Expected behavior

Serialization/deserialization should be fast. I propose moving towards the PyArrow based serialization approach, as it offers the best combination of portability (not just Python) and speed.
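For illustration, here is a minimal sketch of what that could look like, assuming a hypothetical pb.ArrowNDArray message with a single ipc_payload bytes field (the message and field names are placeholders, not an existing BentoML API). Since Arrow IPC is self-describing, dtype and shape travel inside the payload and need no extra proto fields:

import pyarrow


def arrow_pb_serialize_numpy(arr):
    tensor = pyarrow.Tensor.from_numpy(arr)
    sink = pyarrow.BufferOutputStream()
    with pyarrow.output_stream(sink) as stream:
        pyarrow.ipc.write_tensor(tensor, stream)
    # pb.ArrowNDArray is hypothetical; one opaque bytes field carries the
    # self-describing Arrow IPC payload (dtype, shape, and data).
    return pb.ArrowNDArray(ipc_payload=sink.getvalue().to_pybytes())


def arrow_pb_deserialize_numpy(msg):
    with pyarrow.input_stream(memoryview(msg.ipc_payload)) as stream:
        return pyarrow.ipc.read_tensor(stream).to_numpy()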

Environment

NA

judahrand avatar Aug 17 '23 10:08 judahrand

A potentially even better alternative would be to serialize via DLPack.

#%%
import numpy as np
from numpy.testing import assert_array_equal

arr = np.random.random((10000, 3, 4))

# %%
import dlpack

dltensor = dlpack.from_numpy(arr)
res = dlpack.to_numpy(dltensor)
assert_array_equal(res, arr)

# %%
%%timeit
dltensor = dlpack.from_numpy(arr)
res = dlpack.to_numpy(dltensor)

#> 18.2 µs ± 533 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

It even comes out slightly faster than Arrow or Pickle in this microbenchmark, and it has the advantage of effectively adding support for any DLPack-compatible NdArray (TensorFlow, PyTorch, CuPy, NumPy, etc.). A downside is that we would need to implement our own Protobuf representation of DLPack.
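For reference, NumPy 1.22+ implements the DLPack protocol natively (ndarray.__dlpack__ and np.from_dlpack), so the zero-copy exchange itself needs no third-party package; a minimal sketch:

import numpy as np

arr = np.random.random((10000, 3, 4))

# np.from_dlpack consumes any object implementing __dlpack__/__dlpack_device__,
# including NumPy arrays themselves; the round trip shares memory (no copy).
res = np.from_dlpack(arr)
assert np.shares_memory(arr, res)

Keep in mind that DLPack is an in-memory exchange protocol: shipping a tensor across processes still requires framing the buffer, shape, strides, and dtype, which is exactly the Protobuf work described above.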

judahrand avatar Aug 17 '23 14:08 judahrand

Thanks for benchmarking this! We are currently moving towards a 2.0 version of the IO descriptors, and we will include this as one of the design considerations. CC: @frostming

jianshen92 avatar Sep 22 '23 16:09 jianshen92

> A potentially even better alternative would be to serialize via DLPack. [...]

Hi @judahrand, how did you install dlpack to use it this way? Was it a pip package or your own development?

decadance-dance avatar May 05 '24 14:05 decadance-dance

I am still experiencing this: numpy serialization is very slow, several times slower than PIL.Image. Any update?

gallardorafael avatar May 10 '24 04:05 gallardorafael