PyAthena icon indicating copy to clipboard operation
PyAthena copied to clipboard

Getting signed integer is greater than maximum error when querying large table

Open maky-hnou opened this issue 1 year ago • 5 comments

When I query a large table (about 5 million rows), I get this error:

pyathena.error.OperationalError: signed integer is greater than maximum

Here is the code I am using:

from pyathena import connect as athena_connect
from pyathena.pandas.cursor import PandasCursor

AWS_ACCESS_KEY_ID = 'my_key_id'
AWS_SECRET_ACCESS_KEY = 'my_secret_key'
ATHENA_SCHEMA = 'my_schema'
REGION_NAME = 'us-west-2'
WORK_GROUP = 'my_work_group'


schema_cursor = athena_connect(
            region_name=REGION_NAME,
            schema_name=ATHENA_SCHEMA,
            work_group=WORK_GROUP,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            cursor_class=PandasCursor,
        ).cursor()

data = schema_cursor.execute(f"""SELECT * FROM my_table""").as_pandas()

print(data.shape)

This is the output I get:

Failed to read s3://path_to_csv.csv.
Traceback (most recent call last):
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pyathena/pandas/result_set.py", line 273, in _read_csv
    return pd.read_csv(
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/util/_decorators.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/util/_decorators.py", line 331, in wrapper
    return func(*args, **kwargs)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/io/parsers/readers.py", line 950, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/io/parsers/readers.py", line 611, in _read
    return parser.read(nrows)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/io/parsers/readers.py", line 1778, in read
    ) = self._engine.read(  # type: ignore[attr-defined]
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py", line 230, in read
    chunks = self._reader.read_low_memory(nrows)
  File "pandas/_libs/parsers.pyx", line 808, in pandas._libs.parsers.TextReader.read_low_memory
  File "pandas/_libs/parsers.pyx", line 866, in pandas._libs.parsers.TextReader._read_rows
  File "pandas/_libs/parsers.pyx", line 852, in pandas._libs.parsers.TextReader._tokenize_rows
  File "pandas/_libs/parsers.pyx", line 1965, in pandas._libs.parsers.raise_parser_error
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
    yield fs.pop().result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 437, in result
    return self.__get_result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/usr/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pyathena/filesystem/s3.py", line 395, in _get_object
    return ranges[0], cast(bytes, response["Body"].read())
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/botocore/response.py", line 99, in read
    chunk = self._raw_stream.read(amt)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/urllib3/response.py", line 518, in read
    data = self._fp.read() if not fp_closed else b""
  File "/usr/lib/python3.8/http/client.py", line 472, in read
    s = self._safe_read(self.length)
  File "/usr/lib/python3.8/http/client.py", line 613, in _safe_read
    data = self.fp.read(amt)
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "/usr/lib/python3.8/ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "/usr/lib/python3.8/ssl.py", line 1099, in read
    return self._sslobj.read(len, buffer)
OverflowError: signed integer is greater than maximum
Traceback (most recent call last):
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pyathena/pandas/result_set.py", line 273, in _read_csv
    return pd.read_csv(
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/util/_decorators.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/util/_decorators.py", line 331, in wrapper
    return func(*args, **kwargs)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/io/parsers/readers.py", line 950, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/io/parsers/readers.py", line 611, in _read
    return parser.read(nrows)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/io/parsers/readers.py", line 1778, in read
    ) = self._engine.read(  # type: ignore[attr-defined]
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py", line 230, in read
    chunks = self._reader.read_low_memory(nrows)
  File "pandas/_libs/parsers.pyx", line 808, in pandas._libs.parsers.TextReader.read_low_memory
  File "pandas/_libs/parsers.pyx", line 866, in pandas._libs.parsers.TextReader._read_rows
  File "pandas/_libs/parsers.pyx", line 852, in pandas._libs.parsers.TextReader._tokenize_rows
  File "pandas/_libs/parsers.pyx", line 1965, in pandas._libs.parsers.raise_parser_error
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
    yield fs.pop().result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 437, in result
    return self.__get_result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/usr/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pyathena/filesystem/s3.py", line 395, in _get_object
    return ranges[0], cast(bytes, response["Body"].read())
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/botocore/response.py", line 99, in read
    chunk = self._raw_stream.read(amt)
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/urllib3/response.py", line 518, in read
    data = self._fp.read() if not fp_closed else b""
  File "/usr/lib/python3.8/http/client.py", line 472, in read
    s = self._safe_read(self.length)
  File "/usr/lib/python3.8/http/client.py", line 613, in _safe_read
    data = self.fp.read(amt)
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "/usr/lib/python3.8/ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "/usr/lib/python3.8/ssl.py", line 1099, in read
    return self._sslobj.read(len, buffer)
OverflowError: signed integer is greater than maximum

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "py_doc.py", line 24, in <module>
    data = schema_cursor.execute(f"""SELECT * FROM my_table""").as_pandas()
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pyathena/pandas/cursor.py", line 173, in execute
    self.result_set = AthenaPandasResultSet(
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pyathena/pandas/result_set.py", line 141, in __init__
    df = self._as_pandas()
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pyathena/pandas/result_set.py", line 382, in _as_pandas
    df = self._read_csv()
  File "/home/hani.yousfi/.virtualenvs/my_env/lib/python3.8/site-packages/pyathena/pandas/result_set.py", line 297, in _read_csv
    raise OperationalError(*e.args) from e
pyathena.error.OperationalError: signed integer is greater than maximum

My dependencies:

boto3==1.26.74
botocore==1.29.74
pandas==1.5.3
psycopg2-binary==2.9.5
pyathena==2.23.0
urllib3==1.26.9
python --version 
Python 3.8.10

Further info:
The table contains 15 columns, most of them are with timestamps and strings values.
The column with timestamps contains unix format timestamps and 'NOT APPLICABLE' when the value is empty.

I was not getting this issue with lower versions of pyathena and pandas. But I had to update them due to vulnerabilities.

Do you know how to fix this?
Or how to query the table in chunks then merge them?

maky-hnou avatar Mar 01 '23 16:03 maky-hnou