aws-sdk-pandas
Athena query throws error with message "AttributeError: 'pyarrow._parquet.FileMetaData' object has no attribute 'total_byte_size'"
### Describe the bug
Using the AWS SDK for pandas to query a table with Athena fails with the message "AttributeError: 'pyarrow._parquet.FileMetaData' object has no attribute 'total_byte_size'". The error occurs when using modin.pandas, but not with the regular pandas library.
Environment: Jupyter notebook on SageMaker
Error stack trace below:
AttributeError Traceback (most recent call last)
Cell In[9], line 1
----> 1 df = wr.athena.read_sql_query('select count(distinct ti_cu_customer_id) as num_customers from loans', database='<db-name>', workgroup='<workgroup_name>')
File /opt/conda/lib/python3.10/site-packages/awswrangler/_config.py:715, in apply_configs.<locals>.wrapper(*args_raw, **kwargs)
713 del args[name]
714 args = {**args, **keywords}
--> 715 return function(**args)
File /opt/conda/lib/python3.10/site-packages/awswrangler/_utils.py:178, in validate_kwargs.<locals>.decorator.<locals>.inner(*args, **kwargs)
175 if condition_fn() and len(passed_unsupported_kwargs) > 0:
176 raise exceptions.InvalidArgument(f"{message} `{', '.join(passed_unsupported_kwargs)}`.")
--> 178 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:1081, in read_sql_query(sql, database, ctas_approach, unload_approach, ctas_parameters, unload_parameters, categories, chunksize, s3_output, workgroup, encryption, kms_key, keep_files, use_threads, boto3_session, client_request_token, athena_cache_settings, data_source, athena_query_wait_polling_delay, params, paramstyle, dtype_backend, s3_additional_kwargs, pyarrow_additional_kwargs)
1078 ctas_bucketing_info = ctas_parameters.get("bucketing_info")
1079 ctas_write_compression = ctas_parameters.get("compression")
-> 1081 return _resolve_query_without_cache(
1082 sql=sql,
1083 database=database,
1084 data_source=data_source,
1085 ctas_approach=ctas_approach,
1086 unload_approach=unload_approach,
1087 unload_parameters=unload_parameters,
1088 categories=categories,
1089 chunksize=chunksize,
1090 s3_output=s3_output,
1091 workgroup=workgroup,
1092 encryption=encryption,
1093 kms_key=kms_key,
1094 keep_files=keep_files,
1095 ctas_database=ctas_database,
1096 ctas_temp_table_name=ctas_temp_table_name,
1097 ctas_bucketing_info=ctas_bucketing_info,
1098 ctas_write_compression=ctas_write_compression,
1099 athena_query_wait_polling_delay=athena_query_wait_polling_delay,
1100 use_threads=use_threads,
1101 s3_additional_kwargs=s3_additional_kwargs,
1102 boto3_session=boto3_session,
1103 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
1104 execution_params=execution_params,
1105 dtype_backend=dtype_backend,
1106 client_request_token=client_request_token,
1107 )
File /opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:507, in _resolve_query_without_cache(sql, database, data_source, ctas_approach, unload_approach, unload_parameters, categories, chunksize, s3_output, workgroup, encryption, kms_key, keep_files, ctas_database, ctas_temp_table_name, ctas_bucketing_info, ctas_write_compression, athena_query_wait_polling_delay, use_threads, s3_additional_kwargs, boto3_session, pyarrow_additional_kwargs, execution_params, dtype_backend, client_request_token)
505 name = f"temp_table_{uuid.uuid4().hex}"
506 try:
--> 507 return _resolve_query_without_cache_ctas(
508 sql=sql,
509 database=database,
510 data_source=data_source,
511 s3_output=s3_output,
512 keep_files=keep_files,
513 chunksize=chunksize,
514 categories=categories,
515 encryption=encryption,
516 workgroup=workgroup,
517 kms_key=kms_key,
518 alt_database=ctas_database,
519 name=name,
520 ctas_bucketing_info=ctas_bucketing_info,
521 ctas_write_compression=ctas_write_compression,
522 athena_query_wait_polling_delay=athena_query_wait_polling_delay,
523 use_threads=use_threads,
524 s3_additional_kwargs=s3_additional_kwargs,
525 boto3_session=boto3_session,
526 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
527 execution_params=execution_params,
528 dtype_backend=dtype_backend,
529 )
530 finally:
531 catalog.delete_table_if_exists(database=ctas_database or database, table=name, boto3_session=boto3_session)
File /opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:345, in _resolve_query_without_cache_ctas(sql, database, data_source, s3_output, keep_files, chunksize, categories, encryption, workgroup, kms_key, alt_database, name, ctas_bucketing_info, ctas_write_compression, athena_query_wait_polling_delay, use_threads, s3_additional_kwargs, boto3_session, pyarrow_additional_kwargs, execution_params, dtype_backend)
343 ctas_query_metadata = cast(_QueryMetadata, ctas_query_info["ctas_query_metadata"])
344 _logger.debug("CTAS query metadata: %s", ctas_query_metadata)
--> 345 return _fetch_parquet_result(
346 query_metadata=ctas_query_metadata,
347 keep_files=keep_files,
348 categories=categories,
349 chunksize=chunksize,
350 use_threads=use_threads,
351 s3_additional_kwargs=s3_additional_kwargs,
352 boto3_session=boto3_session,
353 temp_table_fqn=fully_qualified_name,
354 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
355 dtype_backend=dtype_backend,
356 )
File /opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:156, in _fetch_parquet_result(query_metadata, keep_files, categories, chunksize, use_threads, boto3_session, s3_additional_kwargs, temp_table_fqn, pyarrow_additional_kwargs, dtype_backend)
154 pyarrow_additional_kwargs["categories"] = categories
155 _logger.debug("Reading Parquet result from %d paths", len(paths))
--> 156 ret = s3.read_parquet(
157 path=paths,
158 use_threads=use_threads,
159 boto3_session=boto3_session,
160 chunked=chunked,
161 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
162 dtype_backend=dtype_backend,
163 )
165 if chunked is False:
166 ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
File /opt/conda/lib/python3.10/site-packages/awswrangler/_utils.py:178, in validate_kwargs.<locals>.decorator.<locals>.inner(*args, **kwargs)
175 if condition_fn() and len(passed_unsupported_kwargs) > 0:
176 raise exceptions.InvalidArgument(f"{message} `{', '.join(passed_unsupported_kwargs)}`.")
--> 178 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/awswrangler/_config.py:715, in apply_configs.<locals>.wrapper(*args_raw, **kwargs)
713 del args[name]
714 args = {**args, **keywords}
--> 715 return function(**args)
File /opt/conda/lib/python3.10/site-packages/awswrangler/s3/_read_parquet.py:558, in read_parquet(path, path_root, dataset, path_suffix, path_ignore_suffix, ignore_empty, partition_filter, columns, validate_schema, coerce_int96_timestamp_unit, schema, last_modified_begin, last_modified_end, version_id, dtype_backend, chunked, use_threads, ray_args, boto3_session, s3_additional_kwargs, pyarrow_additional_kwargs, decryption_configuration)
543 if chunked:
544 return _read_parquet_chunked(
545 s3_client=s3_client,
546 paths=paths,
(...)
555 decryption_properties=decryption_properties,
556 )
--> 558 return _read_parquet(
559 paths,
560 path_root=path_root,
561 schema=schema,
562 columns=columns,
563 coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
564 use_threads=use_threads,
565 parallelism=ray_args.get("parallelism", -1),
566 s3_client=s3_client,
567 s3_additional_kwargs=s3_additional_kwargs,
568 arrow_kwargs=arrow_kwargs,
569 version_ids=version_ids,
570 bulk_read=bulk_read,
571 decryption_properties=decryption_properties,
572 )
File /opt/conda/lib/python3.10/site-packages/awswrangler/_distributed.py:105, in Engine.dispatch_on_engine.<locals>.wrapper(*args, **kw)
102 @wraps(func)
103 def wrapper(*args: Any, **kw: dict[str, Any]) -> Any:
104 cls.initialize(name=cls.get().value)
--> 105 return cls.dispatch_func(func)(*args, **kw)
File /opt/conda/lib/python3.10/site-packages/awswrangler/distributed/ray/modin/s3/_read_parquet.py:51, in _read_parquet_distributed(paths, path_root, schema, columns, coerce_int96_timestamp_unit, use_threads, parallelism, version_ids, s3_client, s3_additional_kwargs, arrow_kwargs, bulk_read, decryption_properties)
48 if decryption_properties:
49 dataset_kwargs["decryption_properties"] = decryption_properties
---> 51 dataset = read_datasource(
52 **_resolve_datasource_parameters(
53 bulk_read,
54 paths=paths,
55 path_root=path_root,
56 arrow_parquet_args={
57 "use_threads": use_threads,
58 "schema": schema,
59 "columns": columns,
60 "dataset_kwargs": dataset_kwargs,
61 },
62 ),
63 parallelism=parallelism,
64 )
65 return _to_modin(
66 dataset=dataset,
67 to_pandas_kwargs=arrow_kwargs,
68 ignore_index=arrow_kwargs.get("ignore_metadata"),
69 )
File /opt/conda/lib/python3.10/site-packages/ray/_private/auto_init_hook.py:21, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
18 @wraps(fn)
19 def auto_init_wrapper(*args, **kwargs):
20 auto_init_ray()
---> 21 return fn(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/ray/data/read_api.py:399, in read_datasource(datasource, parallelism, ray_remote_args, concurrency, override_num_blocks, **read_args)
389 requested_parallelism, _, inmemory_size = _autodetect_parallelism(
390 parallelism,
391 ctx.target_max_block_size,
(...)
394 placement_group=cur_pg,
395 )
397 # TODO(hchen/chengsu): Remove the duplicated get_read_tasks call here after
398 # removing LazyBlockList code path.
--> 399 read_tasks = datasource_or_legacy_reader.get_read_tasks(requested_parallelism)
401 read_op_name = f"Read{datasource.get_name()}"
403 block_list = LazyBlockList(
404 read_tasks,
405 read_op_name=read_op_name,
406 ray_remote_args=ray_remote_args,
407 owned_by_consumer=False,
408 )
File /opt/conda/lib/python3.10/site-packages/awswrangler/distributed/ray/datasources/arrow_parquet_datasource.py:341, in ArrowParquetDatasource.get_read_tasks(self, parallelism)
338 if len(fragments) <= 0:
339 continue
--> 341 meta = self._meta_provider(
342 paths, # type: ignore[arg-type]
343 self._inferred_schema,
344 num_fragments=len(fragments),
345 prefetched_metadata=metadata,
346 )
347 # If there is a filter operation, reset the calculated row count,
348 # since the resulting row count is unknown.
349 if self._arrow_parquet_args.get("filter") is not None:
File /opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:70, in FileMetadataProvider.__call__(self, paths, schema, **kwargs)
64 def __call__(
65 self,
66 paths: List[str],
67 schema: Optional[Union[type, "pyarrow.lib.Schema"]],
68 **kwargs,
69 ) -> BlockMetadata:
---> 70 return self._get_block_metadata(paths, schema, **kwargs)
File /opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:309, in DefaultParquetMetadataProvider._get_block_metadata(self, paths, schema, num_fragments, prefetched_metadata)
292 def _get_block_metadata(
293 self,
294 paths: List[str],
(...)
298 prefetched_metadata: Optional[List["_ParquetFileFragmentMetaData"]],
299 ) -> BlockMetadata:
300 if (
301 prefetched_metadata is not None
302 and len(prefetched_metadata) == num_fragments
(...)
305 # Fragment metadata was available, construct a normal
306 # BlockMetadata.
307 block_metadata = BlockMetadata(
308 num_rows=sum(m.num_rows for m in prefetched_metadata),
--> 309 size_bytes=sum(m.total_byte_size for m in prefetched_metadata),
310 schema=schema,
311 input_files=paths,
312 exec_stats=None,
313 ) # Exec stats filled in later.
314 else:
315 # Fragment metadata was not available, construct an empty
316 # BlockMetadata.
317 block_metadata = BlockMetadata(
318 num_rows=None,
319 size_bytes=None,
(...)
322 exec_stats=None,
323 )
File /opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:309, in <genexpr>(.0)
292 def _get_block_metadata(
293 self,
294 paths: List[str],
(...)
298 prefetched_metadata: Optional[List["_ParquetFileFragmentMetaData"]],
299 ) -> BlockMetadata:
300 if (
301 prefetched_metadata is not None
302 and len(prefetched_metadata) == num_fragments
(...)
305 # Fragment metadata was available, construct a normal
306 # BlockMetadata.
307 block_metadata = BlockMetadata(
308 num_rows=sum(m.num_rows for m in prefetched_metadata),
--> 309 size_bytes=sum(m.total_byte_size for m in prefetched_metadata),
310 schema=schema,
311 input_files=paths,
312 exec_stats=None,
313 ) # Exec stats filled in later.
314 else:
315 # Fragment metadata was not available, construct an empty
316 # BlockMetadata.
317 block_metadata = BlockMetadata(
318 num_rows=None,
319 size_bytes=None,
(...)
322 exec_stats=None,
323 )
AttributeError: 'pyarrow._parquet.FileMetaData' object has no attribute 'total_byte_size'
### How to Reproduce
Below is a code snippet to reproduce the error:
!pip install awswrangler[ray,modin]
import modin.pandas as pd
import awswrangler as wr
wr.engine.initialize()
df = wr.athena.read_sql_query('select count(distinct ti_cu_customer_id) as num_customers from loans', database='my-db', workgroup='my-workgroup')
### Expected behavior
_No response_
### Your project
_No response_
### Screenshots
_No response_
### OS
Linux
### Python version
3.10
### AWS SDK for pandas version
3.7.3
### Additional context
_No response_
Hi @leo4ever, can you share the output of pip list please? (pyarrow/ray/modin versions are mainly what I'm looking for)
@kukushking - Sorry for the delay. Please find the output of pip list below:
`Package Version
aiohttp 3.9.5 aiohttp-cors 0.7.0 aiosignal 1.3.1 alabaster 0.7.16 annotated-types 0.6.0 anyio 4.3.0 appdirs 1.4.4 archspec 0.2.2 argon2-cffi 23.1.0 argon2-cffi-bindings 21.2.0 arrow 1.3.0 asgiref 3.8.1 astroid 2.15.8 astropy 6.0.1 astropy-iers-data 0.2024.4.15.2.45.49 asttokens 2.4.1 async-lru 2.0.4 async-timeout 4.0.3 atomicwrites 1.4.1 attrs 23.2.0 Authlib 1.3.0 Automat 22.10.0 autopep8 2.0.4 autovizwidget 0.20.4 awscli 1.32.84 awswrangler 3.7.3 Babel 2.14.0 bcrypt 4.1.2 beautifulsoup4 4.12.3 binaryornot 0.4.4 black 24.4.0 bleach 6.1.0 blinker 1.7.0 bokeh 3.4.1 boltons 23.1.1 boto3 1.34.84 botocore 1.34.84 Brotli 1.1.0 brotlipy 0.7.0 cached-property 1.5.2 cachetools 5.3.3 certifi 2024.2.2 cffi 1.16.0 chardet 5.2.0 charset-normalizer 3.3.2 click 8.1.7 cloudpickle 2.2.1 colorama 0.4.4 colorcet 3.1.0 colorful 0.5.6 comm 0.2.2 conda 23.11.0 conda-content-trust 0.2.0 conda-libmamba-solver 23.12.0 conda-package-handling 2.2.0 conda_package_streaming 0.9.0 constantly 15.1.0 contextlib2 21.6.0 contourpy 1.2.1 cookiecutter 2.6.0 cryptography 42.0.5 cycler 0.12.1 cytoolz 0.12.3 daal4py 2024.3.0 dask 2024.4.1 dask-expr 1.0.11 debugpy 1.8.1 decorator 5.1.1 defusedxml 0.7.1 diff-match-patch 20230430 dill 0.3.8 distlib 0.3.8 distributed 2024.4.1 distro 1.8.0 Django 5.0.4 docker 6.1.3 docstring-to-markdown 0.15 docutils 0.16 dparse 0.6.4b0 entrypoints 0.4 et-xmlfile 1.1.0 exceptiongroup 1.2.0 executing 2.0.1 fastapi 0.110.1 fastjsonschema 2.19.1 filelock 3.13.4 flake8 7.0.0 Flask 2.3.3 fonttools 4.51.0 fqdn 1.5.1 frozenlist 1.4.1 fsspec 2024.3.1 future 1.0.0 gevent 23.9.0.post1 gmpy2 2.1.2 google-api-core 2.19.0 google-auth 2.29.0 google-pasta 0.2.0 googleapis-common-protos 1.63.1 greenlet 3.0.3 grpcio 1.64.1 gssapi 1.8.3 h11 0.14.0 h2 4.1.0 h5py 3.11.0 hdijupyterutils 0.20.4 holoviews 1.18.3 hpack 4.0.0 httpcore 1.0.5 httpx 0.27.0 hyperframe 6.0.1 hyperlink 21.0.0 idna 3.6 imagecodecs 2024.1.1 imageio 2.34.0 imagesize 1.4.1 importlib-metadata 6.11.0 importlib_resources 6.4.0 incremental 22.10.0 inflection 0.5.1 iniconfig 2.0.0 intervaltree 3.1.0 ipykernel 6.29.3 ipython 8.23.0 ipython-genutils 0.2.0 ipywidgets 7.6.5 isoduration 20.11.0 isort 5.13.2 itsdangerous 2.1.2 jaraco.classes 3.4.0 jaraco.context 4.3.0 jaraco.functools 4.0.0 jedi 0.19.1 jeepney 0.8.0 jellyfish 1.0.3 Jinja2 3.1.3 jmespath 1.0.1 joblib 1.4.0 json5 0.9.25 jsonpatch 1.33 jsonpointer 2.4 jsonschema 4.21.1 jsonschema-specifications 2023.12.1 jupyter 1.0.0 jupyter_client 7.4.9 jupyter-console 6.6.3 jupyter_core 5.7.2 jupyter-events 0.10.0 jupyter-lsp 2.2.5 jupyter_server 2.14.0 jupyter_server_terminals 0.5.3 jupyterlab 4.1.6 jupyterlab_pygments 0.3.0 jupyterlab_server 2.26.0 jupyterlab_widgets 3.0.10 keyring 25.1.0 kiwisolver 1.4.5 krb5 0.5.1 lazy-object-proxy 1.10.0 libmambapy 1.5.5 lief 0.14.1 linkify-it-py 2.0.3 llvmlite 0.42.0 locket 1.0.0 lxml 5.1.0 lz4 4.3.3 mamba 1.5.5 Markdown 3.6 markdown-it-py 3.0.0 MarkupSafe 2.1.5 marshmallow 3.21.1 matplotlib 3.8.4 matplotlib-inline 0.1.6 mccabe 0.7.0 mdit-py-plugins 0.4.0 mdurl 0.1.2 memray 1.12.0 menuinst 2.0.1 mistune 0.8.4 mock 5.1.0 modin 0.26.1 more-itertools 10.2.0 mpmath 1.3.0 msgpack 1.0.7 multidict 6.0.5 multiprocess 0.70.16 munkres 1.1.4 mypy-extensions 1.0.0 nb_conda_kernels 2.3.1 nbclassic 1.0.0 nbclient 0.10.0 nbconvert 6.5.3 nbformat 5.10.4 nest-asyncio 1.5.5 nltk 3.8.1 nose 1.3.7 notebook 6.5.6 notebook_shim 0.2.4 numba 0.59.1 numexpr 2.9.0 numpy 1.26.4 numpydoc 1.7.0 opencensus 0.11.4 opencensus-context 0.1.3 openpyxl 3.1.2 overrides 7.7.0 packaging 23.2 pandas 2.1.4 
pandocfilters 1.5.0 panel 1.4.1 papermill 2.5.0 param 2.1.0 parso 0.8.4 partd 1.4.1 pathos 0.3.2 pathspec 0.12.1 patsy 0.5.6 pexpect 4.9.0 pickleshare 0.7.5 pillow 10.3.0 pip 24.0 pkgutil_resolve_name 1.3.10 platformdirs 4.1.0 plotly 5.19.0 pluggy 1.4.0 ply 3.11 pox 0.3.4 ppft 1.7.6.8 prometheus_client 0.20.0 prompt-toolkit 3.0.42 proto-plus 1.23.0 protobuf 4.25.3 psutil 5.9.8 ptyprocess 0.7.0 pure-eval 0.2.2 pure-sasl 0.6.2 py-spy 0.3.14 pyarrow 15.0.2 pyarrow-hotfix 0.6 pyasn1 0.6.0 pyasn1_modules 0.4.0 pycodestyle 2.11.1 pycosat 0.6.6 pycparser 2.21 pydantic 2.7.0 pydantic_core 2.18.1 pydocstyle 6.3.0 pyerfa 2.0.1.4 pyflakes 3.2.0 pyfunctional 1.5.0 Pygments 2.17.2 PyHive 0.7.0 pylint 2.17.7 pylint-venv 3.0.3 pyls-spyder 0.4.0 pyodbc 5.1.0 pyOpenSSL 24.0.0 pyparsing 3.1.2 PyQt5 5.15.9 PyQt5-sip 12.12.2 PyQtWebEngine 5.15.4 PySocks 1.7.1 pyspnego 0.9.1 pytest 8.1.1 python-dateutil 2.9.0 python-json-logger 2.0.7 python-lsp-black 2.0.0 python-lsp-jsonrpc 1.1.2 python-lsp-server 1.10.1 python-slugify 8.0.4 pytoolconfig 1.2.5 pytz 2024.1 pyviz_comms 3.0.1 pyxdg 0.28 PyYAML 6.0.1 pyzmq 24.0.1 QDarkStyle 3.2.3 qstylizer 0.2.2 QtAwesome 1.2.3 qtconsole 5.5.1 QtPy 2.4.1 ray 2.23.0 referencing 0.34.0 regex 2023.12.25 requests 2.31.0 requests-kerberos 0.14.0 rfc3339-validator 0.1.4 rfc3986-validator 0.1.1 rich 13.7.1 rope 1.13.0 rpds-py 0.18.0 rsa 4.7.2 Rtree 1.2.0 ruamel.yaml 0.18.5 ruamel.yaml.clib 0.2.7 ruamel-yaml-conda 0.15.80 s3fs 0.4.2 s3transfer 0.10.1 safety-schemas 0.0.2 sagemaker 2.215.0 sagemaker-data-insights 0.3.3 sagemaker-datawrangler 0.4.3 sagemaker-headless-execution-driver 0.0.13 sagemaker-scikit-learn-extension 2.5.0 sagemaker-studio-analytics-extension 0.0.20 sagemaker-studio-sparkmagic-lib 0.1.4 sasl 0.3.1 schema 0.7.5 scikit-learn 1.4.2 scipy 1.13.0 seaborn 0.13.2 SecretStorage 3.3.3 Send2Trash 1.8.3 service-identity 21.1.0 setuptools 69.5.1 shellingham 1.5.4 sip 6.7.12 six 1.16.0 smart-open 7.0.4 smclarify 0.5 smdebug-rulesconfig 1.0.1 sniffio 1.3.1 snowballstemmer 2.2.0 sortedcontainers 2.4.0 soupsieve 2.5 sparkmagic 0.20.4 Sphinx 7.2.6 sphinxcontrib-applehelp 1.0.8 sphinxcontrib-devhelp 1.0.6 sphinxcontrib-htmlhelp 2.0.5 sphinxcontrib-jsmath 1.0.1 sphinxcontrib-qthelp 1.0.7 sphinxcontrib-serializinghtml 1.1.10 spyder 5.5.3 spyder-kernels 2.5.1 SQLAlchemy 2.0.29 sqlparse 0.4.4 stack-data 0.6.2 starlette 0.37.2 statsmodels 0.14.1 sympy 1.12 tabulate 0.9.0 tblib 3.0.0 tenacity 8.2.3 terminado 0.18.1 text-unidecode 1.3 textdistance 4.5.0 textual 0.64.0 threadpoolctl 3.4.0 three-merge 0.1.1 thrift 0.20.0 thrift-sasl 0.4.3 tinycss2 1.2.1 toml 0.10.2 tomli 2.0.1 tomlkit 0.12.4 toolz 0.12.1 tornado 6.4 tqdm 4.66.1 traitlets 5.14.2 truststore 0.8.0 Twisted 24.3.0 typer 0.12.3 types-python-dateutil 2.9.0.20240316 typing_extensions 4.11.0 typing-utils 0.1.0 tzdata 2024.1 uc-micro-py 1.0.3 ujson 5.9.0 unicodedata2 15.1.0 uri-template 1.3.0 urllib3 2.2.1 virtualenv 20.26.2 w3lib 2.1.2 watchdog 4.0.0 wcwidth 0.2.13 webcolors 1.13 webencodings 0.5.1 websocket-client 1.7.0 Werkzeug 2.3.8 whatthepatch 1.0.5 wheel 0.43.0 widgetsnbextension 3.5.2 wrapt 1.16.0 wurlitzer 3.0.3 xlrd 2.0.1 xyzservices 2024.4.0 yapf 0.40.1 yarl 1.9.4 zict 3.0.0 zipp 3.17.0 zope.event 5.0 zope.interface 6.3 zstandard 0.22.0`
@leo4ever The error you're encountering appears to be due to an incompatibility between pyarrow and the libraries awswrangler, modin, and ray. Specifically, the pyarrow._parquet.FileMetaData object has no total_byte_size attribute, but Ray's Parquet metadata provider expects one on each prefetched metadata object when awswrangler reads the Athena results through modin.
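For reference, total_byte_size exists in pyarrow at the row-group level rather than on FileMetaData itself, and the type hint in the traceback shows the provider expects Ray's _ParquetFileFragmentMetaData wrappers rather than raw FileMetaData objects. Below is a minimal sketch to confirm where the attribute lives; the local file path is a placeholder:
import pyarrow.parquet as pq
# Any local Parquet file works here -- "example.parquet" is just a placeholder.
meta = pq.ParquetFile("example.parquet").metadata
print(type(meta))                        # <class 'pyarrow._parquet.FileMetaData'>
print(hasattr(meta, "total_byte_size"))  # False -- the attribute Ray is looking for
# The size information does exist, but per row group:
print(sum(meta.row_group(i).total_byte_size for i in range(meta.num_row_groups)))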
The following steps might help:
- Check version compatibility: ensure that the versions of awswrangler, pyarrow, modin, and ray are compatible with each other (the version-check sketch after the install commands below can help capture what is currently installed).
!pip install -U awswrangler[ray,modin]
!pip install -U pyarrow
!pip install -U modin[ray]
!pip install -U ray
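A quick way to record the versions that matter for this issue (a minimal sketch; importlib.metadata is part of the standard library on Python 3.10):
from importlib.metadata import version
# Print the packages most relevant to this report so they can be compared
# against the awswrangler release notes and dependency constraints.
for pkg in ("awswrangler", "pyarrow", "ray", "modin", "pandas"):
    print(f"{pkg}=={version(pkg)}")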
- Use regular pandas for Athena queries: since the issue is specific to the modin/Ray read path, you can run the query with regular pandas (which avoids the distributed Parquet reader entirely) and then convert the DataFrame to modin if required:
import pandas as pd
import awswrangler as wr
df_pandas = wr.athena.read_sql_query('select count(distinct ti_cu_customer_id) as num_customers from loans', database='my-db', workgroup='my-workgroup')
import modin.pandas as mpd
df_modin = mpd.DataFrame(df_pandas)
Or, if you want to stick with the modin approach, a third option is to modify the _fetch_parquet_result method:
# Sketch of a modification to _fetch_parquet_result in awswrangler/athena/_read.py
# (the file shown in the traceback). The attribute paths into modin internals
# below are assumptions and may not match the installed modin version.
import modin.pandas

def _fetch_parquet_result(query_metadata, keep_files, categories, chunksize, use_threads,
                          boto3_session, s3_additional_kwargs, temp_table_fqn,
                          pyarrow_additional_kwargs, dtype_backend):
    paths = query_metadata["OutputLocation"]  # assumed source of the result paths
    chunked = chunksize is not None  # assumed; the original derives `chunked` from `chunksize`
    arrow_kwargs = pyarrow_additional_kwargs or {}
    if categories:
        arrow_kwargs["categories"] = categories
    ret = s3.read_parquet(  # module-level import already used by awswrangler itself
        path=paths,
        use_threads=use_threads,
        boto3_session=boto3_session,
        chunked=chunked,
        pyarrow_additional_kwargs=arrow_kwargs,
        dtype_backend=dtype_backend,
    )
    # Workaround for the missing 'total_byte_size' attribute: fall back to 0 when a
    # partition's metadata does not expose it. Note that `_data`, `_modin_frame` and
    # `_partitions` are private modin attributes.
    if isinstance(ret, modin.pandas.DataFrame):
        ret._data.total_byte_size = lambda: sum(
            m.metadata.total_byte_size if hasattr(m.metadata, "total_byte_size") else 0
            for m in ret._data._modin_frame._partitions
        )
    if chunked is False:
        ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
    return ret
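Alternatively, if you only need this query without the distributed read path, it may be possible to switch awswrangler back to the single-node engine for the call. This is a sketch assuming the wr.engine.set / wr.memory_format.set switches described in the "at scale" documentation are available in your build; they should be applied before the Ray engine is initialized in the session:
import awswrangler as wr

# Force the non-distributed engine and pandas memory format (assumed API from the docs).
wr.engine.set("python")
wr.memory_format.set("pandas")

df = wr.athena.read_sql_query(
    "select count(distinct ti_cu_customer_id) as num_customers from loans",
    database="my-db",
    workgroup="my-workgroup",
)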
Hope this helps. Please let me know if it works. Thanks!