aws-sdk-pandas
Athena query throws error with message "AttributeError: 'pyarrow._parquet.FileMetaData' object has no attribute 'total_byte_size'"
### Describe the bug
Using the AWS SDK for pandas to query a table with Athena fails with the message "AttributeError: 'pyarrow._parquet.FileMetaData' object has no attribute 'total_byte_size'". The error occurs when using modin.pandas, but not with the regular pandas library.
Environment: Jupyter notebook on SageMaker
Error stack trace below:
AttributeError Traceback (most recent call last)
Cell In[9], line 1
----> 1 df = wr.athena.read_sql_query('select count(distinct ti_cu_customer_id) as num_customers from loans', database='<db-name>', workgroup='<workgroup_name>')
File /opt/conda/lib/python3.10/site-packages/awswrangler/_config.py:715, in apply_configs.<locals>.wrapper(*args_raw, **kwargs)
713 del args[name]
714 args = {**args, **keywords}
--> 715 return function(**args)
File /opt/conda/lib/python3.10/site-packages/awswrangler/_utils.py:178, in validate_kwargs.<locals>.decorator.<locals>.inner(*args, **kwargs)
175 if condition_fn() and len(passed_unsupported_kwargs) > 0:
176 raise exceptions.InvalidArgument(f"{message} `{', '.join(passed_unsupported_kwargs)}`.")
--> 178 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:1081, in read_sql_query(sql, database, ctas_approach, unload_approach, ctas_parameters, unload_parameters, categories, chunksize, s3_output, workgroup, encryption, kms_key, keep_files, use_threads, boto3_session, client_request_token, athena_cache_settings, data_source, athena_query_wait_polling_delay, params, paramstyle, dtype_backend, s3_additional_kwargs, pyarrow_additional_kwargs)
1078 ctas_bucketing_info = ctas_parameters.get("bucketing_info")
1079 ctas_write_compression = ctas_parameters.get("compression")
-> 1081 return _resolve_query_without_cache(
1082 sql=sql,
1083 database=database,
1084 data_source=data_source,
1085 ctas_approach=ctas_approach,
1086 unload_approach=unload_approach,
1087 unload_parameters=unload_parameters,
1088 categories=categories,
1089 chunksize=chunksize,
1090 s3_output=s3_output,
1091 workgroup=workgroup,
1092 encryption=encryption,
1093 kms_key=kms_key,
1094 keep_files=keep_files,
1095 ctas_database=ctas_database,
1096 ctas_temp_table_name=ctas_temp_table_name,
1097 ctas_bucketing_info=ctas_bucketing_info,
1098 ctas_write_compression=ctas_write_compression,
1099 athena_query_wait_polling_delay=athena_query_wait_polling_delay,
1100 use_threads=use_threads,
1101 s3_additional_kwargs=s3_additional_kwargs,
1102 boto3_session=boto3_session,
1103 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
1104 execution_params=execution_params,
1105 dtype_backend=dtype_backend,
1106 client_request_token=client_request_token,
1107 )
File /opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:507, in _resolve_query_without_cache(sql, database, data_source, ctas_approach, unload_approach, unload_parameters, categories, chunksize, s3_output, workgroup, encryption, kms_key, keep_files, ctas_database, ctas_temp_table_name, ctas_bucketing_info, ctas_write_compression, athena_query_wait_polling_delay, use_threads, s3_additional_kwargs, boto3_session, pyarrow_additional_kwargs, execution_params, dtype_backend, client_request_token)
505 name = f"temp_table_{uuid.uuid4().hex}"
506 try:
--> 507 return _resolve_query_without_cache_ctas(
508 sql=sql,
509 database=database,
510 data_source=data_source,
511 s3_output=s3_output,
512 keep_files=keep_files,
513 chunksize=chunksize,
514 categories=categories,
515 encryption=encryption,
516 workgroup=workgroup,
517 kms_key=kms_key,
518 alt_database=ctas_database,
519 name=name,
520 ctas_bucketing_info=ctas_bucketing_info,
521 ctas_write_compression=ctas_write_compression,
522 athena_query_wait_polling_delay=athena_query_wait_polling_delay,
523 use_threads=use_threads,
524 s3_additional_kwargs=s3_additional_kwargs,
525 boto3_session=boto3_session,
526 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
527 execution_params=execution_params,
528 dtype_backend=dtype_backend,
529 )
530 finally:
531 catalog.delete_table_if_exists(database=ctas_database or database, table=name, boto3_session=boto3_session)
File /opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:345, in _resolve_query_without_cache_ctas(sql, database, data_source, s3_output, keep_files, chunksize, categories, encryption, workgroup, kms_key, alt_database, name, ctas_bucketing_info, ctas_write_compression, athena_query_wait_polling_delay, use_threads, s3_additional_kwargs, boto3_session, pyarrow_additional_kwargs, execution_params, dtype_backend)
343 ctas_query_metadata = cast(_QueryMetadata, ctas_query_info["ctas_query_metadata"])
344 _logger.debug("CTAS query metadata: %s", ctas_query_metadata)
--> 345 return _fetch_parquet_result(
346 query_metadata=ctas_query_metadata,
347 keep_files=keep_files,
348 categories=categories,
349 chunksize=chunksize,
350 use_threads=use_threads,
351 s3_additional_kwargs=s3_additional_kwargs,
352 boto3_session=boto3_session,
353 temp_table_fqn=fully_qualified_name,
354 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
355 dtype_backend=dtype_backend,
356 )
File /opt/conda/lib/python3.10/site-packages/awswrangler/athena/_read.py:156, in _fetch_parquet_result(query_metadata, keep_files, categories, chunksize, use_threads, boto3_session, s3_additional_kwargs, temp_table_fqn, pyarrow_additional_kwargs, dtype_backend)
154 pyarrow_additional_kwargs["categories"] = categories
155 _logger.debug("Reading Parquet result from %d paths", len(paths))
--> 156 ret = s3.read_parquet(
157 path=paths,
158 use_threads=use_threads,
159 boto3_session=boto3_session,
160 chunked=chunked,
161 pyarrow_additional_kwargs=pyarrow_additional_kwargs,
162 dtype_backend=dtype_backend,
163 )
165 if chunked is False:
166 ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
File /opt/conda/lib/python3.10/site-packages/awswrangler/_utils.py:178, in validate_kwargs.<locals>.decorator.<locals>.inner(*args, **kwargs)
175 if condition_fn() and len(passed_unsupported_kwargs) > 0:
176 raise exceptions.InvalidArgument(f"{message} `{', '.join(passed_unsupported_kwargs)}`.")
--> 178 return func(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/awswrangler/_config.py:715, in apply_configs.<locals>.wrapper(*args_raw, **kwargs)
713 del args[name]
714 args = {**args, **keywords}
--> 715 return function(**args)
File /opt/conda/lib/python3.10/site-packages/awswrangler/s3/_read_parquet.py:558, in read_parquet(path, path_root, dataset, path_suffix, path_ignore_suffix, ignore_empty, partition_filter, columns, validate_schema, coerce_int96_timestamp_unit, schema, last_modified_begin, last_modified_end, version_id, dtype_backend, chunked, use_threads, ray_args, boto3_session, s3_additional_kwargs, pyarrow_additional_kwargs, decryption_configuration)
543 if chunked:
544 return _read_parquet_chunked(
545 s3_client=s3_client,
546 paths=paths,
(...)
555 decryption_properties=decryption_properties,
556 )
--> 558 return _read_parquet(
559 paths,
560 path_root=path_root,
561 schema=schema,
562 columns=columns,
563 coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
564 use_threads=use_threads,
565 parallelism=ray_args.get("parallelism", -1),
566 s3_client=s3_client,
567 s3_additional_kwargs=s3_additional_kwargs,
568 arrow_kwargs=arrow_kwargs,
569 version_ids=version_ids,
570 bulk_read=bulk_read,
571 decryption_properties=decryption_properties,
572 )
File /opt/conda/lib/python3.10/site-packages/awswrangler/_distributed.py:105, in Engine.dispatch_on_engine.<locals>.wrapper(*args, **kw)
102 @wraps(func)
103 def wrapper(*args: Any, **kw: dict[str, Any]) -> Any:
104 cls.initialize(name=cls.get().value)
--> 105 return cls.dispatch_func(func)(*args, **kw)
File /opt/conda/lib/python3.10/site-packages/awswrangler/distributed/ray/modin/s3/_read_parquet.py:51, in _read_parquet_distributed(paths, path_root, schema, columns, coerce_int96_timestamp_unit, use_threads, parallelism, version_ids, s3_client, s3_additional_kwargs, arrow_kwargs, bulk_read, decryption_properties)
48 if decryption_properties:
49 dataset_kwargs["decryption_properties"] = decryption_properties
---> 51 dataset = read_datasource(
52 **_resolve_datasource_parameters(
53 bulk_read,
54 paths=paths,
55 path_root=path_root,
56 arrow_parquet_args={
57 "use_threads": use_threads,
58 "schema": schema,
59 "columns": columns,
60 "dataset_kwargs": dataset_kwargs,
61 },
62 ),
63 parallelism=parallelism,
64 )
65 return _to_modin(
66 dataset=dataset,
67 to_pandas_kwargs=arrow_kwargs,
68 ignore_index=arrow_kwargs.get("ignore_metadata"),
69 )
File /opt/conda/lib/python3.10/site-packages/ray/_private/auto_init_hook.py:21, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
18 @wraps(fn)
19 def auto_init_wrapper(*args, **kwargs):
20 auto_init_ray()
---> 21 return fn(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/ray/data/read_api.py:399, in read_datasource(datasource, parallelism, ray_remote_args, concurrency, override_num_blocks, **read_args)
389 requested_parallelism, _, inmemory_size = _autodetect_parallelism(
390 parallelism,
391 ctx.target_max_block_size,
(...)
394 placement_group=cur_pg,
395 )
397 # TODO(hchen/chengsu): Remove the duplicated get_read_tasks call here after
398 # removing LazyBlockList code path.
--> 399 read_tasks = datasource_or_legacy_reader.get_read_tasks(requested_parallelism)
401 read_op_name = f"Read{datasource.get_name()}"
403 block_list = LazyBlockList(
404 read_tasks,
405 read_op_name=read_op_name,
406 ray_remote_args=ray_remote_args,
407 owned_by_consumer=False,
408 )
File /opt/conda/lib/python3.10/site-packages/awswrangler/distributed/ray/datasources/arrow_parquet_datasource.py:341, in ArrowParquetDatasource.get_read_tasks(self, parallelism)
338 if len(fragments) <= 0:
339 continue
--> 341 meta = self._meta_provider(
342 paths, # type: ignore[arg-type]
343 self._inferred_schema,
344 num_fragments=len(fragments),
345 prefetched_metadata=metadata,
346 )
347 # If there is a filter operation, reset the calculated row count,
348 # since the resulting row count is unknown.
349 if self._arrow_parquet_args.get("filter") is not None:
File /opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:70, in FileMetadataProvider.__call__(self, paths, schema, **kwargs)
64 def __call__(
65 self,
66 paths: List[str],
67 schema: Optional[Union[type, "pyarrow.lib.Schema"]],
68 **kwargs,
69 ) -> BlockMetadata:
---> 70 return self._get_block_metadata(paths, schema, **kwargs)
File /opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:309, in DefaultParquetMetadataProvider._get_block_metadata(self, paths, schema, num_fragments, prefetched_metadata)
292 def _get_block_metadata(
293 self,
294 paths: List[str],
(...)
298 prefetched_metadata: Optional[List["_ParquetFileFragmentMetaData"]],
299 ) -> BlockMetadata:
300 if (
301 prefetched_metadata is not None
302 and len(prefetched_metadata) == num_fragments
(...)
305 # Fragment metadata was available, construct a normal
306 # BlockMetadata.
307 block_metadata = BlockMetadata(
308 num_rows=sum(m.num_rows for m in prefetched_metadata),
--> 309 size_bytes=sum(m.total_byte_size for m in prefetched_metadata),
310 schema=schema,
311 input_files=paths,
312 exec_stats=None,
313 ) # Exec stats filled in later.
314 else:
315 # Fragment metadata was not available, construct an empty
316 # BlockMetadata.
317 block_metadata = BlockMetadata(
318 num_rows=None,
319 size_bytes=None,
(...)
322 exec_stats=None,
323 )
File /opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:309, in <genexpr>(.0)
292 def _get_block_metadata(
293 self,
294 paths: List[str],
(...)
298 prefetched_metadata: Optional[List["_ParquetFileFragmentMetaData"]],
299 ) -> BlockMetadata:
300 if (
301 prefetched_metadata is not None
302 and len(prefetched_metadata) == num_fragments
(...)
305 # Fragment metadata was available, construct a normal
306 # BlockMetadata.
307 block_metadata = BlockMetadata(
308 num_rows=sum(m.num_rows for m in prefetched_metadata),
--> 309 size_bytes=sum(m.total_byte_size for m in prefetched_metadata),
310 schema=schema,
311 input_files=paths,
312 exec_stats=None,
313 ) # Exec stats filled in later.
314 else:
315 # Fragment metadata was not available, construct an empty
316 # BlockMetadata.
317 block_metadata = BlockMetadata(
318 num_rows=None,
319 size_bytes=None,
(...)
322 exec_stats=None,
323 )
AttributeError: 'pyarrow._parquet.FileMetaData' object has no attribute 'total_byte_size'
### How to Reproduce
Below is a code snippet to reproduce the error:
!pip install awswrangler[ray,modin]
import modin.pandas as pd
import awswrangler as wr
wr.engine.initialize()
df = wr.athena.read_sql_query('select count(distinct ti_cu_customer_id) as num_customers from loans', database='my-db', workgroup='my-workgroup')
### Expected behavior
_No response_
### Your project
_No response_
### Screenshots
_No response_
### OS
Linux
### Python version
3.10
### AWS SDK for pandas version
3.7.3
### Additional context
_No response_
Hi @leo4ever, can you share the output of pip list please? (pyarrow/ray/modin versions are mainly what I'm looking for)
@kukushking - Sorry for the delay. Please find the output of pip list below:
`Package Version
aiohttp 3.9.5 aiohttp-cors 0.7.0 aiosignal 1.3.1 alabaster 0.7.16 annotated-types 0.6.0 anyio 4.3.0 appdirs 1.4.4 archspec 0.2.2 argon2-cffi 23.1.0 argon2-cffi-bindings 21.2.0 arrow 1.3.0 asgiref 3.8.1 astroid 2.15.8 astropy 6.0.1 astropy-iers-data 0.2024.4.15.2.45.49 asttokens 2.4.1 async-lru 2.0.4 async-timeout 4.0.3 atomicwrites 1.4.1 attrs 23.2.0 Authlib 1.3.0 Automat 22.10.0 autopep8 2.0.4 autovizwidget 0.20.4 awscli 1.32.84 awswrangler 3.7.3 Babel 2.14.0 bcrypt 4.1.2 beautifulsoup4 4.12.3 binaryornot 0.4.4 black 24.4.0 bleach 6.1.0 blinker 1.7.0 bokeh 3.4.1 boltons 23.1.1 boto3 1.34.84 botocore 1.34.84 Brotli 1.1.0 brotlipy 0.7.0 cached-property 1.5.2 cachetools 5.3.3 certifi 2024.2.2 cffi 1.16.0 chardet 5.2.0 charset-normalizer 3.3.2 click 8.1.7 cloudpickle 2.2.1 colorama 0.4.4 colorcet 3.1.0 colorful 0.5.6 comm 0.2.2 conda 23.11.0 conda-content-trust 0.2.0 conda-libmamba-solver 23.12.0 conda-package-handling 2.2.0 conda_package_streaming 0.9.0 constantly 15.1.0 contextlib2 21.6.0 contourpy 1.2.1 cookiecutter 2.6.0 cryptography 42.0.5 cycler 0.12.1 cytoolz 0.12.3 daal4py 2024.3.0 dask 2024.4.1 dask-expr 1.0.11 debugpy 1.8.1 decorator 5.1.1 defusedxml 0.7.1 diff-match-patch 20230430 dill 0.3.8 distlib 0.3.8 distributed 2024.4.1 distro 1.8.0 Django 5.0.4 docker 6.1.3 docstring-to-markdown 0.15 docutils 0.16 dparse 0.6.4b0 entrypoints 0.4 et-xmlfile 1.1.0 exceptiongroup 1.2.0 executing 2.0.1 fastapi 0.110.1 fastjsonschema 2.19.1 filelock 3.13.4 flake8 7.0.0 Flask 2.3.3 fonttools 4.51.0 fqdn 1.5.1 frozenlist 1.4.1 fsspec 2024.3.1 future 1.0.0 gevent 23.9.0.post1 gmpy2 2.1.2 google-api-core 2.19.0 google-auth 2.29.0 google-pasta 0.2.0 googleapis-common-protos 1.63.1 greenlet 3.0.3 grpcio 1.64.1 gssapi 1.8.3 h11 0.14.0 h2 4.1.0 h5py 3.11.0 hdijupyterutils 0.20.4 holoviews 1.18.3 hpack 4.0.0 httpcore 1.0.5 httpx 0.27.0 hyperframe 6.0.1 hyperlink 21.0.0 idna 3.6 imagecodecs 2024.1.1 imageio 2.34.0 imagesize 1.4.1 importlib-metadata 6.11.0 importlib_resources 6.4.0 incremental 22.10.0 inflection 0.5.1 iniconfig 2.0.0 intervaltree 3.1.0 ipykernel 6.29.3 ipython 8.23.0 ipython-genutils 0.2.0 ipywidgets 7.6.5 isoduration 20.11.0 isort 5.13.2 itsdangerous 2.1.2 jaraco.classes 3.4.0 jaraco.context 4.3.0 jaraco.functools 4.0.0 jedi 0.19.1 jeepney 0.8.0 jellyfish 1.0.3 Jinja2 3.1.3 jmespath 1.0.1 joblib 1.4.0 json5 0.9.25 jsonpatch 1.33 jsonpointer 2.4 jsonschema 4.21.1 jsonschema-specifications 2023.12.1 jupyter 1.0.0 jupyter_client 7.4.9 jupyter-console 6.6.3 jupyter_core 5.7.2 jupyter-events 0.10.0 jupyter-lsp 2.2.5 jupyter_server 2.14.0 jupyter_server_terminals 0.5.3 jupyterlab 4.1.6 jupyterlab_pygments 0.3.0 jupyterlab_server 2.26.0 jupyterlab_widgets 3.0.10 keyring 25.1.0 kiwisolver 1.4.5 krb5 0.5.1 lazy-object-proxy 1.10.0 libmambapy 1.5.5 lief 0.14.1 linkify-it-py 2.0.3 llvmlite 0.42.0 locket 1.0.0 lxml 5.1.0 lz4 4.3.3 mamba 1.5.5 Markdown 3.6 markdown-it-py 3.0.0 MarkupSafe 2.1.5 marshmallow 3.21.1 matplotlib 3.8.4 matplotlib-inline 0.1.6 mccabe 0.7.0 mdit-py-plugins 0.4.0 mdurl 0.1.2 memray 1.12.0 menuinst 2.0.1 mistune 0.8.4 mock 5.1.0 modin 0.26.1 more-itertools 10.2.0 mpmath 1.3.0 msgpack 1.0.7 multidict 6.0.5 multiprocess 0.70.16 munkres 1.1.4 mypy-extensions 1.0.0 nb_conda_kernels 2.3.1 nbclassic 1.0.0 nbclient 0.10.0 nbconvert 6.5.3 nbformat 5.10.4 nest-asyncio 1.5.5 nltk 3.8.1 nose 1.3.7 notebook 6.5.6 notebook_shim 0.2.4 numba 0.59.1 numexpr 2.9.0 numpy 1.26.4 numpydoc 1.7.0 opencensus 0.11.4 opencensus-context 0.1.3 openpyxl 3.1.2 overrides 7.7.0 packaging 23.2 pandas 2.1.4 
pandocfilters 1.5.0 panel 1.4.1 papermill 2.5.0 param 2.1.0 parso 0.8.4 partd 1.4.1 pathos 0.3.2 pathspec 0.12.1 patsy 0.5.6 pexpect 4.9.0 pickleshare 0.7.5 pillow 10.3.0 pip 24.0 pkgutil_resolve_name 1.3.10 platformdirs 4.1.0 plotly 5.19.0 pluggy 1.4.0 ply 3.11 pox 0.3.4 ppft 1.7.6.8 prometheus_client 0.20.0 prompt-toolkit 3.0.42 proto-plus 1.23.0 protobuf 4.25.3 psutil 5.9.8 ptyprocess 0.7.0 pure-eval 0.2.2 pure-sasl 0.6.2 py-spy 0.3.14 pyarrow 15.0.2 pyarrow-hotfix 0.6 pyasn1 0.6.0 pyasn1_modules 0.4.0 pycodestyle 2.11.1 pycosat 0.6.6 pycparser 2.21 pydantic 2.7.0 pydantic_core 2.18.1 pydocstyle 6.3.0 pyerfa 2.0.1.4 pyflakes 3.2.0 pyfunctional 1.5.0 Pygments 2.17.2 PyHive 0.7.0 pylint 2.17.7 pylint-venv 3.0.3 pyls-spyder 0.4.0 pyodbc 5.1.0 pyOpenSSL 24.0.0 pyparsing 3.1.2 PyQt5 5.15.9 PyQt5-sip 12.12.2 PyQtWebEngine 5.15.4 PySocks 1.7.1 pyspnego 0.9.1 pytest 8.1.1 python-dateutil 2.9.0 python-json-logger 2.0.7 python-lsp-black 2.0.0 python-lsp-jsonrpc 1.1.2 python-lsp-server 1.10.1 python-slugify 8.0.4 pytoolconfig 1.2.5 pytz 2024.1 pyviz_comms 3.0.1 pyxdg 0.28 PyYAML 6.0.1 pyzmq 24.0.1 QDarkStyle 3.2.3 qstylizer 0.2.2 QtAwesome 1.2.3 qtconsole 5.5.1 QtPy 2.4.1 ray 2.23.0 referencing 0.34.0 regex 2023.12.25 requests 2.31.0 requests-kerberos 0.14.0 rfc3339-validator 0.1.4 rfc3986-validator 0.1.1 rich 13.7.1 rope 1.13.0 rpds-py 0.18.0 rsa 4.7.2 Rtree 1.2.0 ruamel.yaml 0.18.5 ruamel.yaml.clib 0.2.7 ruamel-yaml-conda 0.15.80 s3fs 0.4.2 s3transfer 0.10.1 safety-schemas 0.0.2 sagemaker 2.215.0 sagemaker-data-insights 0.3.3 sagemaker-datawrangler 0.4.3 sagemaker-headless-execution-driver 0.0.13 sagemaker-scikit-learn-extension 2.5.0 sagemaker-studio-analytics-extension 0.0.20 sagemaker-studio-sparkmagic-lib 0.1.4 sasl 0.3.1 schema 0.7.5 scikit-learn 1.4.2 scipy 1.13.0 seaborn 0.13.2 SecretStorage 3.3.3 Send2Trash 1.8.3 service-identity 21.1.0 setuptools 69.5.1 shellingham 1.5.4 sip 6.7.12 six 1.16.0 smart-open 7.0.4 smclarify 0.5 smdebug-rulesconfig 1.0.1 sniffio 1.3.1 snowballstemmer 2.2.0 sortedcontainers 2.4.0 soupsieve 2.5 sparkmagic 0.20.4 Sphinx 7.2.6 sphinxcontrib-applehelp 1.0.8 sphinxcontrib-devhelp 1.0.6 sphinxcontrib-htmlhelp 2.0.5 sphinxcontrib-jsmath 1.0.1 sphinxcontrib-qthelp 1.0.7 sphinxcontrib-serializinghtml 1.1.10 spyder 5.5.3 spyder-kernels 2.5.1 SQLAlchemy 2.0.29 sqlparse 0.4.4 stack-data 0.6.2 starlette 0.37.2 statsmodels 0.14.1 sympy 1.12 tabulate 0.9.0 tblib 3.0.0 tenacity 8.2.3 terminado 0.18.1 text-unidecode 1.3 textdistance 4.5.0 textual 0.64.0 threadpoolctl 3.4.0 three-merge 0.1.1 thrift 0.20.0 thrift-sasl 0.4.3 tinycss2 1.2.1 toml 0.10.2 tomli 2.0.1 tomlkit 0.12.4 toolz 0.12.1 tornado 6.4 tqdm 4.66.1 traitlets 5.14.2 truststore 0.8.0 Twisted 24.3.0 typer 0.12.3 types-python-dateutil 2.9.0.20240316 typing_extensions 4.11.0 typing-utils 0.1.0 tzdata 2024.1 uc-micro-py 1.0.3 ujson 5.9.0 unicodedata2 15.1.0 uri-template 1.3.0 urllib3 2.2.1 virtualenv 20.26.2 w3lib 2.1.2 watchdog 4.0.0 wcwidth 0.2.13 webcolors 1.13 webencodings 0.5.1 websocket-client 1.7.0 Werkzeug 2.3.8 whatthepatch 1.0.5 wheel 0.43.0 widgetsnbextension 3.5.2 wrapt 1.16.0 wurlitzer 3.0.3 xlrd 2.0.1 xyzservices 2024.4.0 yapf 0.40.1 yarl 1.9.4 zict 3.0.0 zipp 3.17.0 zope.event 5.0 zope.interface 6.3 zstandard 0.22.0`
@leo4ever The error you're encountering appears to be due to an incompatibility between pyarrow and the libraries awswrangler, modin, and ray. Specifically, the pyarrow._parquet.FileMetaData object has no total_byte_size attribute, but Ray's Parquet metadata provider expects one on each prefetched metadata object when awswrangler reads the Athena results through modin.
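For reference, total_byte_size exists in pyarrow at the row-group level rather than on FileMetaData itself, and the type hint in the traceback shows the provider expects Ray's _ParquetFileFragmentMetaData wrappers rather than raw FileMetaData objects. Below is a minimal sketch to confirm where the attribute lives; the local file path is a placeholder:
import pyarrow.parquet as pq
# Any local Parquet file works here -- "example.parquet" is just a placeholder.
meta = pq.ParquetFile("example.parquet").metadata
print(type(meta))                        # <class 'pyarrow._parquet.FileMetaData'>
print(hasattr(meta, "total_byte_size"))  # False -- the attribute Ray is looking for
# The size information does exist, but per row group:
print(sum(meta.row_group(i).total_byte_size for i in range(meta.num_row_groups)))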
The following steps might help:
- Check version compatibility: ensure that the versions of awswrangler, pyarrow, modin, and ray are compatible with each other (the version-check sketch after the install commands below can help capture what is currently installed).
!pip install -U awswrangler[ray,modin]
!pip install -U pyarrow
!pip install -U modin[ray]
!pip install -U ray
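A quick way to record the versions that matter for this issue (a minimal sketch; importlib.metadata is part of the standard library on Python 3.10):
from importlib.metadata import version
# Print the packages most relevant to this report so they can be compared
# against the awswrangler release notes and dependency constraints.
for pkg in ("awswrangler", "pyarrow", "ray", "modin", "pandas"):
    print(f"{pkg}=={version(pkg)}")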
- Use regular pandas for Athena queries: since the issue is specific to the modin/Ray read path, you can run the query with regular pandas (which avoids the distributed Parquet reader entirely) and then convert the DataFrame to modin if required:
import pandas as pd
import awswrangler as wr
df_pandas = wr.athena.read_sql_query('select count(distinct ti_cu_customer_id) as num_customers from loans', database='my-db', workgroup='my-workgroup')
import modin.pandas as mpd
df_modin = mpd.DataFrame(df_pandas)
Or, if you want to stick with the modin approach, a third option is to modify the _fetch_parquet_result method:
# Sketch of a modification to _fetch_parquet_result in awswrangler/athena/_read.py
# (the file shown in the traceback). The attribute paths into modin internals
# below are assumptions and may not match the installed modin version.
import modin.pandas

def _fetch_parquet_result(query_metadata, keep_files, categories, chunksize, use_threads,
                          boto3_session, s3_additional_kwargs, temp_table_fqn,
                          pyarrow_additional_kwargs, dtype_backend):
    paths = query_metadata["OutputLocation"]  # assumed source of the result paths
    chunked = chunksize is not None  # assumed; the original derives `chunked` from `chunksize`
    arrow_kwargs = pyarrow_additional_kwargs or {}
    if categories:
        arrow_kwargs["categories"] = categories
    ret = s3.read_parquet(  # module-level import already used by awswrangler itself
        path=paths,
        use_threads=use_threads,
        boto3_session=boto3_session,
        chunked=chunked,
        pyarrow_additional_kwargs=arrow_kwargs,
        dtype_backend=dtype_backend,
    )
    # Workaround for the missing 'total_byte_size' attribute: fall back to 0 when a
    # partition's metadata does not expose it. Note that `_data`, `_modin_frame` and
    # `_partitions` are private modin attributes.
    if isinstance(ret, modin.pandas.DataFrame):
        ret._data.total_byte_size = lambda: sum(
            m.metadata.total_byte_size if hasattr(m.metadata, "total_byte_size") else 0
            for m in ret._data._modin_frame._partitions
        )
    if chunked is False:
        ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
    return ret
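Alternatively, if you only need this query without the distributed read path, it may be possible to switch awswrangler back to the single-node engine for the call. This is a sketch assuming the wr.engine.set / wr.memory_format.set switches described in the "at scale" documentation are available in your build; they should be applied before the Ray engine is initialized in the session:
import awswrangler as wr

# Force the non-distributed engine and pandas memory format (assumed API from the docs).
wr.engine.set("python")
wr.memory_format.set("pandas")

df = wr.athena.read_sql_query(
    "select count(distinct ti_cu_customer_id) as num_customers from loans",
    database="my-db",
    workgroup="my-workgroup",
)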
Hope this helps. Please let me know if it works. Thanks!