fixinventory
fixinventory copied to clipboard
Cloud2SQL Parquet error with AWS API Gateway
Description
The API gateway produces the following type of output
[
{
'id': 'XXXXX', 'tags': [
('Environment', 'prod')
],
'name': 'prod',
'ctime': '2000-01-01T01:01:00Z',
'mtime': '2000-01-01T01:02:00Z',
'description': 'Prod',
'stage_cache_cluster_enabled': False,
'stage_cache_cluster_size': '0.5',
'stage_cache_status': 'NOT_AVAILABLE',
'stage_method_settings': [
('*/*', {
'metricsEnabled': True, 'loggingLevel': 'INFO', 'dataTraceEnabled': True,
'throttlingBurstLimit': 2000, 'throttlingRateLimit': 1000.0, 'cachingEnabled': False,
'cacheTtlInSeconds': 300, 'cacheDataEncrypted': False, 'requireAuthorizationForCacheControl': True,
'unauthorizedCacheControlHeaderStrategy': 'SUCCEED_WITH_RESPONSE_HEADER'
})
],
Currently, this causes an exception like so:
{"timestamp": "2024-02-02T15:50:55", "level": "ERROR", "message": "An error occurred", "pid": 1983677, "thread": "MainThread", "process": "resoto.cloud2sql", "exception": "Traceback (most recent call last):
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/cloud2sql/collect.py\", line 194, in collect_from_plugins
name, nodes, edges = future.result()
^^^^^^^^^^^^^^^
File \"/usr/lib/python3.11/concurrent/futures/_base.py\", line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File \"/usr/lib/python3.11/concurrent/futures/_base.py\", line 401, in __get_result
raise self._exception
File \"/usr/lib/python3.11/concurrent/futures/thread.py\", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/cloud2sql/collect.py\", line 162, in collect
return collect_to_file(collector, feedback, config[\"destinations\"][\"arrow\"])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/resotodatalink/collect_plugins.py\", line 61, in collect_to_file
model.create_schema(list(edges_by_kind))
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/resotodatalink/arrow/model.py\", line 64, in create_schema
table_schema(kind)
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/resotodatalink/arrow/model.py\", line 38, in table_schema
*[pa.field(p.name, self.pyarrow_type(p.kind)) for p in properties],
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/resotodatalink/arrow/model.py\", line 38, in <listcomp>
*[pa.field(p.name, self.pyarrow_type(p.kind)) for p in properties],
^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/resotodatalink/arrow/type_converter.py\", line 10, in parquet_pyarrow_type
(key_kind, value_kind) = kind.strip(\"dictionary\").strip(\"[]\").split(\",\")
Looking at the data types, this is resolving to the following:
Converting dictionary[string, dictionary[string, dictionary[string, any]]]
This can be reworked as follows (in `resotodatalink/arrow/type_converter.py`) to make it possible to map a map of maps:
elif kind.startswith("dictionary"):
try:
(key_kind, value_kind) = kind.strip("dictionary").split(",", 1) # type: ignore (key_kind, value_kind) = kind.strip("dictionary").strip("[]").split(",")
key_kind = key_kind.strip('[]').strip()
value_kind = value_kind.strip()
if not value_kind.startswith("dictionary"):
value_kind = value_kind.strip("[]").strip()
return pa.map_(parquet_pyarrow_type(key_kind, model), parquet_pyarrow_type(value_kind, model))
except ValueError:
print(kind.strip("dictionary").strip("[]").split(","))
raise Exception(f"Invalid dictionary kind {kind}")
(I also notice that the `kind == 'float'` branch is not returning `pa.float32()`; it simply calls the function but lacks the `return`.)
This works for the first problem, but then leads to a later error from pyarrow:
.env/lib/python3.11/site-packages/cloud2sql/collect.py\", line 194, in collect_from_plugins
name, nodes, edges = future.result()
^^^^^^^^^^^^^^^
File \"/usr/lib/python3.11/concurrent/futures/_base.py\", line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File \"/usr/lib/python3.11/concurrent/futures/_base.py\", line 401, in __get_result
raise self._exception
File \"/usr/lib/python3.11/concurrent/futures/thread.py\", line 58, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/cloud2sql/collect.py\", line 162, in collect
return collect_to_file(collector, feedback, config[\"destinations\"][\"arrow\"])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/resotodatalink/collect_plugins.py\", line 78, in collect_to_file
writer.close()
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/resotodatalink/arrow/writer.py\", line 302, in close
batch = write_batch_to_file(batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"/home/greg/analysis/.env/lib/python3.11/site-packages/resotodatalink/arrow/writer.py\", line 177, in write_batch_to_file
pa_table = pa.Table.from_pylist(batch.rows, batch.schema)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File \"pyarrow/table.pxi\", line 3906, in pyarrow.lib.Table.from_pylist
File \"pyarrow/table.pxi\", line 5460, in pyarrow.lib._from_pylist
File \"pyarrow/table.pxi\", line 3781, in pyarrow.lib.Table.from_arrays
File \"pyarrow/table.pxi\", line 1449, in pyarrow.lib._sanitize_arrays
File \"pyarrow/array.pxi\", line 357, in pyarrow.lib.asarray
File \"pyarrow/array.pxi\", line 327, in pyarrow.lib.array
File \"pyarrow/array.pxi\", line 39, in pyarrow.lib._sequence_to_array
File \"pyarrow/error.pxi\", line 144, in pyarrow.lib.pyarrow_internal_check_status
File \"pyarrow/error.pxi\", line 123, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: Expected bytes, got a 'bool' object"}
... At this point I have to admit I gave up. I am not sure what the solution is, but it might be beneficial either to ignore this data or to convert it into a set of strings.
I suspect the problematic part is:
'stage_method_settings': [
('*/*', {
'metricsEnabled': True, 'loggingLevel': 'INFO', 'dataTraceEnabled': True,
'throttlingBurstLimit': 2000, 'throttlingRateLimit': 1000.0, 'cachingEnabled': False,
'cacheTtlInSeconds': 300, 'cacheDataEncrypted': False, 'requireAuthorizationForCacheControl': True,
'unauthorizedCacheControlHeaderStrategy': 'SUCCEED_WITH_RESPONSE_HEADER'
})
For now, I am skipping api_gateway resources.
Version
latest from pip as per cloud2sql
Thanks, @GregBowyer, for the report. We will take a look.