trino
trino copied to clipboard
HIVE_CANNOT_OPEN_SPLIT when reading AVRO with decimal fields.
I ran the query on a Trino engine and by omitting decimal
fields the query works. It used to work fine before 21 Apr, 2024. I suspect some changes
the query is very simple
SELECT * FROM test_table LIMIT 1 -- does NOT work
while
-- does WORK
SELECT
_timestamp
, driverid
, course
, horizontalaccuracy
, locationtimestamp
, provider
, speed
-- , loc_lat
-- , loc_lon
-- , raw_loc_lat
-- , raw_loc_lon
, raw_course
, is_interpolated
, is_cold_start
, loc_provider
, course_provider
, vehicleid
, vcuid
, plate
, infoupdatedat
, _id
, _index
, synced_at
, load_date
, driveareaid
, data_date
, data_hour
FROM test_table
LIMIT 1
The error message is
HIVE_CANNOT_OPEN_SPLIT: Avro type resolution error when initializing split from s3://....
Attached link is the sample avro files https://drive.google.com/file/d/1h-jj8SjW4Ob9Xcf-hesQbhM6HL7GbnCE/view?usp=sharing
and the table is created using this query
CREATE EXTERNAL TABLE `test_table`(
`_timestamp` timestamp COMMENT 'from deserializer',
`driverid` string COMMENT 'from deserializer',
`course` double COMMENT 'from deserializer',
`horizontalaccuracy` double COMMENT 'from deserializer',
`locationtimestamp` timestamp COMMENT 'from deserializer',
`provider` string COMMENT 'from deserializer',
`speed` double COMMENT 'from deserializer',
`loc_lat` binary COMMENT 'from deserializer',
`loc_lon` binary COMMENT 'from deserializer',
`raw_loc_lat` binary COMMENT 'from deserializer',
`raw_loc_lon` binary COMMENT 'from deserializer',
`raw_course` double COMMENT 'from deserializer',
`is_interpolated` int COMMENT 'from deserializer',
`is_cold_start` int COMMENT 'from deserializer',
`loc_provider` string COMMENT 'from deserializer',
`course_provider` string COMMENT 'from deserializer',
`vehicleid` string COMMENT 'from deserializer',
`vcuid` string COMMENT 'from deserializer',
`plate` string COMMENT 'from deserializer',
`infoupdatedat` timestamp COMMENT 'from deserializer',
`_id` string COMMENT 'from deserializer',
`_index` string COMMENT 'from deserializer',
`synced_at` timestamp COMMENT 'from deserializer')
PARTITIONED BY (
`load_date` string,
`driveareaid` string,
`data_date` string,
`data_hour` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
WITH SERDEPROPERTIES (
'avro.schema.literal'='{\"type\":\"record\",\"name\":\"topLevelRecord\",\"fields\":[{\"name\":\"_timestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"driverId\",\"type\":[\"string\",\"null\"]},{\"name\":\"course\",\"type\":[\"double\",\"null\"]},{\"name\":\"horizontalAccuracy\",\"type\":[\"double\",\"null\"]},{\"name\":\"locationTimestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"speed\",\"type\":[\"double\",\"null\"]},{\"name\":\"loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_course\",\"type\":[\"double\",\"null\"]},{\"name\":\"is_interpolated\",\"type\":[\"int\",\"null\"]},{\"name\":\"is_cold_start\",\"type\":[\"int\",\"null\"]},{\"name\":\"loc_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"course_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"vehicleId\",\"type\":[\"string\",\"null\"]},{\"name\":\"vcuId\",\"type\":[\"string\",\"null\"]},{\"name\":\"plate\",\"type\":[\"string\",\"null\"]},{\"name\":\"infoUpdatedAt\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"_id\",\"type\":[\"string\",\"null\"]},{\"name\":\"_index\",\"type\":[\"string\",\"null\"]},{\"name\":\"synced_at\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]}]}')
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION
's3://test_bucket/test_table/'
TBLPROPERTIES (
'CrawlerSchemaDeserializerVersion'='1.0',
'CrawlerSchemaSerializerVersion'='1.0',
'UPDATED_BY_CRAWLER'='test_table',
'averageRecordSize'='193',
'avro.schema.literal'='{\"type\":\"record\",\"name\":\"topLevelRecord\",\"fields\":[{\"name\":\"_timestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"driverId\",\"type\":[\"string\",\"null\"]},{\"name\":\"course\",\"type\":[\"double\",\"null\"]},{\"name\":\"horizontalAccuracy\",\"type\":[\"double\",\"null\"]},{\"name\":\"locationTimestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"speed\",\"type\":[\"double\",\"null\"]},{\"name\":\"loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_course\",\"type\":[\"double\",\"null\"]},{\"name\":\"is_interpolated\",\"type\":[\"int\",\"null\"]},{\"name\":\"is_cold_start\",\"type\":[\"int\",\"null\"]},{\"name\":\"loc_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"course_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"vehicleId\",\"type\":[\"string\",\"null\"]},{\"name\":\"vcuId\",\"type\":[\"string\",\"null\"]},{\"name\":\"plate\",\"type\":[\"string\",\"null\"]},{\"name\":\"infoUpdatedAt\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"_id\",\"type\":[\"string\",\"null\"]},{\"name\":\"_index\",\"type\":[\"string\",\"null\"]},{\"name\":\"synced_at\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]}]}',
'classification'='avro',
'compressionType'='none',
'objectCount'='42313',
'partition_filtering.enabled'='true',
'recordCount'='722328740',
'sizeKey'='159416670061',
'typeOfData'='file')
cc @jklamer
@calloc any chance you have the stack trace for the Avro type resolution error
exception? That should give us an idea what the issue may be.