trino icon indicating copy to clipboard operation
trino copied to clipboard

HIVE_CANNOT_OPEN_SPLIT when reading AVRO with decimal fields.

Open calloc opened this issue 9 months ago • 2 comments

I ran the query on a Trino engine and by omitting decimal fields the query works. It used to work fine before 21 Apr, 2024. I suspect some changes

the query is very simple

SELECT * FROM test_table LIMIT 1 -- does NOT work 

while

-- does WORK
SELECT
    _timestamp 
    , driverid
    , course
    , horizontalaccuracy
    , locationtimestamp
    , provider
    , speed
    -- , loc_lat
    -- , loc_lon
    -- , raw_loc_lat
    -- , raw_loc_lon
    , raw_course
    , is_interpolated
    , is_cold_start
    , loc_provider
    , course_provider
    , vehicleid
    , vcuid
    , plate
    , infoupdatedat
    , _id
    , _index
    , synced_at
    , load_date
    , driveareaid
    , data_date
    , data_hour
FROM test_table 
LIMIT 1

The error message is

HIVE_CANNOT_OPEN_SPLIT: Avro type resolution error when initializing split from s3://....

Attached link is the sample avro files https://drive.google.com/file/d/1h-jj8SjW4Ob9Xcf-hesQbhM6HL7GbnCE/view?usp=sharing

and the table is created using this query

CREATE EXTERNAL TABLE `test_table`(
  `_timestamp` timestamp COMMENT 'from deserializer', 
  `driverid` string COMMENT 'from deserializer', 
  `course` double COMMENT 'from deserializer', 
  `horizontalaccuracy` double COMMENT 'from deserializer', 
  `locationtimestamp` timestamp COMMENT 'from deserializer', 
  `provider` string COMMENT 'from deserializer', 
  `speed` double COMMENT 'from deserializer', 
  `loc_lat` binary COMMENT 'from deserializer', 
  `loc_lon` binary COMMENT 'from deserializer', 
  `raw_loc_lat` binary COMMENT 'from deserializer', 
  `raw_loc_lon` binary COMMENT 'from deserializer', 
  `raw_course` double COMMENT 'from deserializer', 
  `is_interpolated` int COMMENT 'from deserializer', 
  `is_cold_start` int COMMENT 'from deserializer', 
  `loc_provider` string COMMENT 'from deserializer', 
  `course_provider` string COMMENT 'from deserializer', 
  `vehicleid` string COMMENT 'from deserializer', 
  `vcuid` string COMMENT 'from deserializer', 
  `plate` string COMMENT 'from deserializer', 
  `infoupdatedat` timestamp COMMENT 'from deserializer', 
  `_id` string COMMENT 'from deserializer', 
  `_index` string COMMENT 'from deserializer', 
  `synced_at` timestamp COMMENT 'from deserializer')
PARTITIONED BY ( 
  `load_date` string, 
  `driveareaid` string, 
  `data_date` string, 
  `data_hour` string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.avro.AvroSerDe' 
WITH SERDEPROPERTIES ( 
  'avro.schema.literal'='{\"type\":\"record\",\"name\":\"topLevelRecord\",\"fields\":[{\"name\":\"_timestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"driverId\",\"type\":[\"string\",\"null\"]},{\"name\":\"course\",\"type\":[\"double\",\"null\"]},{\"name\":\"horizontalAccuracy\",\"type\":[\"double\",\"null\"]},{\"name\":\"locationTimestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"speed\",\"type\":[\"double\",\"null\"]},{\"name\":\"loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_course\",\"type\":[\"double\",\"null\"]},{\"name\":\"is_interpolated\",\"type\":[\"int\",\"null\"]},{\"name\":\"is_cold_start\",\"type\":[\"int\",\"null\"]},{\"name\":\"loc_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"course_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"vehicleId\",\"type\":[\"string\",\"null\"]},{\"name\":\"vcuId\",\"type\":[\"string\",\"null\"]},{\"name\":\"plate\",\"type\":[\"string\",\"null\"]},{\"name\":\"infoUpdatedAt\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"_id\",\"type\":[\"string\",\"null\"]},{\"name\":\"_index\",\"type\":[\"string\",\"null\"]},{\"name\":\"synced_at\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]}]}') 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION
  's3://test_bucket/test_table/'
TBLPROPERTIES (
  'CrawlerSchemaDeserializerVersion'='1.0', 
  'CrawlerSchemaSerializerVersion'='1.0', 
  'UPDATED_BY_CRAWLER'='test_table', 
  'averageRecordSize'='193', 
  'avro.schema.literal'='{\"type\":\"record\",\"name\":\"topLevelRecord\",\"fields\":[{\"name\":\"_timestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"driverId\",\"type\":[\"string\",\"null\"]},{\"name\":\"course\",\"type\":[\"double\",\"null\"]},{\"name\":\"horizontalAccuracy\",\"type\":[\"double\",\"null\"]},{\"name\":\"locationTimestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"speed\",\"type\":[\"double\",\"null\"]},{\"name\":\"loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_course\",\"type\":[\"double\",\"null\"]},{\"name\":\"is_interpolated\",\"type\":[\"int\",\"null\"]},{\"name\":\"is_cold_start\",\"type\":[\"int\",\"null\"]},{\"name\":\"loc_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"course_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"vehicleId\",\"type\":[\"string\",\"null\"]},{\"name\":\"vcuId\",\"type\":[\"string\",\"null\"]},{\"name\":\"plate\",\"type\":[\"string\",\"null\"]},{\"name\":\"infoUpdatedAt\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"_id\",\"type\":[\"string\",\"null\"]},{\"name\":\"_index\",\"type\":[\"string\",\"null\"]},{\"name\":\"synced_at\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]}]}', 
  'classification'='avro', 
  'compressionType'='none', 
  'objectCount'='42313', 
  'partition_filtering.enabled'='true', 
  'recordCount'='722328740', 
  'sizeKey'='159416670061', 
  'typeOfData'='file')

calloc avatar Apr 26 '24 11:04 calloc

cc @jklamer

findinpath avatar May 03 '24 09:05 findinpath

@calloc any chance you have the stack trace for the Avro type resolution error exception? That should give us an idea what the issue may be.

jklamer avatar May 13 '24 16:05 jklamer