Add support for hoodie.metadata.enable in Hudi connector
Trino version: 428
I used Apache XTable to read Delta Lake source data and generate Apache Hudi target data in S3. I then registered the generated Apache Hudi data in HMS. Both steps completed without errors. See https://github.com/apache/incubator-xtable/issues/460 for details.
atwong@Coolidgelabs demo-s3 % cat trino/catalog/hudi.properties
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Use the Hudi connector for this catalog.
connector.name=hudi
# Thrift endpoint of the Hive metastore (HMS) used by this catalog.
hive.metastore.uri=thrift://hive-metastore:9083
# S3 credentials and endpoint. The endpoint host is "minio", so this
# presumably targets a local MinIO instance rather than AWS S3 - confirm.
hive.s3.aws-access-key=admin
hive.s3.aws-secret-key=password
hive.s3.endpoint=http://minio:9000
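The advice from the XTable developers is to set hoodie.metadata.enable=true in hudi.properties, but Trino does not recognize this property and fails at startup with "Configuration property 'hoodie.metadata.enable' was not used":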
2024-06-05 20:48:11 2024-06-06T03:48:11.571Z ERROR main io.trino.server.Server Configuration errors:
2024-06-05 20:48:11
2024-06-05 20:48:11 1) Error: Configuration property 'hoodie.metadata.enable' was not used
2024-06-05 20:48:11
2024-06-05 20:48:11 1 error
2024-06-05 20:48:11 io.airlift.bootstrap.ApplicationConfigurationException: Configuration errors:
2024-06-05 20:48:11
2024-06-05 20:48:11 1) Error: Configuration property 'hoodie.metadata.enable' was not used
2024-06-05 20:48:11
2024-06-05 20:48:11 1 error
2024-06-05 20:48:11 at io.airlift.bootstrap.Bootstrap.initialize(Bootstrap.java:232)
2024-06-05 20:48:11 at io.trino.plugin.hudi.InternalHudiConnectorFactory.createConnector(InternalHudiConnectorFactory.java:95)
2024-06-05 20:48:11 at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
2024-06-05 20:48:11 at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
2024-06-05 20:48:11 at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
2024-06-05 20:48:11 at java.base/java.lang.reflect.Method.invoke(Method.java:568)
2024-06-05 20:48:11 at io.trino.plugin.hudi.HudiConnectorFactory.create(HudiConnectorFactory.java:48)
2024-06-05 20:48:11 at io.trino.connector.DefaultCatalogFactory.createConnector(DefaultCatalogFactory.java:224)
2024-06-05 20:48:11 at io.trino.connector.DefaultCatalogFactory.createCatalog(DefaultCatalogFactory.java:133)
2024-06-05 20:48:11 at io.trino.connector.LazyCatalogFactory.createCatalog(LazyCatalogFactory.java:45)
2024-06-05 20:48:11 at io.trino.connector.StaticCatalogManager.lambda$loadInitialCatalogs$1(StaticCatalogManager.java:157)
2024-06-05 20:48:11 at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
2024-06-05 20:48:11 at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539)
2024-06-05 20:48:11 at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
2024-06-05 20:48:11 at com.google.common.util.concurrent.DirectExecutor.execute(DirectExecutor.java:31)
2024-06-05 20:48:11 at java.base/java.util.concurrent.ExecutorCompletionService.submit(ExecutorCompletionService.java:184)
2024-06-05 20:48:11 at io.trino.util.Executors.executeUntilFailure(Executors.java:41)
2024-06-05 20:48:11 at io.trino.connector.StaticCatalogManager.loadInitialCatalogs(StaticCatalogManager.java:151)
2024-06-05 20:48:11 at io.trino.server.Server.doStart(Server.java:144)
2024-06-05 20:48:11 at io.trino.server.Server.lambda$start$0(Server.java:91)
2024-06-05 20:48:11 at io.trino.$gen.Trino_428____20240606_034757_1.run(Unknown Source)
2024-06-05 20:48:11 at io.trino.server.Server.start(Server.java:91)
2024-06-05 20:48:11 at io.trino.server.TrinoServer.main(TrinoServer.java:38)
Just to try it out, I also tried hudi.metadata-enabled=true, which fails because the property is defunct:
2024-06-05 21:04:23 2024-06-06T04:04:23.069Z ERROR main io.trino.server.Server Configuration errors:
2024-06-05 21:04:23
2024-06-05 21:04:23 1) Error: Defunct property 'hudi.metadata-enabled' (class [class HudiConfig]) cannot be configured.
2024-06-05 21:04:23
2024-06-05 21:04:23 2) Error: Configuration property 'hudi.metadata-enabled' was not used
2024-06-05 21:04:23
2024-06-05 21:04:23 2 errors
2024-06-05 21:04:23
2024-06-05 21:04:23 ======================
2024-06-05 21:04:23 Full classname legend:
2024-06-05 21:04:23 ======================
2024-06-05 21:04:23 HudiConfig: "io.trino.plugin.hudi.HudiConfig"
2024-06-05 21:04:23 ========================
2024-06-05 21:04:23 End of classname legend:
2024-06-05 21:04:23 ========================
2024-06-05 21:04:23
2024-06-05 21:04:23 io.airlift.bootstrap.ApplicationConfigurationException: Configuration errors:
2024-06-05 21:04:23
2024-06-05 21:04:23 1) Error: Defunct property 'hudi.metadata-enabled' (class [class HudiConfig]) cannot be configured.
2024-06-05 21:04:23
2024-06-05 21:04:23 2) Error: Configuration property 'hudi.metadata-enabled' was not used
2024-06-05 21:04:23
2024-06-05 21:04:23 2 errors
2024-06-05 21:04:23
2024-06-05 21:04:23 ======================
2024-06-05 21:04:23 Full classname legend:
2024-06-05 21:04:23 ======================
2024-06-05 21:04:23 HudiConfig: "io.trino.plugin.hudi.HudiConfig"
2024-06-05 21:04:23 ========================
2024-06-05 21:04:23 End of classname legend:
2024-06-05 21:04:23 ========================
2024-06-05 21:04:23
2024-06-05 21:04:23 at io.airlift.bootstrap.Bootstrap.initialize(Bootstrap.java:232)
2024-06-05 21:04:23 at io.trino.plugin.hudi.InternalHudiConnectorFactory.createConnector(InternalHudiConnectorFactory.java:95)
2024-06-05 21:04:23 at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
2024-06-05 21:04:23 at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
2024-06-05 21:04:23 at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
2024-06-05 21:04:23 at java.base/java.lang.reflect.Method.invoke(Method.java:568)
2024-06-05 21:04:23 at io.trino.plugin.hudi.HudiConnectorFactory.create(HudiConnectorFactory.java:48)
2024-06-05 21:04:23 at io.trino.connector.DefaultCatalogFactory.createConnector(DefaultCatalogFactory.java:224)
2024-06-05 21:04:23 at io.trino.connector.DefaultCatalogFactory.createCatalog(DefaultCatalogFactory.java:133)
2024-06-05 21:04:23 at io.trino.connector.LazyCatalogFactory.createCatalog(LazyCatalogFactory.java:45)
2024-06-05 21:04:23 at io.trino.connector.StaticCatalogManager.lambda$loadInitialCatalogs$1(StaticCatalogManager.java:157)
2024-06-05 21:04:23 at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
2024-06-05 21:04:23 at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539)
2024-06-05 21:04:23 at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
2024-06-05 21:04:23 at com.google.common.util.concurrent.DirectExecutor.execute(DirectExecutor.java:31)
2024-06-05 21:04:23 at java.base/java.util.concurrent.ExecutorCompletionService.submit(ExecutorCompletionService.java:184)
2024-06-05 21:04:23 at io.trino.util.Executors.executeUntilFailure(Executors.java:41)
2024-06-05 21:04:23 at io.trino.connector.StaticCatalogManager.loadInitialCatalogs(StaticCatalogManager.java:151)
2024-06-05 21:04:23 at io.trino.server.Server.doStart(Server.java:144)
2024-06-05 21:04:23 at io.trino.server.Server.lambda$start$0(Server.java:91)
2024-06-05 21:04:23 at io.trino.$gen.Trino_428____20240606_040414_1.run(Unknown Source)
2024-06-05 21:04:23 at io.trino.server.Server.start(Server.java:91)
2024-06-05 21:04:23 at io.trino.server.TrinoServer.main(TrinoServer.java:38)
The XTable developers asked for an additional setting, "hoodie.metadata.enable=true", to be set in the Trino Hudi connector. See https://github.com/apache/incubator-xtable/issues/460#issuecomment-2151343351 for details.
@krvikash and @Praveen2112. I'm new to Trino and I see that you previously committed to HudiConfig. What is the process to add this additional setting? Is it something that I can do?
Setting hoodie.metadata.enable=true won't help in Hudi connector. We already removed Hudi library dependency in https://github.com/trinodb/trino/pull/17392.
@ebyhr what do you suggest then since xtable says that setting is needed to read hudi files?
@ebyhr I'd be happy to test. I'm documenting my testing at https://github.com/apache/incubator-xtable/discussions/461
Per conversation with @codope, he suggested to use Trino 418.
Related: https://github.com/trinodb/trino/pull/18840#issuecomment-1772069499
Here is what worked for me: https://github.com/soumilshah1995/HudiDeltaStreamer-SCD-Trino
# Docker Compose stack: Trino coordinator, Hive metastore (backed by Postgres),
# and MinIO as the S3-compatible object store.
version: "3"

services:
  # Trino coordinator; its configuration is mounted from ./trino/etc.
  trino-coordinator:
    image: 'trinodb/trino:400'
    hostname: trino-coordinator
    ports:
      - '8080:8080'
    volumes:
      - ./trino/etc:/etc/trino

  # Postgres database that the Hive metastore connects to over JDBC.
  metastore_db:
    image: postgres:11
    hostname: metastore_db
    ports:
      - '5432:5432'
    environment:
      POSTGRES_USER: hive
      POSTGRES_PASSWORD: hive
      POSTGRES_DB: metastore

  # Hive metastore exposing Thrift on 9083, pointed at MinIO for storage.
  hive-metastore:
    hostname: hive-metastore
    image: 'starburstdata/hive:3.1.2-e.18'
    ports:
      - '9083:9083' # Metastore Thrift
    environment:
      HIVE_METASTORE_DRIVER: org.postgresql.Driver
      HIVE_METASTORE_JDBC_URL: jdbc:postgresql://metastore_db:5432/metastore
      HIVE_METASTORE_USER: hive
      HIVE_METASTORE_PASSWORD: hive
      HIVE_METASTORE_WAREHOUSE_DIR: s3://datalake/
      S3_ENDPOINT: http://minio:9000
      S3_ACCESS_KEY: admin
      S3_SECRET_KEY: password
      S3_PATH_STYLE_ACCESS: "true"
      # The remaining cloud settings are passed as empty strings.
      REGION: ""
      GOOGLE_CLOUD_KEY_FILE_PATH: ""
      AZURE_ADL_CLIENT_ID: ""
      AZURE_ADL_CREDENTIAL: ""
      AZURE_ADL_REFRESH_URL: ""
      AZURE_ABFS_STORAGE_ACCOUNT: ""
      AZURE_ABFS_ACCESS_KEY: ""
      AZURE_WASB_STORAGE_ACCOUNT: ""
      AZURE_ABFS_OAUTH: ""
      AZURE_ABFS_OAUTH_TOKEN_PROVIDER: ""
      AZURE_ABFS_OAUTH_CLIENT_ID: ""
      AZURE_ABFS_OAUTH_SECRET: ""
      AZURE_ABFS_OAUTH_ENDPOINT: ""
      AZURE_WASB_ACCESS_KEY: ""
      HIVE_METASTORE_USERS_IN_ADMIN_ROLE: "admin"
    depends_on:
      - metastore_db
    healthcheck:
      # Passes once a TCP connection to local port 9083 can be opened.
      test: bash -c "exec 6<> /dev/tcp/localhost/9083"

  # MinIO object store: API on 9000, console on 9001.
  minio:
    image: minio/minio
    environment:
      - MINIO_ROOT_USER=admin
      - MINIO_ROOT_PASSWORD=password
      - MINIO_DOMAIN=minio
    networks:
      default:
        aliases:
          - warehouse.minio
    ports:
      - '9001:9001'
      - '9000:9000'
    command: ["server", "/data", "--console-address", ":9001"]

  # MinIO client: waits for MinIO, recreates the public "warehouse" bucket,
  # then idles so the container stays up.
  mc:
    depends_on:
      - minio
    image: minio/mc
    environment:
      - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
      - AWS_REGION=us-east-1
    entrypoint: >
      /bin/sh -c "
      until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
      /usr/bin/mc rm -r --force minio/warehouse;
      /usr/bin/mc mb minio/warehouse;
      /usr/bin/mc policy set public minio/warehouse;
      tail -f /dev/null
      "

# Named volume declared with default settings; no service in this file
# mounts it.
volumes:
  hive-metastore-postgresql:

networks:
  default:
    name: hudi
@soumilshah1995 In your example, you're using Trino 400, which is older than Trino 418. That's why yours works. If you try Trino 419 or newer, I believe you'll hit the same issue I did.
Yes, that's why I was using an older version of Trino.
cc: @codope
Comment from @yihua
Trino 418 depends on Hudi 0.12.3, while an Iceberg table with Hudi metadata requires Hudi 0.14.0+, so Trino 418 does not work in your use case.