terraform-aws-glue
terraform-aws-glue copied to clipboard
Glue Crawler: add support for Hudi and Iceberg targets
Describe the Feature
The AWS Glue crawler resource (`aws_glue_crawler`) now additionally supports the following target types:
- hudi: https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/glue_crawler#hudi_target
- iceberg: https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/glue_crawler#iceberg_target
Expected Behavior
Adding the `hudi_target` argument shown below to the module call should apply successfully:
# Example call of the glue-crawler submodule that crawls a single Hudi data source.
module "glue_crawler" {
  source  = "cloudposse/glue/aws//modules/glue-crawler"
  version = "0.X.X"

  # Context object passed through from `module.this`.
  context = module.this.context

  # Crawler identity, destination catalog database, IAM role and schedule.
  crawler_description = "Glue crawler that processes data in ${local.data_source} and writes the metadata into a Glue Catalog database"
  database_name       = module.glue_catalog_database.name
  role                = local.role_arn
  schedule            = var.crawler_schedule

  # Crawler configuration is supplied as a JSON-encoded string.
  configuration = jsonencode(var.crawler_configuration)

  schema_change_policy = {
    update_behavior = "UPDATE_IN_DATABASE"
    delete_behavior = "DELETE_FROM_DATABASE"
  }

  # The new argument requested by this issue: a list of Hudi targets.
  hudi_target = [
    {
      paths                   = [local.data_source]
      connection_name         = null
      exclusions              = var.crawler_hudi_target_exclusions
      maximum_traversal_depth = var.crawler_hudi_target_maximum_traversal_depth
    }
  ]
}
Use Case
Allow the Glue crawler module to crawl Apache Hudi and Apache Iceberg tables.
Describe Ideal Solution
Apply the git diff below:
$ git diff
diff --git a/modules/glue-crawler/main.tf b/modules/glue-crawler/main.tf
index 18271b2..f67b2f4 100644
--- a/modules/glue-crawler/main.tf
+++ b/modules/glue-crawler/main.tf
@@ -45,6 +45,28 @@ resource "aws_glue_crawler" "this" {
}
}
+ dynamic "hudi_target" { # one hudi_target block per element of var.hudi_target
+   for_each = var.hudi_target != null ? var.hudi_target : [] # a null variable yields no blocks
+
+   content {
+     connection_name         = try(hudi_target.value.connection_name, null) # optional: an omitted key becomes null
+     paths                   = hudi_target.value.paths # required: direct access fails fast if missing
+     exclusions              = try(hudi_target.value.exclusions, null) # optional
+     maximum_traversal_depth = try(hudi_target.value.maximum_traversal_depth, null) # optional
+   }
+ }
+
+ dynamic "iceberg_target" { # one iceberg_target block per element of var.iceberg_target
+   for_each = var.iceberg_target != null ? var.iceberg_target : [] # a null variable yields no blocks
+
+   content {
+     connection_name         = try(iceberg_target.value.connection_name, null) # optional: an omitted key becomes null
+     paths                   = iceberg_target.value.paths # required: direct access fails fast if missing
+     exclusions              = try(iceberg_target.value.exclusions, null) # optional
+     maximum_traversal_depth = try(iceberg_target.value.maximum_traversal_depth, null) # optional
+   }
+ }
+
dynamic "jdbc_target" {
for_each = var.jdbc_target != null ? var.jdbc_target : []
diff --git a/modules/glue-crawler/variables.tf b/modules/glue-crawler/variables.tf
index 655792f..39c62b8 100644
--- a/modules/glue-crawler/variables.tf
+++ b/modules/glue-crawler/variables.tf
@@ -51,6 +51,34 @@ variable "jdbc_target" {
default = null
}
+variable "hudi_target" {
+ # type = list(object({
+ # connection_name = string
+ # paths = list(string)
+ # exclusions = list(string)
+ # maximum_traversal_depth = number
+ # }))
+
+ # Using `type = list(any)` since some of the fields are optional and we don't want to force the caller to specify all of them and set to `null` those not used
+ type = list(any)
+ description = "List of nested HUDI target arguments."
+ default = null
+}
+
+variable "iceberg_target" {
+ # type = list(object({
+ # connection_name = string
+ # paths = list(string)
+ # exclusions = list(string)
+ # maximum_traversal_depth = number
+ # }))
+
+ # Using `type = list(any)` since some of the fields are optional and we don't want to force the caller to specify all of them and set to `null` those not used
+ type = list(any)
+ description = "List of nested Iceberg target arguments."
+ default = null
+}
+
variable "dynamodb_target" {
# type = list(object({
# path = string
Alternatives Considered
No response
Additional Context
No response