Skip to content

Commit

Permalink
Enable Unity Catalog (#326)
Browse files Browse the repository at this point in the history
  • Loading branch information
tanya-borisova authored Sep 19, 2023
1 parent ace752b commit 59a9fdd
Show file tree
Hide file tree
Showing 28 changed files with 1,050 additions and 10 deletions.
18 changes: 18 additions & 0 deletions config.sample.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,23 @@ transform: # Optional
- Bronze
- Silver
- Gold

unity_catalog: # Optional. If you wish to disable Unity Catalog, remove this section
catalog_name: catalog
catalog_name_prefix: catalog # Exactly one of catalog_name or catalog_name_prefix needs to be set
schema_name: schema
schema_name_prefix: schema
datalake_zones: # From datalake section above, the zones that Unity Catalog will have access to
- Gold

unity_catalog_metastore: # Needs to be present if unity_catalog section is present
metastore_name: metastore-westeurope # Setting this will result in a new metastore being deployed
storage_account_name: stgexampleuc # Globally unique name for the Metastore storage account
resource_group_name: examplemetastore # Name for the resource group for Metastore resources
metastore_id: a12abc12-abcd-abcd-abcd-abcd1234abcd # Exactly one of metastore_name or metastore_id needs to be set

databricks_account_id: a12abc12-abcd-abcd-abcd-abcd1234abcd # Databricks Account ID unique per tenant. Required to deploy Unity Catalog

spark_config: # Optional
spark.configuration.key: value
databricks_secrets:
Expand All @@ -52,6 +69,7 @@ transform: # Optional
category: "Memory Optimised"
autotermination_minutes: 120
runtime_engine: STANDARD # Optional: STANDARD or PHOTON (https://learn.microsoft.com/en-us/azure/databricks/runtime/photon)
data_security_mode: SINGLE_USER # Optional: see https://registry.terraform.io/providers/databricks/databricks/latest/docs/resources/cluster#data_security_mode
num_of_workers: 0 # Set to 0 for single node mode or any number for fixed cluster (ignored if autoscale also defined)
autoscale:
min_workers: 1
Expand Down
26 changes: 20 additions & 6 deletions infrastructure/transform/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions infrastructure/transform/data.tf
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,8 @@ data "azurerm_log_analytics_workspace" "core" {
name = "log-${lower(var.naming_suffix)}"
resource_group_name = var.core_rg_name
}

# Look up the Entra ID (Azure AD) service principal backing the Data Factory's
# system-assigned managed identity, so its application_id can be registered as a
# Databricks service principal (see databricks_service_principal.adf_managed_identity_sp).
data "azuread_service_principal" "adf_identity_sp" {
  object_id = azurerm_data_factory.adf.identity[0].principal_id # ADF MI object ID
}
7 changes: 7 additions & 0 deletions infrastructure/transform/databricks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ resource "databricks_cluster" "cluster" {
autotermination_minutes = var.transform.databricks_cluster.autotermination_minutes
num_workers = !local.autoscale_cluster ? var.transform.databricks_cluster.num_of_workers : null
runtime_engine = var.transform.databricks_cluster.runtime_engine
data_security_mode = var.transform.databricks_cluster.data_security_mode
single_user_name = var.transform.databricks_cluster.data_security_mode == "SINGLE_USER" ? databricks_service_principal.adf_managed_identity_sp.application_id : null

dynamic "autoscale" {
for_each = local.autoscale_cluster ? [1] : []
Expand Down Expand Up @@ -204,3 +206,8 @@ resource "databricks_secret_scope" "secrets" {
name = "flowehr-secrets"
depends_on = [time_sleep.wait_for_databricks_network]
}

# Register the ADF managed identity as a service principal inside the Databricks
# workspace. Used as the single_user_name on SINGLE_USER-mode clusters so ADF can
# run jobs on them (see databricks_cluster.cluster).
resource "databricks_service_principal" "adf_managed_identity_sp" {
  application_id = data.azuread_service_principal.adf_identity_sp.application_id
  display_name   = "ADF Service Principal for ${var.naming_suffix}"
}
4 changes: 4 additions & 0 deletions infrastructure/transform/datalake/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ output "adls_name" {
value = azurerm_storage_account.adls.name
}

# Resource ID of the ADLS storage account; consumed by the parent module to wire
# the datalake into Unity Catalog external storage (see main.tf).
output "adls_id" {
  value = azurerm_storage_account.adls.id
}

# Application (client) ID of the AAD app Databricks uses to access ADLS.
output "databricks_adls_app_id" {
  value = azuread_application.databricks_adls.application_id
}
Expand Down
10 changes: 8 additions & 2 deletions infrastructure/transform/locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,14 @@ locals {
adb_linked_service_name = "ADBLinkedServiceViaMSI"
dbfs_storage_account_name = "dbfs${var.naming_suffix_truncated}"
datalake_enabled = try(var.transform.datalake, null) != null
autoscale_cluster = var.transform.databricks_cluster.autoscale != null
single_node = !local.autoscale_cluster && var.transform.databricks_cluster.num_of_workers == 0
unity_catalog_enabled = try(var.transform.unity_catalog, null) != null
create_unity_catalog_metastore = (
local.unity_catalog_enabled
&& try(var.transform.unity_catalog_metastore.metastore_name, null) != null
)

autoscale_cluster = var.transform.databricks_cluster.autoscale != null
single_node = !local.autoscale_cluster && var.transform.databricks_cluster.num_of_workers == 0

# IPs required for Databricks UDRs
# Built from https://learn.microsoft.com/en-us/azure/databricks/resources/supported-regions#--control-plane-nat-webapp-and-extended-infrastructure-ip-addresses-and-domains
Expand Down
79 changes: 79 additions & 0 deletions infrastructure/transform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,83 @@ module "datalake" {
databricks_adls_app_name = local.databricks_adls_app_name
databricks_secret_scope_id = databricks_secret_scope.secrets.id
tags = var.tags

providers = {
databricks = databricks
databricks.accounts = databricks.accounts
}
}

# Deploy a new Unity Catalog metastore (plus its storage account and access
# connector, per the inputs below) — only when the config supplies a
# metastore_name rather than pointing at an existing metastore_id
# (see locals.create_unity_catalog_metastore).
module "unity_catalog_metastore" {
  count  = local.create_unity_catalog_metastore ? 1 : 0
  source = "./unity-catalog-metastore"

  core_rg_name  = var.core_rg_name
  naming_suffix = var.naming_suffix

  # Metastore resources live in their own resource group (shared across workspaces)
  resource_group_name = var.transform.unity_catalog_metastore.resource_group_name
  location            = var.core_rg_location
  tags                = var.tags

  metastore_name                  = var.transform.unity_catalog_metastore.metastore_name
  storage_account_name            = var.transform.unity_catalog_metastore.storage_account_name
  metastore_access_connector_name = "metastore-access-connector" # must match the name used in module.unity_catalog
  private_dns_zones               = var.private_dns_zones
  tf_in_automation                = var.tf_in_automation
  deployer_ip                     = var.deployer_ip

  catalog_admin_group_name          = var.transform.unity_catalog.catalog_admin_group_name
  external_storage_admin_group_name = var.transform.unity_catalog.external_storage_admin_group_name

  # Account-level Databricks operations need the aliased "accounts" provider
  providers = {
    databricks          = databricks
    databricks.accounts = databricks.accounts
  }
}

# Attach this workspace to a Unity Catalog metastore and create the catalog,
# schema and external storage grants. Deployed whenever the unity_catalog
# config section is present (see locals.unity_catalog_enabled).
module "unity_catalog" {
  count  = local.unity_catalog_enabled ? 1 : 0
  source = "./unity-catalog"

  core_rg_name  = var.core_rg_name
  naming_suffix = var.naming_suffix

  # Use the metastore created above, or the pre-existing one from config
  metastore_id = (
    local.create_unity_catalog_metastore
    ? module.unity_catalog_metastore[0].metastore_id
    : var.transform.unity_catalog_metastore.metastore_id
  )
  # NOTE(review): these unity_catalog_metastore fields are read unconditionally,
  # so the unity_catalog_metastore config section must be present whenever
  # unity_catalog is (as config.sample.yaml states) — verify config validation enforces this.
  metastore_rg_name               = var.transform.unity_catalog_metastore.resource_group_name
  metastore_access_connector_name = "metastore-access-connector" # must match the name passed to module.unity_catalog_metastore
  metastore_storage_account_name  = var.transform.unity_catalog_metastore.storage_account_name
  metastore_created               = local.create_unity_catalog_metastore

  catalog_name             = var.transform.unity_catalog.catalog_name
  catalog_name_prefix      = var.transform.unity_catalog.catalog_name_prefix
  catalog_admin_group_name = var.transform.unity_catalog.catalog_admin_group_name
  catalog_admin_privileges = var.transform.unity_catalog.catalog_admin_privileges

  schema_name        = var.transform.unity_catalog.schema_name
  schema_name_prefix = var.transform.unity_catalog.schema_name_prefix

  external_storage_admin_group_name = var.transform.unity_catalog.external_storage_admin_group_name
  external_storage_admin_privileges = var.transform.unity_catalog.external_storage_admin_privileges

  databricks_workspace_name  = azurerm_databricks_workspace.databricks.name
  adf_managed_identity_sp_id = databricks_service_principal.adf_managed_identity_sp.id

  # Grant Unity Catalog access to the configured datalake zones (containers).
  # NOTE(review): module.datalake[0] assumes the datalake module is deployed
  # whenever unity_catalog is enabled — confirm this is guarded (e.g. by config
  # validation or local.datalake_enabled), otherwise this index will fail.
  external_storage_accounts = [{
    storage_account_id   = module.datalake[0].adls_id
    storage_account_name = module.datalake[0].adls_name
    container_names      = var.transform.unity_catalog.datalake_zones
  }]

  private_dns_zones = var.private_dns_zones

  # Account-level Databricks operations need the aliased "accounts" provider
  providers = {
    databricks          = databricks
    databricks.accounts = databricks.accounts
  }

  depends_on = [azurerm_databricks_workspace.databricks, module.unity_catalog_metastore]
}
10 changes: 9 additions & 1 deletion infrastructure/transform/pipeline.tf
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,15 @@ resource "azurerm_data_factory_pipeline" "pipeline" {
name = each.value.name
data_factory_id = azurerm_data_factory.adf.id
activities_json = jsonencode(each.value.properties.activities)
parameters = { for param_name, param in each.value.properties.parameters : param_name => param.defaultValue }

parameters = merge(
tomap({
for param_name, param in each.value.properties.parameters : param_name => param.defaultValue
}),
local.unity_catalog_enabled ? tomap({
"catalog_name" = module.unity_catalog[0].catalog_name,
"schema_name" = module.unity_catalog[0].schema_name
}) : tomap({}))

depends_on = [
azurerm_data_factory_linked_service_azure_databricks.msi_linked
Expand Down
52 changes: 51 additions & 1 deletion infrastructure/transform/terragrunt.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ include "shared" {
}

locals {
  # Shared provider version pins, interpolated into the generate blocks below
  providers = read_terragrunt_config("${get_repo_root()}/providers.hcl")
  # Merged FlowEHR configuration; read below to obtain the Databricks account ID
  # for the account-level ("accounts") provider
  configuration = read_terragrunt_config("${get_repo_root()}/configuration.hcl")
}

terraform {
Expand Down Expand Up @@ -90,6 +91,48 @@ terraform {
EOF
}

# Write a terraform.tf into the unity-catalog submodule declaring its required
# providers and an empty aliased "accounts" databricks provider block (a proxy
# configuration the parent satisfies via the module's `providers` map).
# NOTE(review): near-duplicate of generate "unity_catalog_metastore_terraform"
# below — keep the two in sync when changing provider requirements.
generate "unity_catalog_terraform" {
  path      = "unity-catalog/terraform.tf"
  if_exists = "overwrite_terragrunt" # regenerate on every run
  contents  = <<EOF
terraform {
  required_version = "${local.providers.locals.terraform_version}"
  required_providers {
    ${local.providers.locals.required_provider_azure}
    ${local.providers.locals.required_provider_azapi}
    ${local.providers.locals.required_provider_databricks}
  }
}

provider "databricks" {
  alias = "accounts"
}
EOF
}

# Write a terraform.tf into the unity-catalog-metastore submodule declaring its
# required providers and an empty aliased "accounts" databricks provider block
# (a proxy configuration the parent satisfies via the module's `providers` map).
generate "unity_catalog_metastore_terraform" {
  path      = "unity-catalog-metastore/terraform.tf"
  if_exists = "overwrite_terragrunt" # regenerate on every run
  contents  = <<EOF
terraform {
  required_version = "${local.providers.locals.terraform_version}"
  required_providers {
    ${local.providers.locals.required_provider_azure}
    ${local.providers.locals.required_provider_azapi}
    ${local.providers.locals.required_provider_databricks}
  }
}

provider "databricks" {
  alias = "accounts"
}
EOF
}

generate "provider" {
path = "provider.tf"
if_exists = "overwrite_terragrunt"
Expand All @@ -100,6 +143,13 @@ provider "databricks" {
azure_workspace_resource_id = azurerm_databricks_workspace.databricks.id
host = azurerm_databricks_workspace.databricks.workspace_url
}
provider "databricks" {
host = "https://accounts.azuredatabricks.net"
alias = "accounts"
account_id = "${try(local.configuration.locals.merged_root_config.transform.databricks_account_id, "")}"
}
EOF
}

Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 59a9fdd

Please sign in to comment.