From 4d09cca592bc68cd098272fca073341ba70d17f2 Mon Sep 17 00:00:00 2001
From: Ben Cassell
Date: Wed, 9 Oct 2024 11:59:10 -0700
Subject: [PATCH 1/6] add option to scope path

---
 dbt/adapters/databricks/impl.py               | 20 +++++++++++++++++++
 .../databricks/macros/adapters/python.sql     |  8 +++-----
 .../databricks/macros/relations/location.sql  |  3 ++-
 tests/conftest.py                             |  2 +-
 .../adapter/python_model/fixtures.py          |  3 ++-
 .../adapter/python_model/test_python_model.py |  9 ---------
 6 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py
index 6efddb072..54b33f1c4 100644
--- a/dbt/adapters/databricks/impl.py
+++ b/dbt/adapters/databricks/impl.py
@@ -112,6 +112,7 @@ class DatabricksConfig(AdapterConfig):
     file_format: str = "delta"
     table_format: TableFormat = TableFormat.DEFAULT
     location_root: Optional[str] = None
+    include_full_name_in_path: bool = False
     partition_by: Optional[Union[List[str], str]] = None
     clustered_by: Optional[Union[List[str], str]] = None
     liquid_clustered_by: Optional[Union[List[str], str]] = None
@@ -206,6 +207,25 @@ def update_tblproperties_for_iceberg(
         result["delta.universalFormat.enabledFormats"] = "iceberg"
         return result
 
+    @available.parse(lambda *a, **k: 0)
+    def compute_external_path(
+        self, config: BaseConfig, model: BaseConfig, is_incremental: bool = False
+    ) -> str:
+        location_root = config.get("location_root")
+        database = model.get("database", "hive_metastore")
+        schema = model.get("schema", "default")
+        identifier = model.get("alias")
+        if location_root is None:
+            raise DbtConfigError("location_root is required for external tables.")
+        include_full_name_in_path = config.get("include_full_name_in_path", False)
+        if include_full_name_in_path:
+            path = os.path.join(location_root, database, schema, identifier)
+        else:
+            path = os.path.join(location_root, identifier)
+        if is_incremental:
+            path = path + "_tmp"
+        return path
+
     # override/overload
     def acquire_connection(
         self, name: Optional[str] = None, query_header_context: Any = None
diff --git a/dbt/include/databricks/macros/adapters/python.sql b/dbt/include/databricks/macros/adapters/python.sql
index af068ffb5..4716a6e94 100644
--- a/dbt/include/databricks/macros/adapters/python.sql
+++ b/dbt/include/databricks/macros/adapters/python.sql
@@ -53,6 +53,7 @@ writer.saveAsTable("{{ target_relation }}")
 
 {%- macro py_get_writer_options() -%}
 {%- set location_root = config.get('location_root', validator=validation.any[basestring]) -%}
+{%- set include_catalog_schema = config.get('include_full_name_in_path', False) -%}
 {%- set file_format = config.get('file_format', validator=validation.any[basestring])|default('delta', true) -%}
 {%- set partition_by = config.get('partition_by', validator=validation.any[list, basestring]) -%}
 {%- set liquid_clustered_by = config.get('liquid_clustered_by', validator=validation.any[list, basestring]) -%}
@@ -60,11 +61,8 @@ writer.saveAsTable("{{ target_relation }}")
 {%- set buckets = config.get('buckets', validator=validation.any[int]) -%}
 .format("{{ file_format }}")
 {%- if location_root is not none %}
-{%- set identifier = model['alias'] %}
-{%- if is_incremental() %}
-{%- set identifier = identifier + '__dbt_tmp' %}
-{%- endif %}
-.option("path", "{{ location_root }}/{{ identifier }}")
+{%- set model_path = adapter.compute_external_path(config, model, is_incremental()) %}
+.option("path", "{{ model_path }}")
 {%- endif -%}
 {%- if partition_by is not none -%}
 {%- if partition_by is string -%}
diff --git a/dbt/include/databricks/macros/relations/location.sql b/dbt/include/databricks/macros/relations/location.sql
index f18a94472..b4079a3ff 100644
--- a/dbt/include/databricks/macros/relations/location.sql
+++ b/dbt/include/databricks/macros/relations/location.sql
@@ -3,7 +3,8 @@
 {%- set file_format = config.get('file_format', default='delta') -%}
 {%- set identifier = model['alias'] -%}
 {%- if location_root is not none %}
-  location '{{ location_root }}/{{ identifier }}'
+  {%- set model_path = adapter.compute_external_path(config, model, is_incremental()) %}
+  location '{{ model_path }}'
 {%- elif (not relation.is_hive_metastore()) and file_format != 'delta' -%}
   {{ exceptions.raise_compiler_error(
     'Incompatible configuration: `location_root` must be set when using a non-delta file format with Unity Catalog'
diff --git a/tests/conftest.py b/tests/conftest.py
index b8a0e0775..a6b572116 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -7,7 +7,7 @@
 
 
 def pytest_addoption(parser):
-    parser.addoption("--profile", action="store", default="databricks_uc_sql_endpoint", type=str)
+    parser.addoption("--profile", action="store", default="databricks_uc_cluster", type=str)
 
 
 # Using @pytest.mark.skip_profile('databricks_cluster') uses the 'skip_by_adapter_type'
diff --git a/tests/functional/adapter/python_model/fixtures.py b/tests/functional/adapter/python_model/fixtures.py
index 9e048d285..9f1421221 100644
--- a/tests/functional/adapter/python_model/fixtures.py
+++ b/tests/functional/adapter/python_model/fixtures.py
@@ -102,7 +102,8 @@ def model(dbt, spark):
       marterialized: table
       tags: ["python"]
      create_notebook: true
-      location_root: "{root}/{schema}"
+      include_full_name_in_path: true
+      location_root: "{{ env_var('DBT_DATABRICKS_LOCATION_ROOT') }}"
     columns:
       - name: date
        tests:
diff --git a/tests/functional/adapter/python_model/test_python_model.py b/tests/functional/adapter/python_model/test_python_model.py
index e20f11346..87f8a4a96 100644
--- a/tests/functional/adapter/python_model/test_python_model.py
+++ b/tests/functional/adapter/python_model/test_python_model.py
@@ -119,15 +119,6 @@ def project_config_update(self):
         }
 
     def test_expected_handling_of_complex_config(self, project):
-        unformatted_schema_yml = util.read_file("models", "schema.yml")
-        util.write_file(
-            unformatted_schema_yml.replace(
-                "root", os.environ["DBT_DATABRICKS_LOCATION_ROOT"]
-            ).replace("{schema}", project.test_schema),
-            "models",
-            "schema.yml",
-        )
-
         util.run_dbt(["seed"])
         util.run_dbt(["build", "-s", "complex_config"])
         util.run_dbt(["build", "-s", "complex_config"])

From 04ecadfa761829bbe557559556d647e12b24c4ef Mon Sep 17 00:00:00 2001
From: Ben Cassell
Date: Thu, 10 Oct 2024 10:28:03 -0700
Subject: [PATCH 2/6] try more in tests

---
 tests/conftest.py                                      | 2 +-
 tests/functional/adapter/basic/test_incremental.py     | 2 ++
 .../adapter/incremental/test_incremental_strategies.py | 3 +++
 tests/functional/adapter/persist_docs/fixtures.py      | 2 ++
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index a6b572116..b8a0e0775 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -7,7 +7,7 @@
 
 
 def pytest_addoption(parser):
-    parser.addoption("--profile", action="store", default="databricks_uc_cluster", type=str)
+    parser.addoption("--profile", action="store", default="databricks_uc_sql_endpoint", type=str)
 
 
 # Using @pytest.mark.skip_profile('databricks_cluster') uses the 'skip_by_adapter_type'
diff --git a/tests/functional/adapter/basic/test_incremental.py b/tests/functional/adapter/basic/test_incremental.py
index 8d630a0a9..3958b3304 100644
--- a/tests/functional/adapter/basic/test_incremental.py
+++ b/tests/functional/adapter/basic/test_incremental.py
@@ -34,6 +34,7 @@ def project_config_update(self):
             "models": {
                 "+file_format": "parquet",
                 "+location_root": f"{location_root}/parquet",
+                "+include_full_name_in_path": "true",
                 "+incremental_strategy": "append",
             },
         }
@@ -61,6 +62,7 @@ def project_config_update(self):
             "models": {
                 "+file_format": "csv",
                 "+location_root": f"{location_root}/csv",
+                "+include_full_name_in_path": "true",
                 "+incremental_strategy": "append",
             },
         }
diff --git a/tests/functional/adapter/incremental/test_incremental_strategies.py b/tests/functional/adapter/incremental/test_incremental_strategies.py
index 6effcb0ed..88db60eed 100644
--- a/tests/functional/adapter/incremental/test_incremental_strategies.py
+++ b/tests/functional/adapter/incremental/test_incremental_strategies.py
@@ -52,6 +52,7 @@ def project_config_update(self):
             "models": {
                 "+file_format": "parquet",
                 "+location_root": f"{location_root}/parquet_append",
+                "+include_full_name_in_path": "true",
                 "+incremental_strategy": "append",
             },
         }
@@ -129,6 +130,7 @@ def project_config_update(self):
             "models": {
                 "+file_format": "parquet",
                 "+location_root": f"{location_root}/parquet_insert_overwrite",
+                "+include_full_name_in_path": "true",
                 "+incremental_strategy": "insert_overwrite",
             },
         }
@@ -144,6 +146,7 @@ def project_config_update(self):
             "models": {
                 "+file_format": "parquet",
                 "+location_root": f"{location_root}/parquet_insert_overwrite_partitions",
+                "+include_full_name_in_path": "true",
                 "+incremental_strategy": "insert_overwrite",
                 "+partition_by": "id",
             },
diff --git a/tests/functional/adapter/persist_docs/fixtures.py b/tests/functional/adapter/persist_docs/fixtures.py
index 938578dc0..915188536 100644
--- a/tests/functional/adapter/persist_docs/fixtures.py
+++ b/tests/functional/adapter/persist_docs/fixtures.py
@@ -5,6 +5,7 @@
     description: 'A seed description'
     config:
       location_root: '{{ env_var("DBT_DATABRICKS_LOCATION_ROOT") }}'
+      include_full_name_in_path: true
       persist_docs:
         relation: True
         columns: True
@@ -22,6 +23,7 @@
     description: 'A seed description'
     config:
       location_root: '/mnt/dbt_databricks/seeds'
+      include_full_name_in_path: true
       persist_docs:
         relation: True
         columns: True

From 90d5fd5ba81216894e7594e3b48d0fc6ecc2641c Mon Sep 17 00:00:00 2001
From: Ben Cassell
Date: Fri, 11 Oct 2024 08:29:54 -0700
Subject: [PATCH 3/6] remove unneeded line

---
 dbt/include/databricks/macros/adapters/python.sql | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dbt/include/databricks/macros/adapters/python.sql b/dbt/include/databricks/macros/adapters/python.sql
index 4716a6e94..96da3ef9a 100644
--- a/dbt/include/databricks/macros/adapters/python.sql
+++ b/dbt/include/databricks/macros/adapters/python.sql
@@ -53,7 +53,6 @@ writer.saveAsTable("{{ target_relation }}")
 
 {%- macro py_get_writer_options() -%}
 {%- set location_root = config.get('location_root', validator=validation.any[basestring]) -%}
-{%- set include_catalog_schema = config.get('include_full_name_in_path', False) -%}
 {%- set file_format = config.get('file_format', validator=validation.any[basestring])|default('delta', true) -%}
 {%- set partition_by = config.get('partition_by', validator=validation.any[list, basestring]) -%}
 {%- set liquid_clustered_by = config.get('liquid_clustered_by', validator=validation.any[list, basestring]) -%}

From b2393d2f0e5ff40550365ef6acc23b54ebbcdff0 Mon Sep 17 00:00:00 2001
From: Ben Cassell
Date: Fri, 11 Oct 2024 09:23:08 -0700
Subject: [PATCH 4/6] fix unit tests

---
 tests/unit/macros/adapters/test_python_macros.py | 11 +----------
 tests/unit/macros/relations/test_table_macros.py |  4 ++++
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/tests/unit/macros/adapters/test_python_macros.py b/tests/unit/macros/adapters/test_python_macros.py
index 0a8c655c9..c1f425b5d 100644
--- a/tests/unit/macros/adapters/test_python_macros.py
+++ b/tests/unit/macros/adapters/test_python_macros.py
@@ -33,21 +33,12 @@ def test_py_get_writer__specified_file_format(self, config, template):
 
     def test_py_get_writer__specified_location_root(self, config, template, context):
         config["location_root"] = "s3://fake_location"
+        template.globals["adapter"].compute_external_path.return_value = "s3://fake_location/schema"
         result = self.run_macro_raw(template, "py_get_writer_options")
 
         expected = '.format("delta")\n.option("path", "s3://fake_location/schema")'
         assert result == expected
 
-    def test_py_get_writer__specified_location_root_on_incremental(
-        self, config, template: Template, context
-    ):
-        config["location_root"] = "s3://fake_location"
-        context["is_incremental"].return_value = True
-        result = self.run_macro_raw(template, "py_get_writer_options")
-
-        expected = '.format("delta")\n.option("path", "s3://fake_location/schema__dbt_tmp")'
-        assert result == expected
-
     def test_py_get_writer__partition_by_single_column(self, config, template):
         config["partition_by"] = "name"
         result = self.run_macro_raw(template, "py_get_writer_options")
diff --git a/tests/unit/macros/relations/test_table_macros.py b/tests/unit/macros/relations/test_table_macros.py
index 0445a8b0e..61e49d418 100644
--- a/tests/unit/macros/relations/test_table_macros.py
+++ b/tests/unit/macros/relations/test_table_macros.py
@@ -28,6 +28,10 @@ def context(self, template) -> dict:
         return template.globals
 
     def render_create_table_as(self, template_bundle, temporary=False, sql="select 1"):
+        external_path = f"/mnt/root/{template_bundle.relation.identifier}"
+        template_bundle.template.globals["adapter"].compute_external_path.return_value = (
+            external_path
+        )
         return self.run_macro(
             template_bundle.template,
             "databricks__create_table_as",

From 0fdd3c665d55203951c9929909931f3bb324e065 Mon Sep 17 00:00:00 2001
From: Ben Cassell
Date: Fri, 11 Oct 2024 09:46:51 -0700
Subject: [PATCH 5/6] fix linting

---
 tests/unit/macros/adapters/test_python_macros.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/unit/macros/adapters/test_python_macros.py b/tests/unit/macros/adapters/test_python_macros.py
index c1f425b5d..590dd1c58 100644
--- a/tests/unit/macros/adapters/test_python_macros.py
+++ b/tests/unit/macros/adapters/test_python_macros.py
@@ -1,5 +1,4 @@
 import pytest
-from jinja2 import Template
 from mock import MagicMock
 
 from tests.unit.macros.base import MacroTestBase

From 887bf9690fe1d194056c8a5d853ea7e6971ea57c Mon Sep 17 00:00:00 2001
From: Ben Cassell
Date: Fri, 11 Oct 2024 13:48:37 -0700
Subject: [PATCH 6/6] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6bc8786f1..339a8e56b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,7 @@
 - Allow for the use of custom constraints, using the `custom` constraint type with an `expression` as the constraint (thanks @roydobbe). ([792](https://github.com/databricks/dbt-databricks/pull/792))
 - Add "use_info_schema_for_columns" behavior flag to turn on use of information_schema to get column info where possible. This may have more latency but will not truncate complex data types the way that 'describe' can. ([808](https://github.com/databricks/dbt-databricks/pull/808))
 - Add support for table_format: iceberg. This uses UniForm under the hood to provide iceberg compatibility for tables or incrementals. ([815](https://github.com/databricks/dbt-databricks/pull/815))
+- Add `include_full_name_in_path` config boolean for external locations. This writes tables to {location_root}/{catalog}/{schema}/{table} ([823](https://github.com/databricks/dbt-databricks/pull/823))
 
 ### Under the Hood
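
For review context, below is a minimal standalone sketch of the path logic this series adds. It mirrors the body of compute_external_path from patch 1/6; the free function form, example root, and example catalog/schema/table names are illustrative only and not part of the diff.

import os


def compute_external_path(location_root, database, schema, identifier,
                          include_full_name_in_path=False, is_incremental=False):
    # New behavior: when include_full_name_in_path is enabled, scope the
    # external location by catalog (database) and schema under location_root.
    if include_full_name_in_path:
        path = os.path.join(location_root, database, schema, identifier)
    else:
        # Legacy behavior: every table lands directly under location_root,
        # so two models with the same alias in different schemas collide.
        path = os.path.join(location_root, identifier)
    # Incremental materializations write to a temporary "_tmp" path first.
    if is_incremental:
        path = path + "_tmp"
    return path


# Legacy layout:          /mnt/root/my_table
print(compute_external_path("/mnt/root", "main", "analytics", "my_table"))
# With the flag enabled:  /mnt/root/main/analytics/my_table
print(compute_external_path("/mnt/root", "main", "analytics", "my_table",
                            include_full_name_in_path=True))

As the functional test updates in patches 1/6 and 2/6 show, the flag is set alongside location_root, either project-wide ("+include_full_name_in_path": "true" in project_config_update) or per model (include_full_name_in_path: true in the model's config block).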