diff --git a/.github/workflows/check-plugin.yml b/.github/workflows/check-plugin.yml index 045c36c13..00bc8a083 100644 --- a/.github/workflows/check-plugin.yml +++ b/.github/workflows/check-plugin.yml @@ -93,6 +93,34 @@ jobs: - name: Run linter run: make plugin=${{ inputs.plugin }} lint + RTD-build: + if: inputs.plugin == 'kedro-datasets' + defaults: + run: + shell: bash + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Set up Python 3.8 + uses: actions/setup-python@v3 + with: + python-version: "3.8" + - name: Cache python packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-ubuntu-latest-python-"3.8" + restore-keys: ${{inputs.plugin}} + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + pip install ".[docs]" + pip install ".[test]" + - name: RTD build for kedro-datasets + run: | + make rtd + e2e-tests: if: inputs.plugin != 'kedro-datasets' defaults: diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml index ef0c87ef9..fecb91db2 100644 --- a/.github/workflows/kedro-airflow.yml +++ b/.github/workflows/kedro-airflow.yml @@ -15,6 +15,9 @@ on: - "kedro-datasets/**" - "kedro-docker/**" - "kedro-telemetry/**" + schedule: + # Run every day at midnight (UTC time) + - cron: '0 0 * * *' jobs: airflow-test: diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml index 943453ee7..be4315cd9 100644 --- a/.github/workflows/kedro-datasets.yml +++ b/.github/workflows/kedro-datasets.yml @@ -15,6 +15,9 @@ on: - "kedro-airflow/**" - "kedro-docker/**" - "kedro-telemetry/**" + schedule: + # Run every day at midnight (UTC time) + - cron: '0 0 * * *' jobs: datasets-test: diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml index 71a77cb24..430058513 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -15,6 +15,9 @@ on: - "kedro-airflow/**" - "kedro-datasets/**" - "kedro-telemetry/**" + schedule: + # Run every day at midnight (UTC time) + - cron: '0 0 * * *' jobs: docker-test: diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml index f53841bde..88fb8ae7a 100644 --- a/.github/workflows/kedro-telemetry.yml +++ b/.github/workflows/kedro-telemetry.yml @@ -15,6 +15,9 @@ on: - "kedro-airflow/**" - "kedro-datasets/**" - "kedro-docker/**" + schedule: + # Run every day at midnight (UTC time) + - cron: '0 0 * * *' jobs: telemetry-test: diff --git a/Makefile b/Makefile index 074095ba7..1c6c7e478 100644 --- a/Makefile +++ b/Makefile @@ -60,3 +60,6 @@ test-no-spark-sequential: # kedro-datasets/snowflake tests skipped from default scope test-snowflake-only: cd kedro-datasets && pytest tests --no-cov --numprocesses 1 --dist loadfile -m snowflake + +rtd: + cd kedro-datasets && python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck diff --git a/kedro-airflow/README.md b/kedro-airflow/README.md index 6b1d59815..b61ed141d 100644 --- a/kedro-airflow/README.md +++ b/kedro-airflow/README.md @@ -46,7 +46,7 @@ Please visit the guide to [deploy Kedro as a Python package](https://kedro.readt #### What if my DAG file is in a different directory to my project folder? -By default the generated DAG file is configured to live in the same directory as your project as per this [template](https://github.com/kedro-org/kedro-plugins/blob/main/kedro-airflow/kedro_airflow/airflow_dag_template.j2#L44). 
If your DAG file is located in a different directory to your project, you will need to tweak this manually after running the `kedro airflow create` command. +By default, the generated DAG file is configured to live in the same directory as your project as per this [template](https://github.com/kedro-org/kedro-plugins/blob/main/kedro-airflow/kedro_airflow/airflow_dag_template.j2#L44). If your DAG file is located in a different directory to your project, you will need to tweak this manually after running the `kedro airflow create` command. #### What if I want to use a different Jinja2 template? @@ -56,6 +56,92 @@ You can use the additional command line argument `--jinja-file` (alias `-j`) to kedro airflow create --jinja-file=./custom/template.j2 ``` +#### How can I pass arguments to the Airflow DAGs dynamically? + +`kedro-airflow` picks up configuration from `airflow.yml` in `conf/base` or `conf/local` of your Kedro project, or from any configuration folder whose name starts with `airflow`. +The configuration is read by Kedro in the same way as [parameters](https://docs.kedro.org/en/stable/configuration/parameters.html). +Arguments can be specified globally or per pipeline: + +```yaml +# Global parameters +default: + start_date: [2023, 1, 1] + max_active_runs: 3 + # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs + schedule_interval: "@once" + catchup: false + # Default settings applied to all tasks + owner: "airflow" + depends_on_past: false + email_on_failure: false + email_on_retry: false + retries: 1 + retry_delay: 5 + +# Arguments specific to the pipeline (overrides the parameters above) +data_science: + owner: "airflow-ds" +``` + +Arguments can also be passed via `--params` in the command line: + +```bash +kedro airflow create --params "schedule_interval='@weekly'" +``` + +These variables are passed to the Jinja2 template that creates an Airflow DAG from your pipeline. + +#### What if I want to use a configuration pattern other than `airflow*` and `airflow/**`? + +In order to configure the config loader, update the `settings.py` file in your Kedro project. +For instance, if you would like to use the name `scheduler`, then change the file as follows: + +```python +CONFIG_LOADER_ARGS = { + "config_patterns": {"airflow": ["scheduler*", "scheduler/**"]} +} +``` + +Follow [Kedro's official documentation](https://docs.kedro.org/en/stable/configuration/advanced_configuration.html#how-to-do-templating-with-the-omegaconfigloader) to see how to add templating, custom resolvers, etc. + +#### What if I want to pass different arguments? + +In order to pass arguments other than those specified in the default template, simply pass a custom template (see: _"What if I want to use a different Jinja2 template?"_). + +The syntax for arguments is: +``` +{{ argument_name }} +``` + +In order to make arguments optional, one can use: +``` +{{ argument_name | default("default_value") }} +``` + +For examples, please have a look at the default template (`airflow_dag_template.j2`). + +#### What if I want to use a configuration file other than `airflow.yml`? + +The default configuration pattern is `["airflow*", "airflow/**"]`.
+In order to configure the `OmegaConfigLoader`, update the `settings.py` file in your Kedro project as follows: + +```python +from kedro.config import OmegaConfigLoader +CONFIG_LOADER_CLASS = OmegaConfigLoader +CONFIG_LOADER_ARGS = { + # other args + "config_patterns": {"airflow": ["airflow*", "airflow/**"]} # configure the pattern for configuration files +} +``` + +Follow [Kedro's official documentation](https://docs.kedro.org/en/stable/configuration/advanced_configuration.html#how-to-do-templating-with-the-omegaconfigloader) to see how to add templating, custom resolvers, etc. + +#### How can I use Airflow runtime parameters? + +It is possible to pass parameters when triggering an Airflow DAG from the user interface. +In order to use this feature, create a custom template using the [Params syntax](https://airflow.apache.org/docs/apache-airflow/stable/core-concepts/params.html). +See ["What if I want to use a different Jinja2 template?"](#what-if-i-want-to-use-a-different-jinja2-template) for instructions on using custom templates. + #### What if I want to use a different Airflow Operator? Which Airflow Operator to use depends on the environment your project is running in. diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index c2e0615b4..0d6fabd0c 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,6 +1,17 @@ -# Upcoming release 0.5.2 +# Upcoming Release + +# Release 0.6.0 * Change reference to `kedro.pipeline.Pipeline` object throughout test suite with `kedro.modular_pipeline.pipeline` factory. * Migrate all project metadata to static `pyproject.toml`. +* Added support for configuring DAG kwargs via `airflow.yml`. +* The generated DAG file now contains the pipeline name. +* Added help text for CLI arguments (see `kedro airflow create --help`). +* Added additional CLI argument `--params` to pass configuration to the Jinja2 template. + +## Community contributions +Many thanks to the following Kedroids for contributing PRs to this release: + +* [sbrugman](https://github.com/sbrugman) # Release 0.5.1 * Added additional CLI argument `--jinja-file` to provide a path to a custom Jinja2 template.
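Taken together, the new `create` options cover pipeline selection, configuration environment, custom templates and ad-hoc template variables. A sketch of typical invocations, assuming a packaged project whose registry defines a `ds` pipeline (names and values are illustrative):

```bash
# Default pipeline; conf/base/airflow.yml (or conf/<env>/airflow.yml) is picked up if present
kedro airflow create

# Named pipeline and environment; the DAG file is written as <package_name>_ds_dag.py
kedro airflow create --pipeline ds --env local

# One-off override of a template variable, without editing airflow.yml
kedro airflow create --params "owner=data-team"
```

Precedence mirrors `_load_config` in `plugin.py` below: the `default` block is applied first, then the pipeline-specific block, then anything passed via `--params` on top.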
diff --git a/kedro-airflow/features/steps/cli_steps.py b/kedro-airflow/features/steps/cli_steps.py index 9fbde1df2..c4d23ad24 100644 --- a/kedro-airflow/features/steps/cli_steps.py +++ b/kedro-airflow/features/steps/cli_steps.py @@ -87,7 +87,7 @@ def install_kedro(context, version): if version == "latest": cmd = [context.pip, "install", "-U", "kedro[pandas]"] else: - cmd = [context.pip, "install", "kedro[pandas]=={}".format(version)] + cmd = [context.pip, "install", f"kedro[pandas]=={version}"] res = run(cmd, env=context.env) if res.returncode != OK_EXIT_CODE: @@ -121,7 +121,7 @@ def check_message_printed(context, msg): stdout = context.result.stdout assert msg in stdout, ( "Expected the following message segment to be printed on stdout: " - "{exp_msg},\nbut got {actual_msg}".format(exp_msg=msg, actual_msg=stdout) + f"{msg},\nbut got {stdout}" ) @@ -187,6 +187,6 @@ def check_status_code(context): if context.result.returncode != OK_EXIT_CODE: print(context.result.stdout) print(context.result.stderr) - assert False, "Expected exit code {}" " but got {}".format( - OK_EXIT_CODE, context.result.returncode + raise AssertionError( + f"Expected exit code {OK_EXIT_CODE} but got {context.result.returncode}" ) diff --git a/kedro-airflow/kedro_airflow/__init__.py b/kedro-airflow/kedro_airflow/__init__.py index 44d4aab54..7274ceb2e 100644 --- a/kedro-airflow/kedro_airflow/__init__.py +++ b/kedro-airflow/kedro_airflow/__init__.py @@ -1,3 +1,3 @@ """Kedro plugin for running a project with Airflow.""" -__version__ = "0.5.1" +__version__ = "0.6.0" diff --git a/kedro-airflow/kedro_airflow/airflow_dag_template.j2 b/kedro-airflow/kedro_airflow/airflow_dag_template.j2 index 92c6296e1..7c2f2706e 100644 --- a/kedro-airflow/kedro_airflow/airflow_dag_template.j2 +++ b/kedro-airflow/kedro_airflow/airflow_dag_template.j2 @@ -1,3 +1,4 @@ +from __future__ import annotations from datetime import datetime, timedelta from pathlib import Path @@ -10,14 +11,13 @@ from kedro.framework.project import configure_project class KedroOperator(BaseOperator): - @apply_defaults def __init__( self, package_name: str, pipeline_name: str, node_name: str, - project_path: str, + project_path: str | Path, env: str, *args, **kwargs ) -> None: @@ -35,46 +35,43 @@ class KedroOperator(BaseOperator): env=self.env) as session: session.run(self.pipeline_name, node_names=[self.node_name]) + # Kedro settings required to run your pipeline env = "{{ env }}" pipeline_name = "{{ pipeline_name }}" project_path = Path.cwd() package_name = "{{ package_name }}" -# Default settings applied to all tasks -default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 1, - 'retry_delay': timedelta(minutes=5) -} - # Using a DAG context manager, you don't have to specify the dag property of each task with DAG( - "{{ dag_name | safe | slugify }}", - start_date=datetime(2019, 1, 1), - max_active_runs=3, - schedule_interval=timedelta(minutes=30), # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs - default_args=default_args, - catchup=False # enable if you don't want historical dag runs to run -) as dag: - - tasks = {} - {% for node in pipeline.nodes %} - tasks["{{ node.name | safe | slugify }}"] = KedroOperator( - task_id="{{ node.name | safe | slugify }}", - package_name=package_name, - pipeline_name=pipeline_name, - node_name="{{ node.name | safe }}", - project_path=project_path, - env=env, + dag_id="{{ dag_name | safe | slugify }}", + start_date=datetime({{ start_date | 
default([2023, 1, 1]) | join(",")}}), + max_active_runs={{ max_active_runs | default(3) }}, + # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs + schedule_interval="{{ schedule_interval | default('@once') }}", + catchup={{ catchup | default(False) }}, + # Default settings applied to all tasks + default_args=dict( + owner="{{ owner | default('airflow') }}", + depends_on_past={{ depends_on_past | default(False) }}, + email_on_failure={{ email_on_failure | default(False) }}, + email_on_retry={{ email_on_retry | default(False) }}, + retries={{ retries | default(1) }}, + retry_delay=timedelta(minutes={{ retry_delay | default(5) }}) ) - {% endfor %} +) as dag: + tasks = { + {% for node in pipeline.nodes %} "{{ node.name | safe | slugify }}": KedroOperator( + task_id="{{ node.name | safe | slugify }}", + package_name=package_name, + pipeline_name=pipeline_name, + node_name="{{ node.name | safe }}", + project_path=project_path, + env=env, + ), +{% endfor %} } {% for parent_node, child_nodes in dependencies.items() -%} - {% for child in child_nodes %} - tasks["{{ parent_node.name | safe | slugify }}"] >> tasks["{{ child.name | safe | slugify }}"] + {% for child in child_nodes %} tasks["{{ parent_node.name | safe | slugify }}"] >> tasks["{{ child.name | safe | slugify }}"] {% endfor %} {%- endfor %} diff --git a/kedro-airflow/kedro_airflow/plugin.py b/kedro-airflow/kedro_airflow/plugin.py index c1a62b0f3..569e91be2 100644 --- a/kedro-airflow/kedro_airflow/plugin.py +++ b/kedro-airflow/kedro_airflow/plugin.py @@ -1,15 +1,25 @@ """ Kedro plugin for running a project with Airflow """ +from __future__ import annotations from collections import defaultdict from pathlib import Path +from typing import Any import click import jinja2 from click import secho +from kedro.config import MissingConfigException +from kedro.framework.cli.project import PARAMS_ARG_HELP +from kedro.framework.cli.utils import ENV_HELP, KedroCliError, _split_params +from kedro.framework.context import KedroContext from kedro.framework.project import pipelines -from kedro.framework.startup import ProjectMetadata +from kedro.framework.session import KedroSession +from kedro.framework.startup import ProjectMetadata, bootstrap_project from slugify import slugify +PIPELINE_ARG_HELP = """Name of the registered pipeline to convert. 
+If not set, the '__default__' pipeline is used.""" + @click.group(name="Kedro-Airflow") def commands(): # pylint: disable=missing-function-docstring @@ -22,15 +32,44 @@ def airflow_commands(): pass +def _load_config(context: KedroContext, pipeline_name: str) -> dict[str, Any]: + # Set the default pattern for `airflow` if not provided in `settings.py` + if "airflow" not in context.config_loader.config_patterns.keys(): + context.config_loader.config_patterns.update( # pragma: no cover + {"airflow": ["airflow*", "airflow/**"]} + ) + + assert "airflow" in context.config_loader.config_patterns.keys() + + # Load the config + try: + config_airflow = context.config_loader["airflow"] + except MissingConfigException: + # File does not exist + return {} + + dag_config = {} + # Load the default config if specified + if "default" in config_airflow: + dag_config.update(config_airflow["default"]) + # Update with pipeline-specific config if present + if pipeline_name in config_airflow: + dag_config.update(config_airflow[pipeline_name]) + return dag_config + + @airflow_commands.command() -@click.option("-p", "--pipeline", "pipeline_name", default="__default__") -@click.option("-e", "--env", default="local") +@click.option( + "-p", "--pipeline", "pipeline_name", default="__default__", help=PIPELINE_ARG_HELP +) +@click.option("-e", "--env", default="local", help=ENV_HELP) @click.option( "-t", "--target-dir", "target_path", type=click.Path(writable=True, resolve_path=True, file_okay=False), default="./airflow_dags/", + help="The directory path to store the generated Airflow dags", ) @click.option( "-j", @@ -39,6 +78,14 @@ def airflow_commands(): exists=True, readable=True, resolve_path=True, file_okay=True, dir_okay=False ), default=Path(__file__).parent / "airflow_dag_template.j2", + help="The template file for the generated Airflow dags", +) +@click.option( + "--params", + type=click.UNPROCESSED, + default="", + help=PARAMS_ARG_HELP, + callback=_split_params, ) @click.pass_obj def create( @@ -47,8 +94,18 @@ def create( env, target_path, jinja_file, + params, ): # pylint: disable=too-many-locals,too-many-arguments """Create an Airflow DAG for a project""" + project_path = Path.cwd().resolve() + bootstrap_project(project_path) + with KedroSession.create(project_path=project_path, env=env) as session: + context = session.load_context() + dag_config = _load_config(context, pipeline_name) + + # Update with params if provided + dag_config.update(params) + jinja_file = Path(jinja_file).resolve() loader = jinja2.FileSystemLoader(jinja_file.parent) jinja_env = jinja2.Environment(autoescape=True, loader=loader, lstrip_blocks=True) @@ -56,7 +113,11 @@ def create( template = jinja_env.get_template(jinja_file.name) package_name = metadata.package_name - dag_filename = f"{package_name}_dag.py" + dag_filename = ( + f"{package_name}_dag.py" + if pipeline_name == "__default__" + else f"{package_name}_{pipeline_name}_dag.py" + ) target_path = Path(target_path) target_path = target_path / dag_filename @@ -64,6 +125,8 @@ def create( target_path.parent.mkdir(parents=True, exist_ok=True) pipeline = pipelines.get(pipeline_name) + if pipeline is None: + raise KedroCliError(f"Pipeline {pipeline_name} not found.") dependencies = defaultdict(list) for node, parent_nodes in pipeline.node_dependencies.items(): @@ -77,6 +140,7 @@ def create( pipeline_name=pipeline_name, package_name=package_name, pipeline=pipeline, + **dag_config, ).dump(str(target_path)) secho("") @@ -84,7 +148,8 @@ def create( secho(str(target_path)) secho("This file 
should be copied to your Airflow DAG folder.", fg="yellow") secho( - "The Airflow configuration can be customized by editing this file.", fg="green" + "The Airflow configuration can be customized by editing this file.", + fg="green", ) secho("") secho( @@ -101,4 +166,3 @@ def create( "And all local paths in both the data catalog and log config must be absolute paths.", fg="yellow", ) - secho("") diff --git a/kedro-airflow/tests/conftest.py b/kedro-airflow/tests/conftest.py index c23cc5916..ea285bb2c 100644 --- a/kedro-airflow/tests/conftest.py +++ b/kedro-airflow/tests/conftest.py @@ -4,16 +4,21 @@ discover them automatically. More info here: https://docs.pytest.org/en/latest/fixture.html """ +from __future__ import annotations + +import os from pathlib import Path from shutil import copyfile from click.testing import CliRunner +from cookiecutter.main import cookiecutter from kedro import __version__ as kedro_version +from kedro.framework.cli.starters import TEMPLATE_PATH from kedro.framework.startup import ProjectMetadata from pytest import fixture -@fixture(name="cli_runner") +@fixture(name="cli_runner", scope="session") def cli_runner(): runner = CliRunner() cwd = Path.cwd() @@ -23,10 +28,79 @@ def cli_runner(): yield runner -@fixture -def metadata(cli_runner): # pylint: disable=unused-argument +def _create_kedro_settings_py(file_name: Path, patterns: list[str]): + patterns = ", ".join([f'"{p}"' for p in patterns]) + content = f"""from kedro.config import OmegaConfigLoader +CONFIG_LOADER_CLASS = OmegaConfigLoader +CONFIG_LOADER_ARGS = {{ + "config_patterns": {{ + "airflow": [{patterns}], # configure the pattern for configuration files + }} +}} +""" + file_name.write_text(content) + + +@fixture(scope="session") +def kedro_project(cli_runner): # pylint: disable=unused-argument + tmp_path = Path().cwd() + # From `kedro-mlflow.tests.conftest.py` + config = { + "output_dir": tmp_path, + "kedro_version": kedro_version, + "project_name": "This is a fake project", + "repo_name": "fake-project", + "python_package": "fake_project", + "include_example": True, + } + + cookiecutter( + str(TEMPLATE_PATH), + output_dir=config["output_dir"], + no_input=True, + extra_context=config, + ) + + pipeline_registry_py = """ +from kedro.pipeline import Pipeline, node + + +def identity(arg): + return arg + + +def register_pipelines(): + pipeline = Pipeline( + [ + node(identity, ["input"], ["intermediate"], name="node0"), + node(identity, ["intermediate"], ["output"], name="node1"), + ], + tags="pipeline0", + ) + return { + "__default__": pipeline, + "ds": pipeline, + } + """ + + project_path = tmp_path / "fake-project" + (project_path / "src" / "fake_project" / "pipeline_registry.py").write_text( + pipeline_registry_py + ) + + settings_file = project_path / "src" / "fake_project" / "settings.py" + _create_kedro_settings_py( + settings_file, ["airflow*", "airflow/**", "scheduler*", "scheduler/**"] + ) + + os.chdir(project_path) + return project_path + + +@fixture(scope="session") +def metadata(kedro_project): # pylint: disable=unused-argument # cwd() depends on ^ the isolated filesystem, created by CliRunner() - project_path = Path.cwd() + project_path = kedro_project return ProjectMetadata( project_path / "pyproject.toml", "hello_world", diff --git a/kedro-airflow/tests/test_plugin.py b/kedro-airflow/tests/test_plugin.py index 77c051ff5..4b67ff840 100644 --- a/kedro-airflow/tests/test_plugin.py +++ b/kedro-airflow/tests/test_plugin.py @@ -1,50 +1,228 @@ +from __future__ import annotations + from pathlib import 
Path +from typing import Any import pytest -from kedro.framework.project import pipelines -from kedro.pipeline import node -from kedro.pipeline.modular_pipeline import pipeline as modular_pipeline +import yaml from kedro_airflow.plugin import commands -def identity(arg): - return arg - - @pytest.mark.parametrize( "dag_name,pipeline_name,command", [ # Test normal execution - ("hello_world_dag", "__default__", ["airflow", "create"]), + ("hello_world", "__default__", ["airflow", "create"]), # Test execution with alternate pipeline name - ("hello_world_dag", "ds", ["airflow", "create", "--pipeline", "ds"]), - # Test execution with different dir and filename for Jinja2 Template - ( - "hello_world_dag", - "__default__", - ["airflow", "create", "-j", "airflow_dag.j2"], - ), + ("hello_world", "ds", ["airflow", "create", "--pipeline", "ds"]), ], ) -def test_create_airflow_dag( - dag_name, pipeline_name, command, mocker, cli_runner, metadata -): +def test_create_airflow_dag(dag_name, pipeline_name, command, cli_runner, metadata): """Check the generation and validity of a simple Airflow DAG.""" - dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}.py" - mock_pipeline = modular_pipeline( - [ - node(identity, ["input"], ["intermediate"], name="node0"), - node(identity, ["intermediate"], ["output"], name="node1"), - ], - tags="pipeline0", + dag_file = ( + Path.cwd() + / "airflow_dags" + / ( + f"{dag_name}_dag.py" + if pipeline_name == "__default__" + else f"{dag_name}_{pipeline_name}_dag.py" + ) + ) + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + + expected_airflow_dag = 'tasks["node0"] >> tasks["node1"]' + with dag_file.open(encoding="utf-8") as f: + dag_code = [line.strip() for line in f.read().splitlines()] + assert expected_airflow_dag in dag_code + dag_file.unlink() + + +def _create_kedro_airflow_yml(file_name: Path, content: dict[str, Any]): + file_name.parent.mkdir(parents=True, exist_ok=True) + with file_name.open("w") as fp: + yaml.dump(content, fp) + + +def test_airflow_config_params( + cli_runner, metadata +): # pylint: disable=too-many-statements + """Check if config variables are picked up""" + dag_name = "hello_world" + template_name = "airflow_params.j2" + content = "{{ owner | default('hello')}}" + + _create_kedro_airflow_jinja_template(Path.cwd(), template_name, content) + + # default + default_content = "hello" + command = ["airflow", "create", "-j", template_name] + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == default_content + dag_file.unlink() + + # "--params" + expected_content = "testme" + command = ["airflow", "create", "--params", "owner=testme", "-j", template_name] + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == expected_content + dag_file.unlink() + + # airflow.yml + expected_content = "someone else" + file_name = Path.cwd() / "conf" / "base" / "airflow.yml" + _create_kedro_airflow_yml(file_name, {"default": {"owner": expected_content}}) + command = ["airflow", "create", "-j", template_name] + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" + 
result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == expected_content + file_name.unlink() + dag_file.unlink() + + # ../airflow.yml + expected_content = "yet someone else" + file_name = Path.cwd() / "conf" / "base" / "airflow" / "default.yml" + _create_kedro_airflow_yml(file_name, {"default": {"owner": expected_content}}) + command = ["airflow", "create", "-j", template_name] + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == expected_content + file_name.unlink() + + # random.yml + expected_content = "yet someone else again" + file_name = Path.cwd() / "conf" / "base" / "random.yml" + _create_kedro_airflow_yml(file_name, {"default": {"owner": expected_content}}) + command = ["airflow", "create", "-j", template_name] + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == default_content + dag_file.unlink() + + # scheduler.yml + file_name = Path.cwd() / "conf" / "base" / "scheduler.yml" + _create_kedro_airflow_yml(file_name, {"default": {"owner": expected_content}}) + command = ["airflow", "create", "-j", template_name] + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" + result = cli_runner.invoke(commands, command, obj=metadata) + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == expected_content + dag_file.unlink() + file_name.unlink() + + # env + expected_content = "again someone else" + file_name = Path.cwd() / "conf" / "local" / "airflow.yml" + _create_kedro_airflow_yml(file_name, {"default": {"owner": expected_content}}) + command = ["airflow", "create", "-j", template_name, "-e", "local"] + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == expected_content + dag_file.unlink() + + # custom pipeline name + expected_content = "finally someone else" + file_name = Path.cwd() / "conf" / "base" / "airflow.yml" + _create_kedro_airflow_yml( + file_name, {"default": {"owner": "foobar"}, "ds": {"owner": expected_content}} ) - mocker.patch.dict(pipelines, {pipeline_name: mock_pipeline}) + command = ["airflow", "create", "-j", template_name, "-p", "ds"] + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_ds_dag.py" + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == expected_content + dag_file.unlink() + + +def _create_kedro_airflow_jinja_template(path: Path, name: str, content: str): + (path / name).write_text(content) + + +def test_custom_template_exists(cli_runner, metadata): + """Test execution with different dir and filename for Jinja2 Template""" + dag_name = "hello_world" + template_name = "custom_template.j2" + command = ["airflow", "create", "-j", template_name] + content = "print('my custom dag')" + # because there are no jinja variables 
+ expected_content = content + + _create_kedro_airflow_jinja_template(Path.cwd(), template_name, content) + + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" result = cli_runner.invoke(commands, command, obj=metadata) - assert result.exit_code == 0 + assert result.exit_code == 0, (result.exit_code, result.stdout) + assert dag_file.exists() + assert dag_file.read_text() == expected_content + + +def test_custom_template_nonexistent(cli_runner, metadata): + """Test execution with different dir and filename for Jinja2 Template""" + template_name = "non_existent_custom_template.j2" + command = ["airflow", "create", "-j", template_name] + result = cli_runner.invoke(commands, command, obj=metadata) + assert result.exit_code == 2 + assert ( + f"Error: Invalid value for '-j' / '--jinja-file': File '{template_name}' does not exist." + in result.stdout + ) + + +def _kedro_create_env(project_root: Path): + (project_root / "conf" / "remote").mkdir(parents=True) + + +def test_create_airflow_dag_env_parameter_exists(cli_runner, metadata): + """Test the `env` parameter""" + dag_name = "hello_world" + command = ["airflow", "create", "--env", "remote"] + + _kedro_create_env(Path.cwd()) + + dag_file = Path.cwd() / "airflow_dags" / f"{dag_name}_dag.py" + result = cli_runner.invoke(commands, command, obj=metadata) + + assert result.exit_code == 0, (result.exit_code, result.stdout) assert dag_file.exists() expected_airflow_dag = 'tasks["node0"] >> tasks["node1"]' - with open(dag_file, "r", encoding="utf-8") as f: + with dag_file.open(encoding="utf-8") as f: dag_code = [line.strip() for line in f.read().splitlines()] assert expected_airflow_dag in dag_code + + +def test_create_airflow_dag_nonexistent_pipeline(cli_runner, metadata): + """Test executing with a non-existing pipeline""" + command = ["airflow", "create", "--pipeline", "de"] + result = cli_runner.invoke(commands, command, obj=metadata) + assert result.exit_code == 1 + assert ( + "kedro.framework.cli.utils.KedroCliError: Pipeline de not found." + in result.stdout + ) diff --git a/kedro-datasets/.readthedocs.yaml b/kedro-datasets/.readthedocs.yaml new file mode 100644 index 000000000..ca40fa54c --- /dev/null +++ b/kedro-datasets/.readthedocs.yaml @@ -0,0 +1,30 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.8" + jobs: + pre_build: + - pip freeze + - python -m sphinx -WETan -j auto -D language=en -b linkcheck -d kedro-datasets/_build/doctrees kedro-datasets/docs/source kedro-datasets/_build/linkcheck + +# Build documentation in the docs/ directory with Sphinx +sphinx: + builder: html + configuration: kedro-datasets/docs/source/conf.py + fail_on_warning: true + +# Optionally set the version of Python and requirements required to build your docs +python: + install: + - method: pip + path: kedro-datasets + extra_requirements: + - docs + - test diff --git a/kedro-datasets/README.md b/kedro-datasets/README.md index 2a4e271d5..daf3ffe4b 100644 --- a/kedro-datasets/README.md +++ b/kedro-datasets/README.md @@ -27,7 +27,7 @@ These data connectors are supported with the APIs of `pandas`, `spark`, `network [The Data Catalog](https://kedro.readthedocs.io/en/stable/data/data_catalog.html) allows you to work with a range of file formats on local file systems, network file systems, cloud object stores, and Hadoop. 
-Here is a full list of [supported data connectors and APIs](https://kedro.readthedocs.io/en/stable/kedro.datasets.html). +Here is a full list of [supported data connectors and APIs](https://docs.kedro.org/en/stable/kedro_datasets.html). ## How can I create my own `AbstractDataSet` implementation? diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index c25c03b18..cbf4324fa 100644 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,18 +1,49 @@ -# Upcoming Release: +# Upcoming Release +## Major features and improvements + +## Bug fixes and other changes + +# Release 1.5.3: +## Bug fixes and other changes +* Made `databricks.ManagedTableDataSet` read-only by default. + * The user needs to specify `write_mode` to allow `save` on the data set. +* Fixed an issue in `api.APIDataSet` where the sent data was doubly converted to a JSON + string (once by us and once by the `requests` library). +* Fixed problematic `kedro-datasets` optional dependencies and reverted to `setup.py`. + +## Community contributions +# Release 1.5.2: + +## Bug fixes and other changes +* Fixed problematic `kedro-datasets` optional dependencies. + +# Release 1.5.1: + +## Bug fixes and other changes +* Fixed problematic docstrings in `pandas.DeltaTableDataSet` causing Read the Docs builds on Kedro to fail. + +# Release 1.5.0 ## Major features and improvements -* Added automatic inference of file format for `pillow.ImageDataSet` to be passed to `save()` +* Implemented lazy loading of dataset subpackages and classes. + * Suppose that SQLAlchemy, a Python SQL toolkit, is installed in your Python environment. With this change, the SQLAlchemy library will not be loaded (for `pandas.SQLQueryDataSet` or `pandas.SQLTableDataSet`) if you load a different pandas dataset (e.g. `pandas.CSVDataSet`). +* Added automatic inference of file format for `pillow.ImageDataSet` to be passed to `save()`. +* Added `pandas.DeltaTableDataSet`. ## Bug fixes and other changes +* Improved error messages for missing dataset dependencies. + * Suppose that SQLAlchemy, a Python SQL toolkit, is not installed in your Python environment. Previously, `from kedro_datasets.pandas import SQLQueryDataSet` or `from kedro_datasets.pandas import SQLTableDataSet` would result in `ImportError: cannot import name 'SQLTableDataSet' from 'kedro_datasets.pandas'`. Now, the same imports raise the more helpful and intuitive `ModuleNotFoundError: No module named 'sqlalchemy'`. ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: * [Daniel-Falk](https://github.com/daniel-falk) +* [afaqueahmad7117](https://github.com/afaqueahmad7117) +* [everdark](https://github.com/everdark) # Release 1.4.2 ## Bug fixes and other changes -* Fixed documentations of `GeoJSONDataSet` and `SparkStreamingDataSet` +* Fixed documentation of `GeoJSONDataSet` and `SparkStreamingDataSet`. * Fixed problematic docstrings causing Read the Docs builds on Kedro to fail. # Release 1.4.1: @@ -27,21 +58,22 @@ Many thanks to the following Kedroids for contributing PRs to this release: ## Bug fixes and other changes * Fixed problematic docstrings of `APIDataSet`. + # Release 1.3.0: ## Major features and improvements * Added pandas 2.0 support. * Added SQLAlchemy 2.0 support (and dropped support for versions below 1.4). -* Added a save method to the APIDataSet +* Added a save method to `APIDataSet`. * Reduced constructor arguments for `APIDataSet` by replacing most arguments with a single constructor argument `load_args`.
This makes it more consistent with other Kedro DataSets and the underlying `requests` API, and automatically enables the full configuration domain: stream, certificates, proxies, and more. -* Relaxed Kedro version pin to `>=0.16` +* Relaxed Kedro version pin to `>=0.16`. * Added `metadata` attribute to all existing datasets. This is ignored by Kedro, but may be consumed by users or external plugins. * Added `ManagedTableDataSet` for managed delta tables on Databricks. ## Bug fixes and other changes * Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x. * Upgraded required `polars` version to 0.17. -* Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in kedro-datasets. +* Renamed `TensorFlowModelDataset` to `TensorFlowModelDataSet` to be consistent with all other plugins in Kedro-Datasets. ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: @@ -102,11 +134,11 @@ Datasets are Kedro’s way of dealing with input and output in a data and machin The datasets have always been part of the core Kedro Framework project inside `kedro.extras`. In Kedro `0.19.0`, we will remove datasets from Kedro to reduce breaking changes associated with dataset dependencies. Instead, users will need to use the datasets from the `kedro-datasets` repository instead. ## Major features and improvements -* Changed `pandas.ParquetDataSet` to load data using pandas instead of parquet +* Changed `pandas.ParquetDataSet` to load data using pandas instead of parquet. # Release 0.1.0: -The initial release of `kedro-datasets`. +The initial release of Kedro-Datasets. ## Thanks to our main contributors diff --git a/kedro-datasets/docs/source/_templates/autosummary/base.rst b/kedro-datasets/docs/source/_templates/autosummary/base.rst new file mode 100644 index 000000000..b7556ebf7 --- /dev/null +++ b/kedro-datasets/docs/source/_templates/autosummary/base.rst @@ -0,0 +1,5 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. auto{{ objtype }}:: {{ objname }} diff --git a/kedro-datasets/docs/source/_templates/autosummary/class.rst b/kedro-datasets/docs/source/_templates/autosummary/class.rst new file mode 100644 index 000000000..10c8ff8be --- /dev/null +++ b/kedro-datasets/docs/source/_templates/autosummary/class.rst @@ -0,0 +1,32 @@ +{{ fullname | escape | underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} + :members: + :undoc-members: + :inherited-members: + + {% block attributes %} + {% if attributes %} + .. rubric:: Attributes + + .. autosummary:: + {% for item in attributes %} + ~{{ name }}.{{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block methods %} + {% if methods %} + .. rubric:: Methods + + .. autosummary:: + {% for item in all_methods %} + {%- if not item.startswith('_') %} + ~{{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + {% endif %} + {% endblock %} diff --git a/kedro-datasets/docs/source/_templates/autosummary/module.rst b/kedro-datasets/docs/source/_templates/autosummary/module.rst new file mode 100644 index 000000000..a496ca3f5 --- /dev/null +++ b/kedro-datasets/docs/source/_templates/autosummary/module.rst @@ -0,0 +1,56 @@ +{{ fullname | escape | underline }} + +.. rubric:: Description + +.. automodule:: {{ fullname }} + + {% block functions %} + {% if functions %} + .. rubric:: Functions + + .. 
autosummary:: + :toctree: + {% for item in functions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block classes %} + {% if classes %} + .. rubric:: Classes + + .. autosummary:: + :toctree: + :template: autosummary/class.rst + {% for item in classes %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + + {% block exceptions %} + {% if exceptions %} + .. rubric:: Exceptions + + .. autosummary:: + :toctree: + :template: autosummary/class.rst + {% for item in exceptions %} + {{ item }} + {%- endfor %} + {% endif %} + {% endblock %} + +{% block modules %} +{% if modules %} +.. rubric:: Modules + +.. autosummary:: + :toctree: + :recursive: +{% for item in modules %} + {{ item }} +{%- endfor %} +{% endif %} +{% endblock %} diff --git a/kedro-datasets/docs/source/_templates/breadcrumbs.html b/kedro-datasets/docs/source/_templates/breadcrumbs.html new file mode 100644 index 000000000..49fa4779f --- /dev/null +++ b/kedro-datasets/docs/source/_templates/breadcrumbs.html @@ -0,0 +1,94 @@ +{# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #} + +{% if page_source_suffix %} +{% set suffix = page_source_suffix %} +{% else %} +{% set suffix = source_suffix %} +{% endif %} + +{# modification to enable custom github_url #} + +{% if meta is not defined or meta is none %} + {% set meta = {} %} +{% endif %} + +{% if github_url is defined %} + {% set _dummy = meta.update({'github_url': github_url}) %} +{% endif %} + +{# // modification to enable custom github_url #} + +{% if meta is defined and meta is not none %} +{% set check_meta = True %} +{% else %} +{% set check_meta = False %} +{% endif %} + +{% if check_meta and 'github_url' in meta %} +{% set display_github = True %} +{% endif %} + +{% if check_meta and 'bitbucket_url' in meta %} +{% set display_bitbucket = True %} +{% endif %} + +{% if check_meta and 'gitlab_url' in meta %} +{% set display_gitlab = True %} +{% endif %} + +
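The breadcrumbs override above lets an individual page force its own "Edit on GitHub" target by injecting `github_url` into the page `meta`. A minimal sketch of how a page could opt in through Sphinx file-wide metadata (the URL is illustrative):

```rst
:github_url: https://github.com/kedro-org/kedro-plugins/blob/main/kedro-datasets/docs/source/index.rst

API documentation
=================
```

Pages without this field fall back to the repository-wide `github_url` defined in `html_context` in `conf.py`.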
diff --git a/kedro-datasets/docs/source/_templates/layout.html b/kedro-datasets/docs/source/_templates/layout.html new file mode 100644 index 000000000..ecdde06f1 --- /dev/null +++ b/kedro-datasets/docs/source/_templates/layout.html @@ -0,0 +1,8 @@ +{% extends "!layout.html" %} + +{%- block extrahead %} + +{% endblock %} diff --git a/kedro-datasets/docs/source/conf.py b/kedro-datasets/docs/source/conf.py new file mode 100644 index 000000000..4b231efe9 --- /dev/null +++ b/kedro-datasets/docs/source/conf.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python3 +# +# Kedro documentation build configuration file, created by +# sphinx-quickstart on Mon Dec 18 11:31:24 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. +from __future__ import annotations + +import importlib +import os +import re +import sys +from inspect import getmembers, isclass, isfunction +from pathlib import Path + +from click import secho, style +from kedro import __version__ as release + +# -- Project information ----------------------------------------------------- + +project = "kedro-datasets" +author = "kedro" + +# The short X.Y version. +version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) + + +# -- General configuration --------------------------------------------------- +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", + "sphinx.ext.doctest", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx_copybutton", + "myst_parser", + "notfound.extension", +] + +# enable autosummary plugin (table of contents for modules/classes/class +# methods) +autosummary_generate = True +autosummary_generate_overwrite = False +napoleon_include_init_with_doc = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path . 
+exclude_patterns = [ + "**.ipynb_checkpoints", + "_templates", + "modules.rst", + "source", + "kedro_docs_style_guide.md", +] + + +type_targets = { + "py:class": ( + "kedro.io.core.AbstractDataSet", + "kedro.io.AbstractDataSet", + "kedro.io.core.Version", + "requests.auth.AuthBase", + "google.oauth2.credentials.Credentials", + "deltalake.table.Metadata", + "DataCatalog" + ), + "py:data": ( + "typing.Any", + "typing.Union", + "typing.Optional", + "typing.Tuple", + ), + "py:exc": ( + "DataSetError", + "DatasetError", + ), +} +# https://stackoverflow.com/questions/61770698/sphinx-nit-picky-mode-but-only-for-links-i-explicitly-wrote +nitpick_ignore = [(key, value) for key in type_targets for value in type_targets[key]] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" +here = Path(__file__).parent.absolute() + +# Theme options are theme-specific and customise the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +html_theme_options = {"collapse_navigation": False, "style_external_links": True} + +# Remove the copyright footer from all docs. +html_show_copyright = False + +# Retry before reporting a link as broken (fix for "too many requests") +linkcheck_retries = 5 +linkcheck_rate_limit_timeout = 2.0 + +html_context = { + "display_github": True, + "github_url": "https://github.com/kedro-org/kedro-plugins/tree/main/kedro-datasets/docs/source", +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +html_show_sourcelink = False + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = "Kedrodoc" + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [(master_doc, "Kedro.tex", "Kedro Documentation", "Kedro", "manual")] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, "kedro", "Kedro Documentation", [author], 1)] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "Kedro", + "Kedro Documentation", + author, + "Kedro", + "Kedro is a Python framework for creating reproducible, maintainable and modular data science code.", + "Data-Science", + ) +] + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +# -- Kedro specific configuration ----------------------------------------- +KEDRO_MODULES = [ + "kedro_datasets", +] + + +def get_classes(module): + importlib.import_module(module) + return [obj[0] for obj in getmembers(sys.modules[module], lambda obj: isclass(obj))] + + +def get_functions(module): + importlib.import_module(module) + return [ + obj[0] for obj in getmembers(sys.modules[module], lambda obj: isfunction(obj)) + ] + + +def remove_arrows_in_examples(lines): + for i, line in enumerate(lines): + lines[i] = line.replace(">>>", "") + + +def autolink_replacements(what: str) -> list[tuple[str, str, str]]: + """ + Create a list containing replacement tuples of the form: + (``regex``, ``replacement``, ``obj``) for all classes and methods which are + imported in ``KEDRO_MODULES`` ``__init__.py`` files. The ``replacement`` + is a reStructuredText link to their documentation. + + For example, if the docstring reads: + This LambdaDataSet loads and saves ... + + Then the word ``LambdaDataSet``, will be replaced by + :class:`~kedro.io.LambdaDataSet` + + Works for plural as well, e.g: + These ``LambdaDataSet``s load and save + + Will convert to: + These :class:`kedro.io.LambdaDataSet` load and save + + Args: + what: The objects to create replacement tuples for. Possible values + ["class", "func"]. + + Returns: + A list of tuples: (regex, replacement, obj), for all "what" objects + imported in __init__.py files of ``KEDRO_MODULES``. 
+ + """ + replacements = [] + suggestions = [] + for module in KEDRO_MODULES: + if what == "class": + objects = get_classes(module) + elif what == "func": + objects = get_functions(module) + + # Look for recognised class names/function names which are + # surrounded by double back-ticks + if what == "class": + # first do plural only for classes + replacements += [ + ( + rf"``{obj}``s", + f":{what}:`~{module}.{obj}`\\\\s", + obj, + ) + for obj in objects + ] + + # singular + replacements += [ + (rf"``{obj}``", f":{what}:`~{module}.{obj}`", obj) for obj in objects + ] + + # Look for recognised class names/function names which are NOT + # surrounded by double back-ticks, so that we can log these in the + # terminal + if what == "class": + # first do plural only for classes + suggestions += [ + (rf"(?>>" in lines[i]: + continue + + for existing, replacement, obj in suggestions: + new = re.sub(existing, rf"{replacement}", lines[i]) + if new == lines[i]: + continue + if ":rtype:" in lines[i] or ":type " in lines[i]: + continue + + if not title_printed: + secho("-" * 50 + "\n" + name + ":\n" + "-" * 50, fg="blue") + title_printed = True + + print( + "[" + + str(i) + + "] " + + re.sub(existing, r"{}".format(style(obj, fg="magenta")), lines[i]) + ) + print( + "[" + + str(i) + + "] " + + re.sub(existing, r"``{}``".format(style(obj, fg="green")), lines[i]) + ) + + if title_printed: + print("\n") + + +def autolink_classes_and_methods(lines): + for i in range(len(lines)): + if ">>>" in lines[i]: + continue + + for existing, replacement, obj in replacements: + lines[i] = re.sub(existing, rf"{replacement}", lines[i]) + + +def autodoc_process_docstring(app, what, name, obj, options, lines): + try: + # guarded method to make sure build never fails + log_suggestions(lines, name) + autolink_classes_and_methods(lines) + except Exception as e: + print( + style( + "Failed to check for class name mentions that can be " + "converted to reStructuredText links in docstring of {}. 
" + "Error is: \n{}".format(name, str(e)), + fg="red", + ) + ) + + remove_arrows_in_examples(lines) + + +def env_override(default_appid): + build_version = os.getenv("READTHEDOCS_VERSION") + + if build_version == "latest": + return os.environ["HEAP_APPID_QA"] + if build_version == "stable": + return os.environ["HEAP_APPID_PROD"] + + return default_appid # default to Development for local builds + + +def _add_jinja_filters(app): + # https://github.com/crate/crate/issues/10833 + from sphinx.builders.latex import LaTeXBuilder + from sphinx.builders.linkcheck import CheckExternalLinksBuilder + + # LaTeXBuilder is used in the PDF docs build, + # and it doesn't have attribute 'templates' + if not ( + isinstance(app.builder, (LaTeXBuilder,CheckExternalLinksBuilder)) + ): + app.builder.templates.environment.filters["env_override"] = env_override + + +def _override_permalinks_icon(app): + # https://github.com/readthedocs/sphinx_rtd_theme/issues/98#issuecomment-1503211439 + app.config.html_permalinks_icon = "¶" + + +def setup(app): + app.connect("builder-inited", _add_jinja_filters) + app.connect("builder-inited", _override_permalinks_icon) + app.connect("autodoc-process-docstring", autodoc_process_docstring) + +# (regex, restructuredText link replacement, object) list +replacements = [] + +# (regex, class/function name surrounded with back-ticks, object) list +suggestions = [] + +try: + # guarded code to make sure build never fails + replacements_f, suggestions_f = autolink_replacements("func") + replacements_c, suggestions_c = autolink_replacements("class") + replacements = replacements_f + replacements_c + suggestions = suggestions_f + suggestions_c +except Exception as e: + print( + style( + "Failed to create list of (regex, reStructuredText link " + "replacement) for class names and method names in docstrings. " + "Error is: \n{}".format(str(e)), + fg="red", + ) + ) + +user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0" + +myst_heading_anchors = 5 diff --git a/kedro-datasets/docs/source/index.rst b/kedro-datasets/docs/source/index.rst new file mode 100644 index 000000000..84decee2a --- /dev/null +++ b/kedro-datasets/docs/source/index.rst @@ -0,0 +1,22 @@ +.. Kedro documentation master file, created by + sphinx-quickstart on Mon Dec 18 11:31:24 2017. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + + +API documentation +================= + +.. autosummary:: + :toctree: + :caption: API documentation + :template: autosummary/module.rst + :recursive: + + kedro_datasets + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` diff --git a/kedro-datasets/docs/source/kedro_datasets.rst b/kedro-datasets/docs/source/kedro_datasets.rst new file mode 100644 index 000000000..18bff8808 --- /dev/null +++ b/kedro-datasets/docs/source/kedro_datasets.rst @@ -0,0 +1,57 @@ +kedro_datasets +============== + +.. rubric:: Description + +.. automodule:: kedro_datasets + +.. rubric:: Classes + +.. 
autosummary:: + :toctree: + :template: autosummary/class.rst + + kedro_datasets.api.APIDataSet + kedro_datasets.biosequence.BioSequenceDataSet + kedro_datasets.dask.ParquetDataSet + kedro_datasets.databricks.ManagedTableDataSet + kedro_datasets.email.EmailMessageDataSet + kedro_datasets.geopandas.GeoJSONDataSet + kedro_datasets.holoviews.HoloviewsWriter + kedro_datasets.json.JSONDataSet + kedro_datasets.matplotlib.MatplotlibWriter + kedro_datasets.networkx.GMLDataSet + kedro_datasets.networkx.GraphMLDataSet + kedro_datasets.networkx.JSONDataSet + kedro_datasets.pandas.CSVDataSet + kedro_datasets.pandas.DeltaTableDataSet + kedro_datasets.pandas.ExcelDataSet + kedro_datasets.pandas.FeatherDataSet + kedro_datasets.pandas.GBQQueryDataSet + kedro_datasets.pandas.GBQTableDataSet + kedro_datasets.pandas.GenericDataSet + kedro_datasets.pandas.HDFDataSet + kedro_datasets.pandas.JSONDataSet + kedro_datasets.pandas.ParquetDataSet + kedro_datasets.pandas.SQLQueryDataSet + kedro_datasets.pandas.SQLTableDataSet + kedro_datasets.pandas.XMLDataSet + kedro_datasets.pickle.PickleDataSet + kedro_datasets.pillow.ImageDataSet + kedro_datasets.plotly.JSONDataSet + kedro_datasets.plotly.PlotlyDataSet + kedro_datasets.polars.CSVDataSet + kedro_datasets.redis.PickleDataSet + kedro_datasets.snowflake.SnowparkTableDataSet + kedro_datasets.spark.DeltaTableDataSet + kedro_datasets.spark.SparkDataSet + kedro_datasets.spark.SparkHiveDataSet + kedro_datasets.spark.SparkJDBCDataSet + kedro_datasets.spark.SparkStreamingDataSet + kedro_datasets.svmlight.SVMLightDataSet + kedro_datasets.tensorflow.TensorFlowModelDataSet + kedro_datasets.text.TextDataSet + kedro_datasets.tracking.JSONDataSet + kedro_datasets.tracking.MetricsDataSet + kedro_datasets.video.VideoDataSet + kedro_datasets.yaml.YAMLDataSet diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py index 77031e96c..3387d78e0 100644 --- a/kedro-datasets/kedro_datasets/__init__.py +++ b/kedro-datasets/kedro_datasets/__init__.py @@ -1,3 +1,3 @@ """``kedro_datasets`` is where you can find all of Kedro's data connectors.""" -__version__ = "1.4.2" +__version__ = "1.5.3" diff --git a/kedro-datasets/kedro_datasets/api/__init__.py b/kedro-datasets/kedro_datasets/api/__init__.py index ccd799b2c..5910d7916 100644 --- a/kedro-datasets/kedro_datasets/api/__init__.py +++ b/kedro-datasets/kedro_datasets/api/__init__.py @@ -2,10 +2,13 @@ and returns them into either as string or json Dict. 
diff --git a/kedro-datasets/kedro_datasets/api/api_dataset.py b/kedro-datasets/kedro_datasets/api/api_dataset.py
index 8414a23a9..a69281aaf 100644
--- a/kedro-datasets/kedro_datasets/api/api_dataset.py
+++ b/kedro-datasets/kedro_datasets/api/api_dataset.py
@@ -207,9 +207,9 @@ def _execute_save_with_chunks(
 
     def _execute_save_request(self, json_data: Any) -> requests.Response:
         try:
-            json_.loads(json_data)
+            self._request_args["json"] = json_.loads(json_data)
         except TypeError:
-            self._request_args["json"] = json_.dumps(json_data)
+            self._request_args["json"] = json_data
         try:
             response = requests.request(**self._request_args)
             response.raise_for_status()
diff --git a/kedro-datasets/kedro_datasets/biosequence/__init__.py b/kedro-datasets/kedro_datasets/biosequence/__init__.py
index 9f2f1a2a2..d245f23ab 100644
--- a/kedro-datasets/kedro_datasets/biosequence/__init__.py
+++ b/kedro-datasets/kedro_datasets/biosequence/__init__.py
@@ -1,8 +1,11 @@
 """``AbstractDataSet`` implementation to read/write from/to a sequence file."""
+from typing import Any
 
-__all__ = ["BioSequenceDataSet"]
+import lazy_loader as lazy
 
-from contextlib import suppress
+# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
+BioSequenceDataSet: Any
 
-with suppress(ImportError):
-    from .biosequence_dataset import BioSequenceDataSet
+__getattr__, __dir__, __all__ = lazy.attach(
+    __name__, submod_attrs={"biosequence_dataset": ["BioSequenceDataSet"]}
+)
diff --git a/kedro-datasets/kedro_datasets/dask/__init__.py b/kedro-datasets/kedro_datasets/dask/__init__.py
index d93bf4c63..cd8d04120 100644
--- a/kedro-datasets/kedro_datasets/dask/__init__.py
+++ b/kedro-datasets/kedro_datasets/dask/__init__.py
@@ -1,8 +1,11 @@
 """Provides I/O modules using dask dataframe."""
+from typing import Any
 
-__all__ = ["ParquetDataSet"]
+import lazy_loader as lazy
 
-from contextlib import suppress
+# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
+ParquetDataSet: Any
 
-with suppress(ImportError):
-    from .parquet_dataset import ParquetDataSet
+__getattr__, __dir__, __all__ = lazy.attach(
    __name__, submod_attrs={"parquet_dataset": ["ParquetDataSet"]}
+)
diff --git a/kedro-datasets/kedro_datasets/databricks/__init__.py b/kedro-datasets/kedro_datasets/databricks/__init__.py
index d416ac291..c42ce4502 100644
--- a/kedro-datasets/kedro_datasets/databricks/__init__.py
+++ b/kedro-datasets/kedro_datasets/databricks/__init__.py
@@ -1,8 +1,11 @@
 """Provides interface to Unity Catalog Tables."""
+from typing import Any
 
-__all__ = ["ManagedTableDataSet"]
+import lazy_loader as lazy
 
-from contextlib import suppress
+# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
+ManagedTableDataSet: Any
 
-with suppress(ImportError):
-    from .managed_table_dataset import ManagedTableDataSet
+__getattr__, __dir__, __all__ = lazy.attach(
+    __name__, submod_attrs={"managed_table_dataset": ["ManagedTableDataSet"]}
+)
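Note that the `api_dataset.py` hunk above is a behaviour fix rather than a refactor: the old code parsed the payload and discarded the result, then double-encoded non-string data with `json_.dumps`, which `requests` would serialise a second time. A standalone sketch of the corrected logic (`prepare_json_payload` is an illustrative helper, not the dataset's API):

```python
import json


def prepare_json_payload(json_data):
    """Return the object to pass as requests' `json=` argument.

    JSON strings are parsed so they are sent as structured JSON rather
    than as one quoted string; objects json.loads cannot take (dicts,
    lists, ...) raise TypeError and are passed through unchanged, so
    requests serialises them exactly once.
    """
    try:
        return json.loads(json_data)
    except TypeError:
        return json_data


assert prepare_json_payload('{"a": 1}') == {"a": 1}  # string: parsed
assert prepare_json_payload({"a": 1}) == {"a": 1}  # dict: passed through
```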
diff --git a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py
index e9a43650f..03ec2e097 100644
--- a/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py
+++ b/kedro-datasets/kedro_datasets/databricks/managed_table_dataset.py
@@ -29,7 +29,7 @@ class ManagedTable:
     database: str
     catalog: Optional[str]
     table: str
-    write_mode: str
+    write_mode: Union[str, None]
     dataframe_type: str
     primary_key: Optional[str]
     owner_group: str
@@ -82,7 +82,10 @@ def _validate_write_mode(self):
         Raises:
             DataSetError: If an invalid `write_mode` is passed.
         """
-        if self.write_mode not in self._VALID_WRITE_MODES:
+        if (
+            self.write_mode is not None
+            and self.write_mode not in self._VALID_WRITE_MODES
+        ):
             valid_modes = ", ".join(self._VALID_WRITE_MODES)
             raise DataSetError(
                 f"Invalid `write_mode` provided: {self.write_mode}. "
@@ -196,7 +199,7 @@ def __init__(  # pylint: disable=R0913
         table: str,
         catalog: str = None,
         database: str = "default",
-        write_mode: str = "overwrite",
+        write_mode: Union[str, None] = None,
         dataframe_type: str = "spark",
         primary_key: Optional[Union[str, List[str]]] = None,
         version: Version = None,
@@ -215,10 +218,11 @@ def __init__(  # pylint: disable=R0913
                 Defaults to None.
             database: the name of the database.
                 (also referred to as schema). Defaults to "default".
-            write_mode: the mode to write the data into the table.
+            write_mode: the mode to write the data into the table. If not
+                present, the data set is read-only.
                 Options are:["overwrite", "append", "upsert"].
                 "upsert" mode requires primary_key field to be populated.
-                Defaults to "overwrite".
+                Defaults to None.
             dataframe_type: "pandas" or "spark" dataframe.
                 Defaults to "spark".
             primary_key: the primary key of the table.
@@ -365,6 +369,11 @@ def _save(self, data: Union[DataFrame, pd.DataFrame]) -> None:
         Args:
             data (Any): Spark or pandas dataframe to save to the table location
         """
+        if self._table.write_mode is None:
+            raise DataSetError(
+                "'save' cannot be used in read-only mode. "
+                "Change 'write_mode' value to `overwrite`, `upsert` or `append`."
+            )
         # filter columns specified in schema and match their ordering
        if self._table.schema():
             cols = self._table.schema().fieldNames()
diff --git a/kedro-datasets/kedro_datasets/email/__init__.py b/kedro-datasets/kedro_datasets/email/__init__.py
index 97aa7a345..c96654080 100644
--- a/kedro-datasets/kedro_datasets/email/__init__.py
+++ b/kedro-datasets/kedro_datasets/email/__init__.py
@@ -1,8 +1,11 @@
 """``AbstractDataSet`` implementations for managing email messages."""
+from typing import Any
 
-__all__ = ["EmailMessageDataSet"]
+import lazy_loader as lazy
 
-from contextlib import suppress
+# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
+EmailMessageDataSet: Any
 
-with suppress(ImportError):
-    from .message_dataset import EmailMessageDataSet
+__getattr__, __dir__, __all__ = lazy.attach(
+    __name__, submod_attrs={"message_dataset": ["EmailMessageDataSet"]}
+)
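Because `ManagedTableDataSet` only runs against a Databricks/Spark session, the new read-only default is easiest to show with a stand-in class. A minimal sketch of the contract the hunks above introduce (`ReadOnlyByDefault` is illustrative, not the real dataset):

```python
from typing import Optional

_VALID_WRITE_MODES = ("overwrite", "append", "upsert")


class ReadOnlyByDefault:
    """Stand-in mirroring ManagedTableDataSet's new write_mode contract."""

    def __init__(self, write_mode: Optional[str] = None) -> None:
        # None now means "read-only"; anything else must be a valid mode.
        if write_mode is not None and write_mode not in _VALID_WRITE_MODES:
            raise ValueError(f"Invalid `write_mode` provided: {write_mode}")
        self.write_mode = write_mode

    def save(self, data) -> None:
        if self.write_mode is None:
            raise RuntimeError("'save' cannot be used in read-only mode.")
        print(f"saving with mode={self.write_mode}")


try:
    ReadOnlyByDefault().save([])  # read-only by default now
except RuntimeError as err:
    print(err)

ReadOnlyByDefault(write_mode="append").save([])  # explicit mode still works
```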
-""" -__all__ = ["GeoJSONDataSet"] +"""``GeoJSONDataSet`` is an ``AbstractVersionedDataSet`` to save and load GeoJSON files.""" +from typing import Any -from contextlib import suppress +import lazy_loader as lazy -with suppress(ImportError): - from .geojson_dataset import GeoJSONDataSet +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +GeoJSONDataSet: Any + +__getattr__, __dir__, __all__ = lazy.attach( + __name__, submod_attrs={"geojson_dataset": ["GeoJSONDataSet"]} +) diff --git a/kedro-datasets/kedro_datasets/holoviews/__init__.py b/kedro-datasets/kedro_datasets/holoviews/__init__.py index c97bd72a6..03731d2e2 100644 --- a/kedro-datasets/kedro_datasets/holoviews/__init__.py +++ b/kedro-datasets/kedro_datasets/holoviews/__init__.py @@ -1,8 +1,11 @@ """``AbstractDataSet`` implementation to save Holoviews objects as image files.""" +from typing import Any -__all__ = ["HoloviewsWriter"] +import lazy_loader as lazy -from contextlib import suppress +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +HoloviewsWriter: Any -with suppress(ImportError): - from .holoviews_writer import HoloviewsWriter +__getattr__, __dir__, __all__ = lazy.attach( + __name__, submod_attrs={"holoviews_writer": ["HoloviewsWriter"]} +) diff --git a/kedro-datasets/kedro_datasets/json/__init__.py b/kedro-datasets/kedro_datasets/json/__init__.py index 5f023b35f..f9d1f606a 100644 --- a/kedro-datasets/kedro_datasets/json/__init__.py +++ b/kedro-datasets/kedro_datasets/json/__init__.py @@ -1,8 +1,11 @@ """``AbstractDataSet`` implementation to load/save data from/to a JSON file.""" +from typing import Any -__all__ = ["JSONDataSet"] +import lazy_loader as lazy -from contextlib import suppress +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +JSONDataSet: Any -with suppress(ImportError): - from .json_dataset import JSONDataSet +__getattr__, __dir__, __all__ = lazy.attach( + __name__, submod_attrs={"json_dataset": ["JSONDataSet"]} +) diff --git a/kedro-datasets/kedro_datasets/matplotlib/__init__.py b/kedro-datasets/kedro_datasets/matplotlib/__init__.py index ee2bc0646..14d2641f2 100644 --- a/kedro-datasets/kedro_datasets/matplotlib/__init__.py +++ b/kedro-datasets/kedro_datasets/matplotlib/__init__.py @@ -1,8 +1,10 @@ """``AbstractDataSet`` implementation to save matplotlib objects as image files.""" +from typing import Any -__all__ = ["MatplotlibWriter"] +import lazy_loader as lazy -from contextlib import suppress +MatplotlibWriter: Any -with suppress(ImportError): - from .matplotlib_writer import MatplotlibWriter +__getattr__, __dir__, __all__ = lazy.attach( + __name__, submod_attrs={"matplotlib_writer": ["MatplotlibWriter"]} +) diff --git a/kedro-datasets/kedro_datasets/networkx/__init__.py b/kedro-datasets/kedro_datasets/networkx/__init__.py index 73674c81f..6349a4dac 100644 --- a/kedro-datasets/kedro_datasets/networkx/__init__.py +++ b/kedro-datasets/kedro_datasets/networkx/__init__.py @@ -1,15 +1,19 @@ -"""``AbstractDataSet`` implementation to save and load NetworkX graphs in JSON -, GraphML and GML formats using ``NetworkX``.""" +"""``AbstractDataSet`` implementation to save and load NetworkX graphs in JSON, +GraphML and GML formats using ``NetworkX``.""" +from typing import Any -__all__ = ["GMLDataSet", "GraphMLDataSet", "JSONDataSet"] +import lazy_loader as lazy -from contextlib import suppress +# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 +GMLDataSet: Any +GraphMLDataSet: Any +JSONDataSet: Any -with 
diff --git a/kedro-datasets/kedro_datasets/networkx/__init__.py b/kedro-datasets/kedro_datasets/networkx/__init__.py
index 73674c81f..6349a4dac 100644
--- a/kedro-datasets/kedro_datasets/networkx/__init__.py
+++ b/kedro-datasets/kedro_datasets/networkx/__init__.py
@@ -1,15 +1,19 @@
-"""``AbstractDataSet`` implementation to save and load NetworkX graphs in JSON
-, GraphML and GML formats using ``NetworkX``."""
+"""``AbstractDataSet`` implementation to save and load NetworkX graphs in JSON,
+GraphML and GML formats using ``NetworkX``."""
+from typing import Any
 
-__all__ = ["GMLDataSet", "GraphMLDataSet", "JSONDataSet"]
+import lazy_loader as lazy
 
-from contextlib import suppress
+# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
+GMLDataSet: Any
+GraphMLDataSet: Any
+JSONDataSet: Any
 
-with suppress(ImportError):
-    from .gml_dataset import GMLDataSet
-
-with suppress(ImportError):
-    from .graphml_dataset import GraphMLDataSet
-
-with suppress(ImportError):
-    from .json_dataset import JSONDataSet
+__getattr__, __dir__, __all__ = lazy.attach(
+    __name__,
+    submod_attrs={
+        "gml_dataset": ["GMLDataSet"],
+        "graphml_dataset": ["GraphMLDataSet"],
+        "json_dataset": ["JSONDataSet"],
+    },
+)
diff --git a/kedro-datasets/kedro_datasets/pandas/__init__.py b/kedro-datasets/kedro_datasets/pandas/__init__.py
index 2ac29379a..f01c79536 100644
--- a/kedro-datasets/kedro_datasets/pandas/__init__.py
+++ b/kedro-datasets/kedro_datasets/pandas/__init__.py
@@ -1,42 +1,36 @@
 """``AbstractDataSet`` implementations that produce pandas DataFrames."""
+from typing import Any
 
-__all__ = [
-    "CSVDataSet",
-    "DeltaTableDataSet",
-    "ExcelDataSet",
-    "FeatherDataSet",
-    "GBQTableDataSet",
-    "GBQQueryDataSet",
-    "HDFDataSet",
-    "JSONDataSet",
-    "ParquetDataSet",
-    "SQLQueryDataSet",
-    "SQLTableDataSet",
-    "XMLDataSet",
-    "GenericDataSet",
-]
+import lazy_loader as lazy
 
-from contextlib import suppress
+# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
+CSVDataSet: Any
+DeltaTableDataSet: Any
+ExcelDataSet: Any
+FeatherDataSet: Any
+GBQQueryDataSet: Any
+GBQTableDataSet: Any
+GenericDataSet: Any
+HDFDataSet: Any
+JSONDataSet: Any
+ParquetDataSet: Any
+SQLQueryDataSet: Any
+SQLTableDataSet: Any
+XMLDataSet: Any
 
-with suppress(ImportError):
-    from .csv_dataset import CSVDataSet
-with suppress(ImportError):
-    from .deltatable_dataset import DeltaTableDataSet
-with suppress(ImportError):
-    from .excel_dataset import ExcelDataSet
-with suppress(ImportError):
-    from .feather_dataset import FeatherDataSet
-with suppress(ImportError):
-    from .gbq_dataset import GBQQueryDataSet, GBQTableDataSet
-with suppress(ImportError):
-    from .hdf_dataset import HDFDataSet
-with suppress(ImportError):
-    from .json_dataset import JSONDataSet
-with suppress(ImportError):
-    from .parquet_dataset import ParquetDataSet
-with suppress(ImportError):
-    from .sql_dataset import SQLQueryDataSet, SQLTableDataSet
-with suppress(ImportError):
-    from .xml_dataset import XMLDataSet
-with suppress(ImportError):
-    from .generic_dataset import GenericDataSet
+__getattr__, __dir__, __all__ = lazy.attach(
+    __name__,
+    submod_attrs={
+        "csv_dataset": ["CSVDataSet"],
+        "deltatable_dataset": ["DeltaTableDataSet"],
+        "excel_dataset": ["ExcelDataSet"],
+        "feather_dataset": ["FeatherDataSet"],
+        "gbq_dataset": ["GBQQueryDataSet", "GBQTableDataSet"],
+        "generic_dataset": ["GenericDataSet"],
+        "hdf_dataset": ["HDFDataSet"],
+        "json_dataset": ["JSONDataSet"],
+        "parquet_dataset": ["ParquetDataSet"],
+        "sql_dataset": ["SQLQueryDataSet", "SQLTableDataSet"],
+        "xml_dataset": ["XMLDataSet"],
+    },
+)
diff --git a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py
index 16eefca25..23a278d45 100644
--- a/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py
+++ b/kedro-datasets/kedro_datasets/pandas/deltatable_dataset.py
@@ -21,7 +21,9 @@ class DeltaTableDataSet(AbstractDataSet):  # pylint:disable=too-many-instance-attributes
     mode=overwrite together with partition_filters. This will remove all files
     within the matching partition and insert your data as new files.
 
-    Example usage for the `YAML API`_:
+    Example usage for the
+    `YAML API