From fb018325ceef953acd24f552f1092ec7f88fc6d8 Mon Sep 17 00:00:00 2001 From: David Wood Date: Tue, 22 Oct 2024 11:59:28 -0400 Subject: [PATCH] add license/copyright as appropriate to .py files and add check-licensing.sh script (#715) * add license/copyright as appropriate to .py files and add check-licensing.sh script Signed-off-by: David Wood * add check-licensing.sh to test-misc.yml workflow Signed-off-by: David Wood --------- Signed-off-by: David Wood --- .github/mkdocs_hook.py | 12 +++ .github/workflows/test-misc.yml | 7 ++ .../test/dpk_connector/core/test_crawler.py | 13 +++- .../dpk_connector/core/test_middlewares.py | 12 +++ .../dpk_connector/core/test_sitemap_spider.py | 20 +++-- .../test/dpk_connector/core/test_utils.py | 16 +++- .../data_access/data_access_factory_test.py | 12 +++ .../src/data_processing/utils/multilock.py | 12 +++ .../data_processing/utils/unrecoverable.py | 13 ++++ .../data_access/daf_local_test.py | 12 +++ kfp/kfp_ray_components/src/subworkflow.py | 12 +++ .../pipeline_utils/pipelines_tests_utils.py | 12 +++ .../compile_utils/component.py | 12 +++ .../pipeline_utils/pipelines_tests_utils.py | 12 +++ .../single-pipeline/pipeline_generator.py | 13 ++++ .../superpipeline/super_pipeline_generator.py | 12 +++ .../kfp_v1/superworkflow_code_sample_wf.py | 26 ++++++- .../kfp_v1/superworkflow_dedups_sample_wf.py | 12 +++ .../kfp_v2/superpipeline_noop_docId_v2_wf.py | 54 ++++++++++--- scripts/check-licensing.sh | 27 +++++++ .../python/src/code_quality_transform.py | 2 +- .../src/code_quality_transform_python.py | 1 + .../ray/src/code_quality_transform_ray.py | 10 +-- .../python/src/header_cleanser_transform.py | 12 +++ .../license_select/python/src/transformer.py | 12 +++ .../internal/check_languages.py | 12 +++ .../internal/repo_grouper.py | 12 +++ .../internal/repo_level_wrappers.py | 12 +++ .../semantic_ordering/build_dep_graph.py | 12 +++ .../semantic_ordering/sort_by_semantic_dep.py | 12 +++ .../semantic_ordering/topological_sort.py | 12 
+++ .../sorting/semantic_ordering/utils.py | 12 +++ .../internal/store/ray_store.py | 12 +++ .../internal/store/store.py | 12 +++ .../internal/store/store_factory.py | 12 +++ .../python/src/html2parquet_transform.py | 76 +++++++++++-------- .../src/html2parquet_transform_python.py | 16 +++- .../pii_redactor/kfp_ray/pii_redactor_wf.py | 18 +++-- .../python/src/flair_recognizer.py | 11 +++ .../pii_redactor/python/src/pii_analyzer.py | 16 +++- .../pii_redactor/python/src/pii_anonymizer.py | 11 +++ .../python/src/pii_redactor_local.py | 2 - .../python/src/pii_redactor_transform.py | 12 ++- .../src/pii_redactor_transform_python.py | 1 - .../pii_redactor/python/test/test_data.py | 11 +++ .../python/test/test_pii_analyzer.py | 14 +++- .../python/test/test_pii_anonymizer.py | 16 +++- .../test_pii_redactor_redact_anonamize.py | 15 +++- .../test/test_pii_redactor_transform.py | 11 +++ 49 files changed, 628 insertions(+), 80 deletions(-) mode change 100755 => 100644 kfp/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py create mode 100644 scripts/check-licensing.sh diff --git a/.github/mkdocs_hook.py b/.github/mkdocs_hook.py index 5624f2f91..ccb059454 100644 --- a/.github/mkdocs_hook.py +++ b/.github/mkdocs_hook.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import logging import os import re diff --git a/.github/workflows/test-misc.yml b/.github/workflows/test-misc.yml index 2fa493f03..65432fb98 100644 --- a/.github/workflows/test-misc.yml +++ b/.github/workflows/test-misc.yml @@ -45,3 +45,10 @@ jobs: uses: actions/checkout@v4 - name: Make sure all transforms have a test workflow run: bash scripts/check-workflows.sh + check-licensing: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Make sure repo content includes license headers. + run: bash scripts/check-licensing.sh diff --git a/data-connector-lib/test/dpk_connector/core/test_crawler.py b/data-connector-lib/test/dpk_connector/core/test_crawler.py index 88d90293a..01adecf6e 100644 --- a/data-connector-lib/test/dpk_connector/core/test_crawler.py +++ b/data-connector-lib/test/dpk_connector/core/test_crawler.py @@ -1,5 +1,16 @@ -import pytest +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ +import pytest from dpk_connector.core.crawler import crawl diff --git a/data-connector-lib/test/dpk_connector/core/test_middlewares.py b/data-connector-lib/test/dpk_connector/core/test_middlewares.py index a2346888f..3db4d3f4f 100644 --- a/data-connector-lib/test/dpk_connector/core/test_middlewares.py +++ b/data-connector-lib/test/dpk_connector/core/test_middlewares.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import pytest from dpk_connector.core.middlewares import DelayingProtegoRobotParser from pytest_mock import MockerFixture diff --git a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py index 308c4ff89..337e67791 100644 --- a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py +++ b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + from pathlib import Path import pytest @@ -73,9 +85,7 @@ def callback(url: str, body: bytes, headers: dict): assert body.decode("utf-8") == response_body assert headers == {"Content-Type": "text/html"} - spider = ConnectorSitemapSpider.from_crawler( - crawler, seed_urls=("http://example.com",), callback=callback - ) + spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback) request = Request( "http://example.com/index.html", meta={ @@ -93,9 +103,7 @@ def callback(url: str, body: bytes, headers: dict): parsed = spider.parse(response) item = next(parsed) - assert item == ConnectorItem( - dropped=False, downloaded=True, system_request=False, sitemap=False - ) + assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False) for next_request in parsed: assert isinstance(next_request, Request) is True diff --git a/data-connector-lib/test/dpk_connector/core/test_utils.py b/data-connector-lib/test/dpk_connector/core/test_utils.py index 54f15a70d..009b37f98 100644 --- a/data-connector-lib/test/dpk_connector/core/test_utils.py +++ b/data-connector-lib/test/dpk_connector/core/test_utils.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + # Assisted by WCA@IBM # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2 @@ -20,9 +32,7 @@ def test_get_header_value(): - response = Response( - "http://example.com", headers={"Content-Type": "application/json"} - ) + response = Response("http://example.com", headers={"Content-Type": "application/json"}) assert get_header_value(response, "Content-Type") == "application/json" diff --git a/data-processing-lib/python/src/data_processing/test_support/data_access/data_access_factory_test.py b/data-processing-lib/python/src/data_processing/test_support/data_access/data_access_factory_test.py index 1f07e69e0..16066d8d7 100644 --- a/data-processing-lib/python/src/data_processing/test_support/data_access/data_access_factory_test.py +++ b/data-processing-lib/python/src/data_processing/test_support/data_access/data_access_factory_test.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import os import sys from argparse import ArgumentParser diff --git a/data-processing-lib/python/src/data_processing/utils/multilock.py b/data-processing-lib/python/src/data_processing/utils/multilock.py index b6f7d942a..dca123f72 100644 --- a/data-processing-lib/python/src/data_processing/utils/multilock.py +++ b/data-processing-lib/python/src/data_processing/utils/multilock.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import abc import datetime import fcntl diff --git a/data-processing-lib/python/src/data_processing/utils/unrecoverable.py b/data-processing-lib/python/src/data_processing/utils/unrecoverable.py index c07badf0f..034a3216b 100644 --- a/data-processing-lib/python/src/data_processing/utils/unrecoverable.py +++ b/data-processing-lib/python/src/data_processing/utils/unrecoverable.py @@ -1,3 +1,16 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + + class UnrecoverableException(Exception): """ Raised when a transform wants to cancel overall execution diff --git a/data-processing-lib/python/test/data_processing_tests/data_access/daf_local_test.py b/data-processing-lib/python/test/data_processing_tests/data_access/daf_local_test.py index f5a029411..37b00fa4b 100644 --- a/data-processing-lib/python/test/data_processing_tests/data_access/daf_local_test.py +++ b/data-processing-lib/python/test/data_processing_tests/data_access/daf_local_test.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import os from data_processing.test_support.data_access import AbstractDataAccessFactoryTests diff --git a/kfp/kfp_ray_components/src/subworkflow.py b/kfp/kfp_ray_components/src/subworkflow.py index 39aabe261..78d33f655 100644 --- a/kfp/kfp_ray_components/src/subworkflow.py +++ b/kfp/kfp_ray_components/src/subworkflow.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import sys from data_processing.utils import ParamsUtils diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index de8246651..f60e1196e 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import os import sys diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py index 695fa936a..ac5e32689 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import json import os from typing import Dict diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index 7d65ea712..6b23067f9 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import os import sys diff --git a/kfp/pipeline_generator/single-pipeline/pipeline_generator.py b/kfp/pipeline_generator/single-pipeline/pipeline_generator.py index ae23eab6f..225fbaa1b 100644 --- a/kfp/pipeline_generator/single-pipeline/pipeline_generator.py +++ b/kfp/pipeline_generator/single-pipeline/pipeline_generator.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + PIPELINE_TEMPLATE_FILE = "simple_pipeline.py" @@ -14,6 +26,7 @@ if __name__ == "__main__": import argparse import os + import yaml from jinja2 import Environment, FileSystemLoader diff --git a/kfp/pipeline_generator/superpipeline/super_pipeline_generator.py b/kfp/pipeline_generator/superpipeline/super_pipeline_generator.py index 961c65cb9..27c124362 100644 --- a/kfp/pipeline_generator/superpipeline/super_pipeline_generator.py +++ b/kfp/pipeline_generator/superpipeline/super_pipeline_generator.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import yaml diff --git a/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py b/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py index d1479d794..b97c8428f 100644 --- a/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py +++ b/kfp/superworkflows/ray/kfp_v1/superworkflow_code_sample_wf.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl @@ -82,7 +94,7 @@ def sample_code_ray_orchestrator( p4_ededup_doc_column: str = "contents", p4_ededup_hash_cpu: float = 0.5, p4_ededup_use_snapshot: bool = False, - p4_ededup_snapshot_directory: str = None, # data sampling + p4_ededup_snapshot_directory: str = None, # data sampling p4_ededup_n_samples: int = 10, # overriding parameters p4_overriding_params: str = '{"ray_worker_options": {"image": "' @@ -293,13 +305,21 @@ def _set_component(op: dsl.BaseOp, displaied_name: str, prev_op: dsl.BaseOp = No # header cleanser header_cleanser = run_header_cleanser_op( - name=p1_orch_header_cleanser_name, prefix="p11_", params=args, host=orch_host, input_folder=license_check.output + name=p1_orch_header_cleanser_name, + prefix="p11_", + params=args, + host=orch_host, + input_folder=license_check.output, ) 
_set_component(header_cleanser, "header_cleanser", license_check) # tokenization tokenization = run_tokenization_op( - name=p1_orch_tokenization_wf_name, prefix="p10_", params=args, host=orch_host, input_folder=header_cleanser.output + name=p1_orch_tokenization_wf_name, + prefix="p10_", + params=args, + host=orch_host, + input_folder=header_cleanser.output, ) _set_component(tokenization, "tokenization", header_cleanser) diff --git a/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py b/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py index 8243a65b5..293469ab4 100644 --- a/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py +++ b/kfp/superworkflows/ray/kfp_v1/superworkflow_dedups_sample_wf.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl diff --git a/kfp/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py b/kfp/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py old mode 100755 new mode 100644 index 240547500..434d84ab0 --- a/kfp/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py +++ b/kfp/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py @@ -1,11 +1,25 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + from typing import Any, NamedTuple + import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl +from universal.doc_id.kfp_ray.doc_id_wf import doc_id +from universal.noop.kfp_ray.noop_wf import noop + from kfp import dsl -from universal.noop.kfp_ray.noop_wf import noop -from universal.doc_id.kfp_ray.doc_id_wf import doc_id noop_image = "quay.io/dataprep1/data-prep-kit/noop-ray:latest" doc_id_image = "quay.io/dataprep1/data-prep-kit/doc_id-ray:latest" @@ -38,7 +52,7 @@ def super_pipeline( p1_pipeline_additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', p1_pipeline_data_s3_access_secret: str = "s3-secret", p1_pipeline_runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, - p1_pipeline_runtime_actor_options: dict = {'num_cpus': 0.8}, + p1_pipeline_runtime_actor_options: dict = {"num_cpus": 0.8}, # data access p1_pipeline_data_max_files: int = -1, p1_pipeline_data_num_samples: int = -1, @@ -49,12 +63,28 @@ def super_pipeline( p2_noop_sleep_sec: int = 10, p2_ray_name: str = "noop-kfp-ray", p2_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": noop_image}, - p2_ray_worker_options: dict = 
{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "", "image": noop_image}, + p2_ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image_pull_secret": "", + "image": noop_image, + }, # Document ID step parameters p3_name: str = "doc_id", p3_ray_name: str = "docid-kfp-ray", p3_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": doc_id_image}, - p3_ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "", "image": doc_id_image}, + p3_ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image_pull_secret": "", + "image": doc_id_image, + }, # p3_skip: bool = False, # orchestrator p3_data_data_sets: str = "", @@ -70,9 +100,15 @@ def super_pipeline( transform1_prefix = "p2_" transform2_prefix = "p3_" # split the input parameters according to thier prefixes. 
- common_params = {key[len(common_params_prefix) :]: value for key, value in args.items() if key.startswith(common_params_prefix)} - task1_params = {key[len(transform1_prefix) :]: value for key, value in args.items() if key.startswith(transform1_prefix)} - task2_params = {key[len(transform2_prefix) :]: value for key, value in args.items() if key.startswith(transform2_prefix)} + common_params = { + key[len(common_params_prefix) :]: value for key, value in args.items() if key.startswith(common_params_prefix) + } + task1_params = { + key[len(transform1_prefix) :]: value for key, value in args.items() if key.startswith(transform1_prefix) + } + task2_params = { + key[len(transform2_prefix) :]: value for key, value in args.items() if key.startswith(transform2_prefix) + } # get the input path, output path of the whole pipeline, and the intermediate path for storing the files between the transforms input_path = common_params.get("input_path", "") @@ -97,7 +133,7 @@ def super_pipeline( # call the doc_id pipeline from doc_id_wf.py file with the expected parameters doc_id_task = doc_id(**pipeline_prms_to_pass) doc_id_task.after(noop_task) - + if __name__ == "__main__": # Compiling the pipeline diff --git a/scripts/check-licensing.sh b/scripts/check-licensing.sh new file mode 100644 index 000000000..d03a419bf --- /dev/null +++ b/scripts/check-licensing.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Looks in files to make sure there is some form of comment with license text +echo -n Checking for missing license text... +# Script is having trouble dealing with names with white space, so exclude the 'fine tuning' directory +#files2check=$(find . -name '*.py' -o -name '*.sh' -o -name '*.ipynb' | grep -v venv | grep -v __init__ | grep -v 'fine tuning') +# And, for now only check .py files that are not in the examples tree since there are lots otherwise. +files2check=$(find . 
-name '*.py' | grep -v venv | grep -v __init__ | grep -v examples | grep -v 'fine tuning') +files= +for file in $files2check; do + license=$(cat "$file" | grep '^#.*[lL]icense') + auto=$(cat "$file" | grep '^#.*auto') # Ignore auto-generated files + if [ -z "$license" -a -z "$auto" ]; then + if [ -z "$files" ]; then + echo the following appear to be missing license text. + fi + echo $file + files="$files $file" + fi +done +if [ ! -z "$files" ]; then + echo "To address this, add a comment header with license text (including the word 'license')." + status=1 +else + echo no files appear to be missing license text. + status=0 +fi +exit $status diff --git a/transforms/code/code_quality/python/src/code_quality_transform.py b/transforms/code/code_quality/python/src/code_quality_transform.py index 28c75675a..4defb43fe 100644 --- a/transforms/code/code_quality/python/src/code_quality_transform.py +++ b/transforms/code/code_quality/python/src/code_quality_transform.py @@ -1,3 +1,4 @@ +# (C) Copyright IBM Corp. 2024. # Licensed under the Apache License, Version 2.0 (the “License”); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -7,7 +8,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - ################################################################################ # Collection of code data specific annotations and its heuristics are borrowed from: diff --git a/transforms/code/code_quality/python/src/code_quality_transform_python.py b/transforms/code/code_quality/python/src/code_quality_transform_python.py index 57c9e6054..0dd2f7cbe 100644 --- a/transforms/code/code_quality/python/src/code_quality_transform_python.py +++ b/transforms/code/code_quality/python/src/code_quality_transform_python.py @@ -1,3 +1,4 @@ +# (C) Copyright IBM Corp. 2024. 
# Licensed under the Apache License, Version 2.0 (the “License”); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/transforms/code/code_quality/ray/src/code_quality_transform_ray.py b/transforms/code/code_quality/ray/src/code_quality_transform_ray.py index 5b63c4e14..0459b1ec0 100644 --- a/transforms/code/code_quality/ray/src/code_quality_transform_ray.py +++ b/transforms/code/code_quality/ray/src/code_quality_transform_ray.py @@ -1,3 +1,4 @@ +# (C) Copyright IBM Corp. 2024. # Licensed under the Apache License, Version 2.0 (the “License”); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -7,17 +8,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - ################################################################################ -# Collection of code data specific annotations and its heuristics are borrowed from: -# CodeParrot https://github.com/huggingface/transformers/tree/main/examples/research_projects/codeparrot#preprocessing -# BigCode Dataset https://github.com/bigcode-project/bigcode-dataset/tree/main/preprocessing -# -# Code specific heuristics like alpha numeric, char token ratio implementations & others are taken from CodeParrot and BigCode Dataset -# preprocessing scripts and modified according to data-prep-kit specific framework. 
- - import os from code_quality_transform import CodeQualityTransformConfiguration diff --git a/transforms/code/header_cleanser/python/src/header_cleanser_transform.py b/transforms/code/header_cleanser/python/src/header_cleanser_transform.py index 711171344..00fa3c892 100644 --- a/transforms/code/header_cleanser/python/src/header_cleanser_transform.py +++ b/transforms/code/header_cleanser/python/src/header_cleanser_transform.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + # Licensed under the Apache License, Version 2.0 (the “License”); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/transforms/code/license_select/python/src/transformer.py b/transforms/code/license_select/python/src/transformer.py index edddd0c0c..6003a130f 100644 --- a/transforms/code/license_select/python/src/transformer.py +++ b/transforms/code/license_select/python/src/transformer.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + from collections.abc import Callable, Iterable from typing import TypeVar diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/check_languages.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/check_languages.py index a7f437192..e016e3c3b 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/check_languages.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/check_languages.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import os import random diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_grouper.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_grouper.py index c85d84d6a..0f26e58d2 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_grouper.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_grouper.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import os from typing import List diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py index f328884ed..1e9a24993 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/repo_level_wrappers.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import logging import os import uuid diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py index a623c644d..a655589ca 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/build_dep_graph.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import logging import re from collections import defaultdict, namedtuple diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py index 55956a148..4c309dd84 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/sort_by_semantic_dep.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import time from logging import Logger diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py index fd4b2c600..064b56923 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/topological_sort.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import random import networkx as nx diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/utils.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/utils.py index 3bcf48be9..86b7f35d1 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/utils.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/sorting/semantic_ordering/utils.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import logging import os import sys diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/ray_store.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/ray_store.py index f491fdd7c..caf07b1b4 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/ray_store.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/ray_store.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import random import ray diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store.py index 8b82fed7e..32a673adc 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import os from pyarrow.fs import FileSelector, LocalFileSystem, S3FileSystem diff --git a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store_factory.py b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store_factory.py index e5a2910f1..004beba33 100644 --- a/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store_factory.py +++ b/transforms/code/repo_level_ordering/ray/src/dpk_repo_level_order/internal/store/store_factory.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + from dpk_repo_level_order.internal.store.ray_store import ( KeyedValueListActorPool, create_pool, diff --git a/transforms/language/html2parquet/python/src/html2parquet_transform.py b/transforms/language/html2parquet/python/src/html2parquet_transform.py index b85da9c30..9821a49a7 100644 --- a/transforms/language/html2parquet/python/src/html2parquet_transform.py +++ b/transforms/language/html2parquet/python/src/html2parquet_transform.py @@ -1,13 +1,28 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import enum +import io import time -from argparse import ArgumentParser, Namespace -from typing import Any import zipfile -import io -import trafilatura +from argparse import ArgumentParser, Namespace from datetime import datetime +from typing import Any import pyarrow as pa +import trafilatura +from data_processing.transform import AbstractBinaryTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger + # disabled for now # from data_processing_ray.runtime.ray import RayTransformLauncher @@ -17,9 +32,6 @@ # import data_processing -from data_processing.transform import AbstractBinaryTransform, TransformConfiguration -from data_processing.utils import CLIArgumentProvider, get_logger, TransformUtils - class Html2ParquetTransform(AbstractBinaryTransform): @@ -31,18 +43,18 @@ def __init__(self, config: dict[str, Any]): self.favor_recall = config.get(html2parquet_favor_recall_key, html2parquet_favor_recall.TRUE) if not isinstance(self.output_format, html2parquet_output_format): - self.output_format = html2parquet_output_format[self.output_format] + self.output_format = html2parquet_output_format[self.output_format] if not isinstance(self.favor_precision, html2parquet_favor_precision): self.favor_precision = html2parquet_favor_precision[self.favor_precision] if not isinstance(self.favor_recall, html2parquet_favor_recall): - self.favor_recall = html2parquet_favor_recall[self.favor_recall] + self.favor_recall = html2parquet_favor_recall[self.favor_recall] - def _convert_html2parquet(self, member_filename:str, file_name:str, content_bytes: bytes) -> dict: + def _convert_html2parquet(self, member_filename: str, file_name: str, content_bytes: bytes) -> dict: title = member_filename if member_filename else TransformUtils.get_file_basename(file_name) - + output_format_value = str(self.output_format) if output_format_value not in 
["markdown", "txt"]: raise RuntimeError(f"Unknown output_format {self.output_format}.") @@ -51,17 +63,16 @@ def _convert_html2parquet(self, member_filename:str, file_name:str, content_byte favor_precision_value = True elif self.favor_precision == html2parquet_favor_precision.FALSE: favor_precision_value = False - else: + else: raise RuntimeError(f"Unknown favor_precision {self.favor_precision}.") if self.favor_recall == html2parquet_favor_recall.TRUE: favor_recall_value = True elif self.favor_recall == html2parquet_favor_recall.FALSE: favor_recall_value = False - else: + else: raise RuntimeError(f"Unknown favor_recall {self.favor_recall}.") - # Use Trafilatura library content_string = trafilatura.extract( content_bytes, @@ -71,10 +82,9 @@ def _convert_html2parquet(self, member_filename:str, file_name:str, content_byte include_links=True, include_formatting=True, favor_precision=favor_precision_value, - favor_recall=favor_recall_value + favor_recall=favor_recall_value, ) - if content_string is None: raise RuntimeError("Failed in converting.") @@ -84,7 +94,7 @@ def _convert_html2parquet(self, member_filename:str, file_name:str, content_byte "contents": content_string, "document_id": TransformUtils.str_to_hash(content_string), "size": len(content_string), - "date_acquired": datetime.now().isoformat() + "date_acquired": datetime.now().isoformat(), } return row_data @@ -106,7 +116,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl number_of_rows = 0 # Process ZIP archive of HTML documents - if(TransformUtils.get_file_extension(file_name)[1] == ".zip"): + if TransformUtils.get_file_extension(file_name)[1] == ".zip": with zipfile.ZipFile(io.BytesIO(bytes(byte_array))) as opened_zip: # Loop through each file member in the ZIP archive for member in opened_zip.infolist(): @@ -116,29 +126,31 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl # Read the content of the file content_bytes = file.read() - 
row_data = self._convert_html2parquet(member_filename=member.filename ,file_name=file_name, content_bytes=content_bytes) + row_data = self._convert_html2parquet( + member_filename=member.filename, file_name=file_name, content_bytes=content_bytes + ) data.append(row_data) number_of_rows += 1 except Exception as e: logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping") - - + # Process single HTML documents - elif(TransformUtils.get_file_extension(file_name)[1] == ".html"): + elif TransformUtils.get_file_extension(file_name)[1] == ".html": try: buf = io.BytesIO(bytes(byte_array)) # Read the content of the HTML file content_bytes = buf.read() - row_data = self._convert_html2parquet(member_filename=None ,file_name=file_name, content_bytes=content_bytes) + row_data = self._convert_html2parquet( + member_filename=None, file_name=file_name, content_bytes=content_bytes + ) data.append(row_data) number_of_rows += 1 except Exception as e: logger.warning(f"Exception {str(e)} processing file {file_name}, skipping") - table = pa.Table.from_pylist(data) return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"nrows": number_of_rows} @@ -159,6 +171,8 @@ class html2parquet_output_format(str, enum.Enum): def __str__(self): return str(self.value) + + class html2parquet_favor_precision(str, enum.Enum): TRUE = True FALSE = False @@ -166,6 +180,7 @@ class html2parquet_favor_precision(str, enum.Enum): def __str__(self): return str(self.value) + class html2parquet_favor_recall(str, enum.Enum): TRUE = True FALSE = False @@ -173,6 +188,7 @@ class html2parquet_favor_recall(str, enum.Enum): def __str__(self): return str(self.value) + html2parquet_output_format_default = html2parquet_output_format.MARKDOWN html2parquet_favor_precision_default = html2parquet_favor_precision.TRUE html2parquet_favor_recall_default = html2parquet_favor_recall.TRUE @@ -196,15 +212,15 @@ def add_input_params(self, parser: ArgumentParser) -> None: 
type=html2parquet_output_format, choices=list(html2parquet_output_format), help="Output format for the contents column.", - default=html2parquet_output_format.MARKDOWN + default=html2parquet_output_format.MARKDOWN, ) - + parser.add_argument( f"--{html2parquet_favor_precision_cli_param}", type=html2parquet_favor_precision, choices=list(html2parquet_favor_precision), help="Prefers less content but more accurate extraction.", - default=html2parquet_favor_precision.TRUE + default=html2parquet_favor_precision.TRUE, ) parser.add_argument( @@ -212,13 +228,11 @@ def add_input_params(self, parser: ArgumentParser) -> None: type=html2parquet_favor_recall, choices=list(html2parquet_favor_recall), help="Extracts more content when uncertain.", - default=html2parquet_favor_recall.TRUE + default=html2parquet_favor_recall.TRUE, ) - - def apply_input_params(self, args: Namespace) -> bool: captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) self.params = self.params | captured logger.info(f"html2parquet parameters are : {self.params}") - return True \ No newline at end of file + return True diff --git a/transforms/language/html2parquet/python/src/html2parquet_transform_python.py b/transforms/language/html2parquet/python/src/html2parquet_transform_python.py index 826b9b5a0..6adb92855 100644 --- a/transforms/language/html2parquet/python/src/html2parquet_transform_python.py +++ b/transforms/language/html2parquet/python/src/html2parquet_transform_python.py @@ -1,3 +1,15 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import time from data_processing.runtime.pure_python import PythonTransformLauncher @@ -15,6 +27,7 @@ class Html2ParquetPythonTransformConfiguration(PythonTransformRuntimeConfigurati """ Implements the PythonTransformConfiguration for HTML2PARQUET as required by the PythonTransformLauncher. """ + def __init__(self): """ Initialization @@ -22,7 +35,8 @@ def __init__(self): """ super().__init__(transform_config=Html2ParquetTransformConfiguration()) + if __name__ == "__main__": launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration()) logger.info("Launching html2parquet transform") - launcher.launch() \ No newline at end of file + launcher.launch() diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index 621a4e179..d457470ba 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -1,4 +1,3 @@ -# (C) Copyright IBM Corp. 2024. # Licensed under the Apache License, Version 2.0 (the “License”); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -99,7 +98,14 @@ def pii_redactor( ray_name: str = "pii-redactor-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/pii_redactor/input/', 'output_folder': 'test/pii_redactor/output/'}", @@ -107,9 +113,9 @@ def pii_redactor( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # pii_redactor parameters pii_redactor_contents: str = "title", # additional parameters @@ -152,7 +158,9 @@ def pii_redactor( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/language/pii_redactor/python/src/flair_recognizer.py b/transforms/language/pii_redactor/python/src/flair_recognizer.py index 9355d2c48..09c67d4cf 100644 --- 
a/transforms/language/pii_redactor/python/src/flair_recognizer.py +++ b/transforms/language/pii_redactor/python/src/flair_recognizer.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + """ Referred from https://github.com/sahajsoft/pii-detection-and-anonymizer/blob/main/src/recognizer/flair_recognizer.py diff --git a/transforms/language/pii_redactor/python/src/pii_analyzer.py b/transforms/language/pii_redactor/python/src/pii_analyzer.py index 2d48f0f33..894c7ec35 100644 --- a/transforms/language/pii_redactor/python/src/pii_analyzer.py +++ b/transforms/language/pii_redactor/python/src/pii_analyzer.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import logging import spacy @@ -65,7 +76,8 @@ def analyze_text(self, text, language="en"): List[entity_types]: Types of PII entities identified in the given input text """ analyzer = AnalyzerEngine(nlp_engine=self.nlp_engine, registry=self.registry) - analyze_results = analyzer.analyze(text=text, language=language, entities=self.supported_entities, - score_threshold=self.score_threshold) + analyze_results = analyzer.analyze( + text=text, language=language, entities=self.supported_entities, score_threshold=self.score_threshold + ) entity_types = [result.entity_type for result in analyze_results] return analyze_results, entity_types diff --git a/transforms/language/pii_redactor/python/src/pii_anonymizer.py b/transforms/language/pii_redactor/python/src/pii_anonymizer.py index a2073a442..c29e1fb54 100644 --- a/transforms/language/pii_redactor/python/src/pii_anonymizer.py +++ b/transforms/language/pii_redactor/python/src/pii_anonymizer.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + from presidio_anonymizer import AnonymizerEngine from presidio_anonymizer.entities import OperatorConfig diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_local.py b/transforms/language/pii_redactor/python/src/pii_redactor_local.py index a4bd46498..baa6d3894 100644 --- a/transforms/language/pii_redactor/python/src/pii_redactor_local.py +++ b/transforms/language/pii_redactor/python/src/pii_redactor_local.py @@ -1,4 +1,3 @@ -# (C) Copyright IBM Corp. 2024. # Licensed under the Apache License, Version 2.0 (the “License”); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -34,4 +33,3 @@ # Transform the table table_list, metadata = transform.transform(table) - diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_transform.py b/transforms/language/pii_redactor/python/src/pii_redactor_transform.py index fdc3ce8c2..6c1d1c17f 100644 --- a/transforms/language/pii_redactor/python/src/pii_redactor_transform.py +++ b/transforms/language/pii_redactor/python/src/pii_redactor_transform.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + """ Author: Sowmya.L.R, email:lrsowmya@gmail.com """ @@ -83,7 +94,6 @@ def transform(self, table: pa.Table, file_name: Optional[str] = None) -> tuple[l TransformUtils.validate_columns(table=table, required=[pii_contents_column]) metadata = {"original_table_rows": table.num_rows, "original_column_count": len(table.column_names)} - redacted_texts, entity_types_list = zip(*table[pii_contents_column].to_pandas().apply(self._redact_pii)) table = table.add_column(0, self.doc_contents_key, [redacted_texts]) table = table.add_column(0, "detected_pii", [entity_types_list]) diff --git a/transforms/language/pii_redactor/python/src/pii_redactor_transform_python.py b/transforms/language/pii_redactor/python/src/pii_redactor_transform_python.py index d0b58843c..c42f887f8 100644 --- a/transforms/language/pii_redactor/python/src/pii_redactor_transform_python.py +++ b/transforms/language/pii_redactor/python/src/pii_redactor_transform_python.py @@ -1,4 +1,3 @@ -# (C) Copyright IBM Corp. 2024. # Licensed under the Apache License, Version 2.0 (the “License”); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/transforms/language/pii_redactor/python/test/test_data.py b/transforms/language/pii_redactor/python/test/test_data.py index 9b310ac3e..dcee489a9 100644 --- a/transforms/language/pii_redactor/python/test/test_data.py +++ b/transforms/language/pii_redactor/python/test/test_data.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + import pyarrow as pa diff --git a/transforms/language/pii_redactor/python/test/test_pii_analyzer.py b/transforms/language/pii_redactor/python/test/test_pii_analyzer.py index e2d738519..75d237218 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_analyzer.py +++ b/transforms/language/pii_redactor/python/test/test_pii_analyzer.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import pytest from pii_analyzer import PIIAnalyzerEngine @@ -14,7 +25,8 @@ def analyzer(): def test_analyse_text_for_pii_data(analyzer): input_text = ( - "This is a sample test which has my name Sowmya and my email as sowmya@techiediver.com and " "self.config" + "This is a sample test which has my name Sowmya and my email as sowmya@techiediver.com and " + "self.config" "Born on 31.05.2021" ) result, entity_types = analyzer.analyze_text(input_text, language="en") diff --git a/transforms/language/pii_redactor/python/test/test_pii_anonymizer.py b/transforms/language/pii_redactor/python/test/test_pii_anonymizer.py index d98e6809b..5bcfd4aa3 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_anonymizer.py +++ b/transforms/language/pii_redactor/python/test/test_pii_anonymizer.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + import pytest from pii_analyzer import PIIAnalyzerEngine from pii_anonymizer import PIIAnonymizer @@ -24,8 +35,8 @@ def test_pii_anonymised_text_for_pii_data(analyzer, anonymizer): results, _ = analyzer.analyze_text(sample_input) anonymized_results = anonymizer.anonymize_text(sample_input, results) assert anonymized_results.text != sample_input - assert 'Sowmya' not in anonymized_results.text - assert 'sowmya@gmail.com' not in anonymized_results.text + assert "Sowmya" not in anonymized_results.text + assert "sowmya@gmail.com" not in anonymized_results.text def test_input_not_modified_for_non_pii_text(analyzer, anonymizer): @@ -33,4 +44,3 @@ def test_input_not_modified_for_non_pii_text(analyzer, anonymizer): results, _ = analyzer.analyze_text(sample_input) anonymized_results = anonymizer.anonymize_text(sample_input, results) assert anonymized_results.text == sample_input - diff --git a/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py b/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py index 8b5d9d334..71d3bfb25 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py +++ b/transforms/language/pii_redactor/python/test/test_pii_redactor_redact_anonamize.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + from data_processing.test_support.transform.table_transform_test import ( AbstractTableTransformTest, ) @@ -18,7 +29,9 @@ class TestPIIRedactTransform(AbstractTableTransformTest): def get_test_transform_fixtures(self) -> list[tuple]: fixtures = [ ( - PIIRedactorTransform({doc_transformed_contents_key: doc_transformed_contents_key, redaction_operator_key: "redact"}), + PIIRedactorTransform( + {doc_transformed_contents_key: doc_transformed_contents_key, redaction_operator_key: "redact"} + ), [table], [redacted_expected_table], expected_metadata_list, diff --git a/transforms/language/pii_redactor/python/test/test_pii_redactor_transform.py b/transforms/language/pii_redactor/python/test/test_pii_redactor_transform.py index 2bc80b85e..867d0e7d1 100644 --- a/transforms/language/pii_redactor/python/test/test_pii_redactor_transform.py +++ b/transforms/language/pii_redactor/python/test/test_pii_redactor_transform.py @@ -1,3 +1,14 @@ +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + from data_processing.test_support.transform.table_transform_test import ( AbstractTableTransformTest, )