diff --git a/tools/ingest2parquet/README.md b/tools/ingest2parquet/README.md index c7485994e..dce3d042f 100644 --- a/tools/ingest2parquet/README.md +++ b/tools/ingest2parquet/README.md @@ -19,6 +19,11 @@ Each file contained within the ZIP is transformed into a distinct row within the - **Description:** Name of the ZIP file containing the current file. - **Example:** `"document": "example.zip"` +**repo_name:** + +- **Description:** Name of the repository the code belongs to. It is the name of the ZIP file without its file extension. +- **Example:** `"repo_name": "data"` + **contents:** (string) - **Description:** Content of the file, converted to a string. diff --git a/tools/ingest2parquet/src/ingest2parquet.py b/tools/ingest2parquet/src/ingest2parquet.py index 9aec72994..49777c3b7 100644 --- a/tools/ingest2parquet/src/ingest2parquet.py +++ b/tools/ingest2parquet/src/ingest2parquet.py @@ -59,6 +59,7 @@ def zip_to_table(data_access: DataAccess, file_path, detect_prog_lang: Any) -> p "hash": TransformUtils.str_to_hash(content_string), "size": TransformUtils.deep_get_size(content_string), "date_acquired": datetime.now().isoformat(), + "repo_name" :os.path.splitext(zip_name)[0] } if detect_prog_lang: lang = detect_prog_lang.get_lang_from_ext(ext) diff --git a/transforms/code/code2parquet/python/README.md b/transforms/code/code2parquet/python/README.md index 4b53b2bd3..b93ff3717 100644 --- a/transforms/code/code2parquet/python/README.md +++ b/transforms/code/code2parquet/python/README.md @@ -17,6 +17,11 @@ Each file contained within the ZIP is transformed into a distinct row within the - **Description:** Name of the ZIP file containing the current file. - **Example:** `"document": "example.zip"` +**repo_name:** + +- **Description:** The name of the repository to which the code belongs. It is the name of the ZIP file containing the repository, without its file extension. +- **Example:** `"repo_name": "example"` + **contents:** (string) - **Description:** Content of the file, converted to a string. 
diff --git a/transforms/code/code2parquet/python/src/code2parquet_local.py b/transforms/code/code2parquet/python/src/code2parquet_local.py index 425e8daa2..8ebd4370b 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_local.py +++ b/transforms/code/code2parquet/python/src/code2parquet_local.py @@ -25,7 +25,7 @@ supported_languages_file = os.path.abspath( os.path.join(os.path.dirname(__file__), "../../ray/test-data/languages/lang_extensions.json") ) -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../ray/test-data/input")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) params = { supported_langs_file_key: supported_languages_file, diff --git a/transforms/code/code2parquet/python/src/code2parquet_local_python.py b/transforms/code/code2parquet/python/src/code2parquet_local_python.py index 36b72ab07..66713a02f 100644 --- a/transforms/code/code2parquet/python/src/code2parquet_local_python.py +++ b/transforms/code/code2parquet/python/src/code2parquet_local_python.py @@ -25,10 +25,10 @@ # create parameters supported_languages_file = os.path.abspath( - os.path.join(os.path.dirname(__file__), "../../ray/test-data/languages/lang_extensions.json") + os.path.join(os.path.dirname(__file__), "../test-data/languages/lang_extensions.json") ) -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../ray/test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../ray/output")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/code/code2parquet/python/src/code2parquet_transform.py b/transforms/code/code2parquet/python/src/code2parquet_transform.py index a5f6ad2a3..b6dbce63c 100644 --- 
a/transforms/code/code2parquet/python/src/code2parquet_transform.py +++ b/transforms/code/code2parquet/python/src/code2parquet_transform.py @@ -18,6 +18,7 @@ from argparse import ArgumentParser, Namespace from datetime import datetime from typing import Any +import os import pyarrow as pa from data_processing.data_access import DataAccess, DataAccessFactory @@ -135,6 +136,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl "hash": TransformUtils.str_to_hash(content_string), "size": len(content_string), "date_acquired": datetime.now().isoformat(), + "repo_name":os.path.splitext(os.path.basename(file_name))[0] } | self.shared_columns if self.detect_programming_lang: lang = self._get_lang_from_ext(ext) diff --git a/transforms/code/code2parquet/python/test-data/expected/application-java.parquet b/transforms/code/code2parquet/python/test-data/expected/application-java.parquet index f8a61a918..db0818676 100644 Binary files a/transforms/code/code2parquet/python/test-data/expected/application-java.parquet and b/transforms/code/code2parquet/python/test-data/expected/application-java.parquet differ diff --git a/transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet b/transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet index 855487296..fc5499de2 100644 Binary files a/transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet and b/transforms/code/code2parquet/python/test-data/expected/data-processing-lib.parquet differ diff --git a/transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet b/transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet index 344fdd809..e2abe2b3f 100644 Binary files a/transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet 
and b/transforms/code/code2parquet/python/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet differ diff --git a/transforms/code/code2parquet/python/test-data/expected/metadata.json b/transforms/code/code2parquet/python/test-data/expected/metadata.json index c187a83d1..d85d0787f 100644 --- a/transforms/code/code2parquet/python/test-data/expected/metadata.json +++ b/transforms/code/code2parquet/python/test-data/expected/metadata.json @@ -1,40 +1,46 @@ { - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "code2parquet", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-06-26 15:55:33", - "end_time": "2024-06-26 15:55:33", - "status": "success" - }, - "code": null, - "job_input_params": { - "supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", - "detect_programming_lang": true, - "snapshot": "github", - "domain": "code", - "s3_cred": null, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".zip"] - }, - "job_output_stats": { - "source_files": 3, - "source_size": 33885652, - "result_files": 3, - "result_size": 88484, - "processing_time": 0.36685895919799805, - "number of rows": 74 - }, - "source": { - "name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/tmp/code2parquetdbmtc8ox", - "type": "path" - } -} + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "code2parquet", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-07-23 20:59:16", + "end_time": "2024-07-23 20:59:16", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "supported_langs_file": 
"/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json", + "detect_programming_lang": true, + "snapshot": null, + "domain": null, + "s3_cred": null, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + ".zip" + ] + }, + "job_output_stats": { + "source_files": 3, + "source_size": 33885652, + "result_files": 3, + "result_size": 85597, + "processing_time": 0.5968606472015381, + "number of rows": 74 + }, + "source": { + "name": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/code/code2parquet/python/test/test_code2parquet.py b/transforms/code/code2parquet/python/test/test_code2parquet.py index c5be7277c..57e28d656 100644 --- a/transforms/code/code2parquet/python/test/test_code2parquet.py +++ b/transforms/code/code2parquet/python/test/test_code2parquet.py @@ -42,8 +42,8 @@ def get_test_transform_fixtures(self) -> list[tuple]: config = { supported_langs_file_key: lang_supported_file, detect_programming_lang_key: True, - snapshot_key: "github", - domain_key: "code", + # snapshot_key: "github", + # domain_key: "code", data_factory_key: DataAccessFactory(), } @@ -51,6 +51,8 @@ def get_test_transform_fixtures(self) -> list[tuple]: expected_files = [ (binary, TransformUtils.get_file_extension(name)[1]) for name, binary in expected_files.items() ] + + return [(CodeToParquetTransform(config), input_files, expected_files, expected_metadata_list)] diff --git a/transforms/code/code2parquet/python/test/test_code2parquet_python.py b/transforms/code/code2parquet/python/test/test_code2parquet_python.py index a4ea630c2..55547da2b 100644 --- 
a/transforms/code/code2parquet/python/test/test_code2parquet_python.py +++ b/transforms/code/code2parquet/python/test/test_code2parquet_python.py @@ -44,8 +44,8 @@ def get_test_transform_fixtures(self) -> list[tuple]: "data_files_to_use": ast.literal_eval("['.zip']"), supported_langs_file_cli_key: lang_supported_file, detect_programming_lang_cli_key: True, - snapshot_cli_key: "github", - domain_cli_key: "code", + # snapshot_cli_key: "github", + # domain_cli_key: "code", } fixtures = [ ( diff --git a/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet b/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet index f8a61a918..db0818676 100644 Binary files a/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet and b/transforms/code/code2parquet/ray/test-data/expected/application-java.parquet differ diff --git a/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet b/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet index 855487296..fc5499de2 100644 Binary files a/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet and b/transforms/code/code2parquet/ray/test-data/expected/data-processing-lib.parquet differ diff --git a/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet b/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet index 344fdd809..e2abe2b3f 100644 Binary files a/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet and b/transforms/code/code2parquet/ray/test-data/expected/https___github.com_00000o1_environments_archive_refs_heads_master.parquet differ diff --git a/transforms/code/code2parquet/ray/test-data/expected/metadata.json 
b/transforms/code/code2parquet/ray/test-data/expected/metadata.json index c187a83d1..d85d0787f 100644 --- a/transforms/code/code2parquet/ray/test-data/expected/metadata.json +++ b/transforms/code/code2parquet/ray/test-data/expected/metadata.json @@ -1,40 +1,46 @@ { - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "code2parquet", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-06-26 15:55:33", - "end_time": "2024-06-26 15:55:33", - "status": "success" - }, - "code": null, - "job_input_params": { - "supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json", - "detect_programming_lang": true, - "snapshot": "github", - "domain": "code", - "s3_cred": null, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [".zip"] - }, - "job_output_stats": { - "source_files": 3, - "source_size": 33885652, - "result_files": 3, - "result_size": 88484, - "processing_time": 0.36685895919799805, - "number of rows": 74 - }, - "source": { - "name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/tmp/code2parquetdbmtc8ox", - "type": "path" - } -} + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "code2parquet", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-07-23 20:59:16", + "end_time": "2024-07-23 20:59:16", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "supported_langs_file": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json", + "detect_programming_lang": true, + "snapshot": null, + "domain": null, + "s3_cred": null, + "checkpointing": false, + "max_files": -1, + 
"random_samples": -1, + "files_to_use": [ + ".zip" + ] + }, + "job_output_stats": { + "source_files": 3, + "source_size": 33885652, + "result_files": 3, + "result_size": 85597, + "processing_time": 0.5968606472015381, + "number of rows": 74 + }, + "source": { + "name": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py b/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py index dc87f1131..b8118554c 100644 --- a/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py +++ b/transforms/code/code2parquet/ray/test/test_code2parquet_ray.py @@ -47,8 +47,8 @@ def get_test_transform_fixtures(self) -> list[tuple]: "data_files_to_use": ast.literal_eval("['.zip']"), supported_langs_file_cli_key: lang_supported_file, detect_programming_lang_cli_key: True, - snapshot_cli_key: "github", - domain_cli_key: "code", + # snapshot_cli_key: "github", + # domain_cli_key: "code", } fixtures = [ (