Skip to content

Commit

Permalink
Merge pull request #428 from sapthasurendran/c2p
Browse files Browse the repository at this point in the history
Add repo_name to code2parquet transform and ingest2parquet tool
  • Loading branch information
Param-S authored Jul 25, 2024
2 parents 889e509 + 2e9efe5 commit 038b03a
Show file tree
Hide file tree
Showing 17 changed files with 115 additions and 88 deletions.
5 changes: 5 additions & 0 deletions tools/ingest2parquet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ Each file contained within the ZIP is transformed into a distinct row within the
- **Description:** Name of the ZIP file containing the current file.
- **Example:** `"document": "example.zip"`

**repo_name:**

- **Description:** Name of the repository the code belongs to. The repo_name is the same as the ZIP file name, without its extension.
- **Example:** `"repo_name": "data"`

**contents:** (string)

- **Description:** Content of the file, converted to a string.
Expand Down
1 change: 1 addition & 0 deletions tools/ingest2parquet/src/ingest2parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def zip_to_table(data_access: DataAccess, file_path, detect_prog_lang: Any) -> p
"hash": TransformUtils.str_to_hash(content_string),
"size": TransformUtils.deep_get_size(content_string),
"date_acquired": datetime.now().isoformat(),
"repo_name" :os.path.splitext(zip_name)[0]
}
if detect_prog_lang:
lang = detect_prog_lang.get_lang_from_ext(ext)
Expand Down
5 changes: 5 additions & 0 deletions transforms/code/code2parquet/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ Each file contained within the ZIP is transformed into a distinct row within the
- **Description:** Name of the ZIP file containing the current file.
- **Example:** `"document": "example.zip"`

**repo_name:**

- **Description:** The name of the repository to which the code belongs. This should match the name of the zip file containing the repository.
- **Example:** `"repo_name": "example"`

**contents:** (string)

- **Description:** Content of the file, converted to a string.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
supported_languages_file = os.path.abspath(
os.path.join(os.path.dirname(__file__), "../../ray/test-data/languages/lang_extensions.json")
)
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../ray/test-data/input"))
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))

params = {
supported_langs_file_key: supported_languages_file,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@

# create parameters
supported_languages_file = os.path.abspath(
os.path.join(os.path.dirname(__file__), "../../ray/test-data/languages/lang_extensions.json")
os.path.join(os.path.dirname(__file__), "../test-data/languages/lang_extensions.json")
)
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../ray/test-data/input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../ray/output"))
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from argparse import ArgumentParser, Namespace
from datetime import datetime
from typing import Any
import os

import pyarrow as pa
from data_processing.data_access import DataAccess, DataAccessFactory
Expand Down Expand Up @@ -135,6 +136,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
"hash": TransformUtils.str_to_hash(content_string),
"size": len(content_string),
"date_acquired": datetime.now().isoformat(),
"repo_name":os.path.splitext(os.path.basename(file_name))[0]
} | self.shared_columns
if self.detect_programming_lang:
lang = self._get_lang_from_ext(ext)
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,40 +1,46 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "code2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-06-26 15:55:33",
"end_time": "2024-06-26 15:55:33",
"status": "success"
},
"code": null,
"job_input_params": {
"supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json",
"detect_programming_lang": true,
"snapshot": "github",
"domain": "code",
"s3_cred": null,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [".zip"]
},
"job_output_stats": {
"source_files": 3,
"source_size": 33885652,
"result_files": 3,
"result_size": 88484,
"processing_time": 0.36685895919799805,
"number of rows": 74
},
"source": {
"name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input",
"type": "path"
},
"target": {
"name": "/tmp/code2parquetdbmtc8ox",
"type": "path"
}
}
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "code2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-07-23 20:59:16",
"end_time": "2024-07-23 20:59:16",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"supported_langs_file": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json",
"detect_programming_lang": true,
"snapshot": null,
"domain": null,
"s3_cred": null,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".zip"
]
},
"job_output_stats": {
"source_files": 3,
"source_size": 33885652,
"result_files": 3,
"result_size": 85597,
"processing_time": 0.5968606472015381,
"number of rows": 74
},
"source": {
"name": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/python/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/python/output",
"type": "path"
}
}
6 changes: 4 additions & 2 deletions transforms/code/code2parquet/python/test/test_code2parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,17 @@ def get_test_transform_fixtures(self) -> list[tuple]:
config = {
supported_langs_file_key: lang_supported_file,
detect_programming_lang_key: True,
snapshot_key: "github",
domain_key: "code",
# snapshot_key: "github",
# domain_key: "code",
data_factory_key: DataAccessFactory(),
}

expected_files = get_files_in_folder(os.path.join(basedir, "expected"), ".parquet")
expected_files = [
(binary, TransformUtils.get_file_extension(name)[1]) for name, binary in expected_files.items()
]


return [(CodeToParquetTransform(config), input_files, expected_files, expected_metadata_list)]


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
"data_files_to_use": ast.literal_eval("['.zip']"),
supported_langs_file_cli_key: lang_supported_file,
detect_programming_lang_cli_key: True,
snapshot_cli_key: "github",
domain_cli_key: "code",
# snapshot_cli_key: "github",
# domain_cli_key: "code",
}
fixtures = [
(
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
84 changes: 45 additions & 39 deletions transforms/code/code2parquet/ray/test-data/expected/metadata.json
Original file line number Diff line number Diff line change
@@ -1,40 +1,46 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "code2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-06-26 15:55:33",
"end_time": "2024-06-26 15:55:33",
"status": "success"
},
"code": null,
"job_input_params": {
"supported_langs_file": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/languages/lang_extensions.json",
"detect_programming_lang": true,
"snapshot": "github",
"domain": "code",
"s3_cred": null,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [".zip"]
},
"job_output_stats": {
"source_files": 3,
"source_size": 33885652,
"result_files": 3,
"result_size": 88484,
"processing_time": 0.36685895919799805,
"number of rows": 74
},
"source": {
"name": "/Users/dawood/git/data-prep-kit/transforms/code/code2parquet/python/test-data/input",
"type": "path"
},
"target": {
"name": "/tmp/code2parquetdbmtc8ox",
"type": "path"
}
}
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "code2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-07-23 20:59:16",
"end_time": "2024-07-23 20:59:16",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"supported_langs_file": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/ray/test-data/languages/lang_extensions.json",
"detect_programming_lang": true,
"snapshot": null,
"domain": null,
"s3_cred": null,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".zip"
]
},
"job_output_stats": {
"source_files": 3,
"source_size": 33885652,
"result_files": 3,
"result_size": 85597,
"processing_time": 0.5968606472015381,
"number of rows": 74
},
"source": {
"name": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/python/test-data/input",
"type": "path"
},
"target": {
"name": "/Users/sapthasurendran/Documents/GUF/data-prep-lab/data-prep-lab/transforms/code/code2parquet/python/output",
"type": "path"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
"data_files_to_use": ast.literal_eval("['.zip']"),
supported_langs_file_cli_key: lang_supported_file,
detect_programming_lang_cli_key: True,
snapshot_cli_key: "github",
domain_cli_key: "code",
# snapshot_cli_key: "github",
# domain_cli_key: "code",
}
fixtures = [
(
Expand Down

0 comments on commit 038b03a

Please sign in to comment.