From aecce23c29e2bd6a4fed404ec5e011afc13df071 Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Tue, 22 Oct 2024 10:56:13 -0600 Subject: [PATCH] Add community health file based metrics (#115) * restructure for community health metrics * linting * add file check function and data to report * reset to main; add test for file metrics * add code of conduct reference file * add test for the almanack --- CODE_OF_CONDUCT.md | 3 + poetry.lock | 2 +- pyproject.toml | 1 + src/almanack/metrics/data.py | 77 +++++++++-- src/almanack/metrics/metrics.yml | 28 ++++ tests/conftest.py | 23 +++- tests/data/almanack/repo_setup/create_repo.py | 47 ++++++- .../{entropy => }/test_calculate_entropy.py | 10 +- tests/metrics/test_community_health.py | 3 + tests/metrics/test_data.py | 123 +++++++++++++++++- tests/test_git.py | 21 +-- 11 files changed, 301 insertions(+), 37 deletions(-) create mode 100644 CODE_OF_CONDUCT.md rename tests/metrics/{entropy => }/test_calculate_entropy.py (83%) create mode 100644 tests/metrics/test_community_health.py diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..51a477c6 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,3 @@ +# Code of Conduct + +Please see our organization-wide [CODE_OF_CONDUCT.md](https://github.com/software-gardening/.github/blob/main/CODE_OF_CONDUCT.md) for more information. 
def file_exists_in_repo(
    repo: "pygit2.Repository",
    expected_file_name: str,
    check_extension: bool = False,
    extensions: "list[str] | None" = None,
) -> bool:
    """
    Check if a file (case-insensitive and with optional extensions)
    exists in the latest commit of the repository.

    Only the entries yielded by iterating the tree of HEAD are compared;
    this function does not descend into those entries.

    Args:
        repo (pygit2.Repository):
            The repository object to search in.
        expected_file_name (str):
            The base file name to check (e.g., "readme").
        check_extension (bool):
            When True, an entry matches only if its whole name equals
            ``expected_file_name`` followed by one of ``extensions``.
            When False, an entry matches if the part of its name before
            the first "." equals ``expected_file_name``.
        extensions (list[str] | None):
            List of possible file extensions to check. Only consulted when
            ``check_extension`` is True. Defaults to [".md", ""] when
            omitted or None.

    Returns:
        bool:
            True if a matching entry exists, False otherwise.
    """

    # Resolve the default inside the call rather than in the signature so
    # that no mutable list object is shared between calls.
    if extensions is None:
        extensions = [".md", ""]

    # Gather a tree from the HEAD of the repo
    tree = repo.revparse_single("HEAD").tree

    # Normalize expected file name to lowercase for case-insensitive comparison
    expected_file_name = expected_file_name.lower()

    if check_extension:
        # Build the accepted names once instead of once per tree entry.
        accepted_names = {
            f"{expected_file_name}{ext.lower()}" for ext in extensions
        }
        return any(entry.name.lower() in accepted_names for entry in tree)

    # Without extension checking, compare only the part of each entry name
    # that precedes the first "." with the expected file name.
    # NOTE(review): entries are compared by name only, with no check that
    # an entry is a file rather than a directory; confirm this is intended.
    return any(
        entry.name.lower().split(".", 1)[0] == expected_file_name for entry in tree
    )
""" try: # Convert repo_path to an absolute path and initialize the repository @@ -122,10 +164,26 @@ def compute_repo_data(repo_path: str) -> None: # Return the data structure return { "repo_path": str(repo_path), - "normalized_total_entropy": normalized_total_entropy, "number_of_commits": len(commits), "number_of_files": len(file_names), "time_range_of_commits": (first_commit_date, most_recent_commit_date), + "readme-included": file_exists_in_repo( + repo=repo, + expected_file_name="readme", + ), + "contributing-included": file_exists_in_repo( + repo=repo, + expected_file_name="contributing", + ), + "code-of-conduct-included": file_exists_in_repo( + repo=repo, + expected_file_name="code_of_conduct", + ), + "license-included": file_exists_in_repo( + repo=repo, + expected_file_name="license", + ), + "normalized_total_entropy": normalized_total_entropy, "file_level_entropy": file_entropy, } @@ -134,9 +192,6 @@ def compute_repo_data(repo_path: str) -> None: return {"repo_path": str(repo_path), "error": str(e)} -from typing import Any, Dict - - def compute_pr_data(repo_path: str, pr_branch: str, main_branch: str) -> Dict[str, Any]: """ Computes entropy data for a PR compared to the main branch. diff --git a/src/almanack/metrics/metrics.yml b/src/almanack/metrics/metrics.yml index f7a24c8e..c5bac5b3 100644 --- a/src/almanack/metrics/metrics.yml +++ b/src/almanack/metrics/metrics.yml @@ -24,6 +24,34 @@ metrics: result-data-key: "time_range_of_commits" description: >- Starting commit and most recent commit for the repository. + - name: "includes-readme" + id: "SGA-GL-0001" + result-type: "bool" + result-data-key: "readme-included" + description: >- + Boolean value indicating the presence of a README file + in the repository. + - name: "includes-contributing" + id: "SGA-GL-0002" + result-type: "bool" + result-data-key: "contributing-included" + description: >- + Boolean value indicating the presence of a CONTRIBUTING file + in the repository. 
+ - name: "includes-code-of-conduct" + id: "SGA-GL-0003" + result-type: "bool" + result-data-key: "code-of-conduct-included" + description: >- + Boolean value indicating the presence of a CODE_OF_CONDUCT file + in the repository. + - name: "includes-license" + id: "SGA-GL-0004" + result-type: "bool" + result-data-key: "license-included" + description: >- + Boolean value indicating the presence of a LICENSE file + in the repository. - name: "agg-info-entropy" id: "SGA-VS-0001" result-type: "float" diff --git a/tests/conftest.py b/tests/conftest.py index 4eca5980..526a48c6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,10 @@ import pytest -from tests.data.almanack.repo_setup.create_repo import create_repositories +from tests.data.almanack.repo_setup.create_repo import ( + create_community_health_repository, + create_entropy_repositories, +) from .utils import check_subproc_run_for_nonzero @@ -67,16 +70,16 @@ def build_jupyter_book( @pytest.fixture(scope="session") -def repository_paths(tmp_path_factory): +def entropy_repository_paths(tmp_path_factory): """ - Fixture to call create_repositories, create the repositories, then delete them + Fixture to call create_entropy_repositories, create the repositories, then delete them using the tmp_path_factory fixture to provide a temporary directory for tests. 
@pytest.fixture(scope="session")
def community_health_repository_path(tmp_path_factory):
    """
    Session-scoped fixture which builds the community health repository
    inside a temporary directory obtained from the tmp_path_factory
    fixture and provides the value returned by
    create_community_health_repository to the requesting tests.
    """
    # Dedicated temporary directory that will hold the repository
    health_base_dir = tmp_path_factory.mktemp("almanack_community_health")
    repository_path = create_community_health_repository(health_base_dir)

    yield repository_path
def create_community_health_repository(base_path: pathlib.Path) -> str:
    """
    Create a Git repository containing stub community health files.

    A repository directory named "community_health" is initialized under
    ``base_path``. Stub README.md, CONTRIBUTING.md, CODE_OF_CONDUCT.md and
    LICENSE.txt files are written into it and recorded in a single
    parentless commit on refs/heads/main, after which HEAD is pointed at
    that branch.

    Args:
        base_path (pathlib.Path):
            Directory under which the repository directory is created.

    Returns:
        str:
            The path of the created repository.
    """

    filenames_and_contents = {
        "README.md": "# This is an example readme\n\nWelcome to our repo!",
        "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
        "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
        "LICENSE.txt": "This is an example LICENSE file.",
    }

    repo_path = base_path / "community_health"
    repo_path.mkdir(parents=True, exist_ok=True)
    repo = pygit2.init_repository(path=str(repo_path), bare=False)

    # Set user.name and user.email in the config
    set_repo_user_config(repo)

    for filename, content in filenames_and_contents.items():
        # Write with an explicit encoding so the bytes on disk do not depend
        # on the platform's default text encoding.
        (repo_path / filename).resolve().write_text(content, encoding="utf-8")

    # add all files to the index
    repo.index.add_all()
    # write the files to the index
    repo.index.write()

    # create a tree for the index
    tree = repo.index.write_tree()
    # gather a default signature author
    author = repo.default_signature
    repo.create_commit(
        "refs/heads/main",
        author,
        author,
        "Committing community health files",
        tree,
        [],  # no parents: this is the first commit of the repository
    )

    # set the head to the main branch
    repo.set_head("refs/heads/main")

    return str(repo_path)
list[str]] + entropy_repository_paths: dict[str, pathlib.Path], + repo_file_sets: dict[str, list[str]], ) -> None: """ Test the calculate_normalized_entropy function. """ - for label, repo_path in repository_paths.items(): + for label, repo_path in entropy_repository_paths.items(): # Extract two most recent commits: source and target source_commit, target_commit = get_most_recent_commits(repo_path) @@ -35,14 +36,15 @@ def test_calculate_normalized_entropy( def test_calculate_aggregate_entropy( - repository_paths: dict[str, pathlib.Path], repo_file_sets: dict[str, list[str]] + entropy_repository_paths: dict[str, pathlib.Path], + repo_file_sets: dict[str, list[str]], ) -> None: """ Test that calculate_aggregate_entropy function """ repo_entropies = {} - for label, repo_path in repository_paths.items(): + for label, repo_path in entropy_repository_paths.items(): # Extract two most recent commits: source and target source_commit, target_commit = get_most_recent_commits(repo_path) # Call calculate_normalized_entropy function diff --git a/tests/metrics/test_community_health.py b/tests/metrics/test_community_health.py new file mode 100644 index 00000000..928b94df --- /dev/null +++ b/tests/metrics/test_community_health.py @@ -0,0 +1,3 @@ +""" +Tests various community health metric functionality. 
+""" diff --git a/tests/metrics/test_data.py b/tests/metrics/test_data.py index 96a49461..5fd3ed80 100644 --- a/tests/metrics/test_data.py +++ b/tests/metrics/test_data.py @@ -3,17 +3,27 @@ """ import pathlib +from typing import List +import jsonschema import pandas as pd +import pygit2 +import pytest +import yaml -from almanack.metrics.data import compute_repo_data, get_table +from almanack.metrics.data import ( + METRICS_TABLE, + compute_repo_data, + file_exists_in_repo, + get_table, +) -def test_generate_repo_data(repository_paths: dict[str, pathlib.Path]) -> None: +def test_generate_repo_data(entropy_repository_paths: dict[str, pathlib.Path]) -> None: """ Testing generate_whole_repo_data produces the expected output for given repositories. """ - for _, repo_path in repository_paths.items(): + for _, repo_path in entropy_repository_paths.items(): # Call the function data = compute_repo_data(str(repo_path)) @@ -24,10 +34,14 @@ def test_generate_repo_data(repository_paths: dict[str, pathlib.Path]) -> None: # Check for expected keys expected_keys = [ "repo_path", - "normalized_total_entropy", "number_of_commits", "number_of_files", "time_range_of_commits", + "readme-included", + "contributing-included", + "code-of-conduct-included", + "license-included", + "normalized_total_entropy", "file_level_entropy", ] assert all(key in data for key in expected_keys) @@ -36,12 +50,12 @@ def test_generate_repo_data(repository_paths: dict[str, pathlib.Path]) -> None: assert data["repo_path"] == str(repo_path) -def test_get_table(repository_paths: dict[str, pathlib.Path]) -> None: +def test_get_table(entropy_repository_paths: dict[str, pathlib.Path]) -> None: """ Tests the almanack.metrics.data.get_table function """ - for name, repo_path in repository_paths.items(): + for _, repo_path in entropy_repository_paths.items(): # create a table from the repo table = get_table(str(repo_path)) @@ -57,3 +71,100 @@ def test_get_table(repository_paths: dict[str, pathlib.Path]) -> None: 
def test_metrics_yaml():
    """
    Test the metrics yaml for expected results: every metric entry carries
    the required string fields and metric IDs are unique.
    """

    # define an expected jsonschema for metrics.yml
    schema = {
        "type": "object",
        "properties": {
            "metrics": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "id": {"type": "string"},
                        "result-type": {"type": "string"},
                        "result-data-key": {"type": "string"},
                        "description": {"type": "string"},
                    },
                    "required": [
                        "name",
                        "id",
                        "result-type",
                        "result-data-key",
                        "description",
                    ],
                },
            }
        },
        "required": ["metrics"],
    }

    # open the metrics table (explicit encoding so parsing does not depend
    # on the platform's default text encoding)
    with open(METRICS_TABLE, "r", encoding="utf-8") as f:
        metrics_table = yaml.safe_load(f)

    # Validate the structure against the schema
    # (we expect None if all is validated)
    assert jsonschema.validate(instance=metrics_table, schema=schema) is None

    # Check for unique IDs
    ids = [metric["id"] for metric in metrics_table["metrics"]]
    assert len(ids) == len(set(ids))


@pytest.mark.parametrize(
    "expected_file_name, check_extension, extensions, expected_result",
    [
        ("readme", True, [".md", ""], True),
        ("README", False, [], True),
        ("CONTRIBUTING", True, [".md", ""], True),
        ("contributing", False, [], True),
        ("code_of_conduct", True, [".md", ""], True),
        ("CODE_OF_CONDUCT", False, [], True),
        ("LICENSE", True, [".md", ".txt", ""], True),
        ("license", False, [], True),
    ],
)
def test_file_exists_in_repo(
    community_health_repository_path: str,
    expected_file_name: str,
    check_extension: bool,
    extensions: List[str],
    expected_result: bool,
):
    """
    Combined test for file_exists_in_repo function using different scenarios,
    run against both a synthetic repository and the almanack repository.
    """

    repo_paths = [
        # a synthetic repo
        pathlib.Path(community_health_repository_path).resolve(),
        # the almanack itself, located relative to this test module
        # (tests/metrics/test_data.py) instead of the current working
        # directory so the test does not depend on where pytest is invoked
        pathlib.Path(__file__).resolve().parents[2],
    ]

    for repo_path in repo_paths:
        repo = pygit2.Repository(str(repo_path))

        result = file_exists_in_repo(
            repo=repo,
            expected_file_name=expected_file_name,
            check_extension=check_extension,
            extensions=extensions,
        )

        assert result == expected_result
def test_get_loc_changed( - repository_paths: dict[str, pathlib.Path], repo_file_sets: dict[str, list[str]] + entropy_repository_paths: dict[str, pathlib.Path], + repo_file_sets: dict[str, list[str]], ) -> None: """ Test the calculate_loc_changes function. @@ -66,7 +67,7 @@ def test_get_loc_changed( results = {} - for label, repo_path in repository_paths.items(): + for label, repo_path in entropy_repository_paths.items(): # Extract two most recent commits: source and target source_commit, target_commit = get_most_recent_commits(repo_path) # Call loc_changes function on test repositories @@ -82,8 +83,8 @@ def test_get_loc_changed( ) # Check that all values are non-negative -def test_get_most_recent_commits(repository_paths: dict[str, Any]): - repo_path = repository_paths["3_file_repo"] +def test_get_most_recent_commits(entropy_repository_paths: dict[str, Any]): + repo_path = entropy_repository_paths["3_file_repo"] # Call the function to get the two most recent commits source_commit_hash, target_commit_hash = get_most_recent_commits(repo_path)