Add citability measure (#120)

* add functions for reading blobs; measure citation * linting * move blobs to git * add test for the almanack itself * Update src/almanack/metrics/metrics.yml Co-authored-by: Gregory Way <[email protected]> * add additional cases for citability in readmes Co-Authored-By: Gregory Way <[email protected]> --------- Co-authored-by: Gregory Way <[email protected]>
software-gardening · Oct 23, 2024 · 29ca220 · 29ca220
1 parent 29c4530
commit 29ca220
Show file tree

Hide file tree

Showing 9 changed files with 422 additions and 129 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -18,6 +18,7 @@ pyyaml = "^6.0.1"
 pygit2 = "^1.15.1"
 fire = "^0.6.0"
 tabulate = "^0.9.0"
+charset-normalizer = "^3.4.0"
 
 [tool.poetry.group.book.dependencies]
 jupyter-book = "^1.0.0"

diff --git a/src/almanack/git.py b/src/almanack/git.py
@@ -4,9 +4,10 @@
 
 import pathlib
 import tempfile
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import pygit2
+from charset_normalizer import from_bytes
 
 
 def clone_repository(repo_url: str) -> pathlib.Path:
@@ -144,3 +145,71 @@ def get_most_recent_commits(repo_path: pathlib.Path) -> tuple[str, str]:
     target_commit = commits[0]  # Most recent
 
     return str(source_commit.id), str(target_commit.id)
+
+
+"""
+Module for handling various tasks with git repo blobs.
+"""
+
+
+import pygit2
+
+
+def detect_encoding(blob_data: bytes) -> str:
+    """
+    Detect the encoding of the given blob data using charset-normalizer.
+
+    Args:
+        blob_data (bytes): The raw bytes of the blob to analyze.
+
+    Returns:
+        str: The best detected encoding of the blob data.
+
+    Raises:
+        ValueError: If no data is provided or no encoding could be detected.
+    """
+    if not blob_data:
+        raise ValueError("No data provided for encoding detection.")
+
+    result = from_bytes(blob_data)
+    if result.best():
+        # Get the best encoding found
+        return result.best().encoding
+    raise ValueError("Encoding could not be detected.")
+
+
+def find_and_read_file(repo: pygit2.Repository, filename: str) -> Optional[str]:
+    """
+    Find a file in the root of the repository's HEAD tree by case-insensitive name and read its content.
+
+    Args:
+        repo (pygit2.Repository): The repository to search.
+        filename (str): The filename to match (case-insensitively) against entry names.
+
+    Returns:
+        Optional[str]: The content of the found file, or None if no matching files are found.
+    """
+
+    # Get the tree associated with the latest commit
+    tree = repo.head.peel().tree
+
+    # find the first occurrence of a matching file
+    found_file: Optional[pygit2.Blob] = next(
+        (
+            entry
+            for entry in tree
+            if entry.type == pygit2.GIT_OBJECT_BLOB
+            and filename.lower() == entry.name.lower()
+        ),
+        None,
+    )
+
+    # if we have none, return it early to avoid trying to read nothing
+    if found_file is None:
+        return found_file
+
+    # Read the content of the first found blob
+    blob_data: bytes = found_file.data
+
+    # Decode and return content as a string
+    return blob_data.decode(detect_encoding(blob_data))
diff --git a/src/almanack/metrics/data.py b/src/almanack/metrics/data.py
@@ -11,7 +11,7 @@
 import pygit2
 import yaml
 
-from ..git import clone_repository, get_commits, get_edited_files
+from ..git import clone_repository, find_and_read_file, get_commits, get_edited_files
 from .entropy.calculate_entropy import (
     calculate_aggregate_entropy,
     calculate_normalized_entropy,
@@ -115,6 +115,57 @@ def file_exists_in_repo(
     return False
 
 
+def is_citable(repo: pygit2.Repository) -> bool:
+    """
+    Check if the given repository is citable.
+
+    A repository is considered citable if it contains a CITATION.cff or CITATION.bib
+    file, or if the README.md file contains a citation heading (markdown or RST
+    style, e.g. "## Citation" or "## How to cite") or a shields.io DOI badge.
+
+    Args:
+        repo (pygit2.Repository): The repository to check for citation files.
+
+    Returns:
+        bool: True if the repository is citable, False otherwise.
+    """
+
+    # Check for a CITATION.cff or CITATION.bib file
+    if file_exists_in_repo(
+        repo=repo,
+        expected_file_name="citation",
+        check_extension=True,
+        extensions=[".cff", ".bib"],
+    ):
+        return True
+
+    # Look for a README.md file and read its content
+    if (
+        file_content := find_and_read_file(repo=repo, filename="readme.md")
+    ) is not None:
+        # Check for a citation heading (markdown or RST) or a DOI badge
+        if any(
+            check_string in file_content
+            for check_string in [
+                # markdown sub-headers
+                "## Citation",
+                "## Citing",
+                "## Cite",
+                "## How to cite",
+                # RST sub-headers
+                "Citation\n--------",
+                "Citing\n------",
+                "Cite\n----",
+                "How to cite\n-----------",
+                # DOI shield
+                "[![DOI](https://img.shields.io/badge/DOI",
+            ]
+        ):
+            return True
+
+    return False
+
+
 def compute_repo_data(repo_path: str) -> None:
     """
     Computes comprehensive data for a GitHub repository.
@@ -183,6 +234,7 @@ def compute_repo_data(repo_path: str) -> None:
                 repo=repo,
                 expected_file_name="license",
             ),
+            "is-citable": is_citable(repo=repo),
             "normalized_total_entropy": normalized_total_entropy,
             "file_level_entropy": file_entropy,
         }

diff --git a/src/almanack/metrics/metrics.yml b/src/almanack/metrics/metrics.yml
@@ -52,6 +52,13 @@ metrics:
     description: >-
       Boolean value indicating the presence of a LICENSE file
       in the repository.
+  - name: "is-citable"
+    id: "SGA-GL-0005"
+    result-type: "bool"
+    result-data-key: "is-citable"
+    description: >-
+      Boolean value indicating the presence of a CITATION file
+      or some other means of indicating how to cite the work.
   - name: "agg-info-entropy"
     id: "SGA-VS-0001"
     result-type: "float"

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -7,11 +7,13 @@
 import shutil
 import subprocess
 
+import pygit2
 import pytest
 
 from tests.data.almanack.repo_setup.create_repo import (
-    create_community_health_repository,
     create_entropy_repositories,
+    repo_setup,
+    set_repo_user_config,
 )
 
 from .utils import check_subproc_run_for_nonzero
@@ -110,7 +112,50 @@ def community_health_repository_path(tmp_path_factory):
     Fixture to call create_community_health_repository, create the repositories, then delete them
     using the tmp_path_factory fixture to provide a temporary directory for tests.
     """
-    # Create a base temporary directory
-    base_path = tmp_path_factory.mktemp("almanack_community_health")
 
-    yield create_community_health_repository(base_path)
+    # Create a temporary directory for the session
+    temp_dir = tmp_path_factory.mktemp("community_health_repo")
+
+    yield repo_setup(
+        repo_path=pathlib.Path(temp_dir),
+        files={
+            "README.md": "# This is an example readme\n\nWelcome to our repo!",
+            "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
+            "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
+            "LICENSE.txt": "This is an example LICENSE file.",
+        },
+    )
+
+
+@pytest.fixture
+def repo_with_citation_in_readme(tmp_path):
+    """Create a temporary repository whose README.md contains only a "## Citation" heading."""
+    # Create a new repository in the temporary path
+    repo = pygit2.init_repository(tmp_path, bare=False)
+
+    # Set user.name and user.email in the config
+    set_repo_user_config(repo)
+
+    (tmp_path / "README.md").write_text("## Citation")
+
+    index = repo.index
+
+    index.add_all()
+    index.write()
+
+    author = repo.default_signature
+
+    tree = repo.index.write_tree()
+
+    repo.create_commit(
+        "refs/heads/main",
+        author,
+        author,
+        "Committing nested files",
+        tree,
+        [],
+    )
+    # set the head to the main branch
+    repo.set_head("refs/heads/main")
+
+    yield repo  # Provide the repository to the tests
diff --git a/tests/data/almanack/repo_setup/create_repo.py b/tests/data/almanack/repo_setup/create_repo.py
@@ -155,46 +155,45 @@ def create_entropy_repositories(base_path: pathlib.Path) -> None:
         commit_changes(repo_path, "Commit with added lines of code")
 
 
-def create_community_health_repository(base_path: pathlib.Path) -> str:
+def repo_setup(repo_path: pathlib.Path, files: dict) -> pygit2.Repository:
+    """
+    Set up a temporary repository with specified files.
 
-    filenames_and_contents = {
-        "README.md": "# This is an example readme\n\nWelcome to our repo!",
-        "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
-        "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
-        "LICENSE.txt": "This is an example LICENSE file.",
-    }
+    Args:
+        repo_path (pathlib.Path): The directory where the repo will be created.
+        files (dict): A dictionary where keys are filenames and values are their content.
 
-    repo_path = base_path / "community_health"
-    repo_path.mkdir(parents=True, exist_ok=True)
-    repo = pygit2.init_repository(path=str(repo_path), bare=False)
+    Returns:
+        pygit2.Repository: The initialized repository with files.
+    """
+    # Create a new repository in the temporary path
+    repo = pygit2.init_repository(repo_path, bare=False)
 
     # Set user.name and user.email in the config
     set_repo_user_config(repo)
 
-    for filename, content in filenames_and_contents.items():
-        # add content to each file based on the filenames and contents dict
-        with open((repo_path / filename).resolve(), "w") as f:
-            f.write(content)
+    # Create files in the repository
+    for filename, content in files.items():
+        (repo_path / filename).write_text(content)
 
-    # add all files to the index
-    repo.index.add_all()
-    # write the files to the index
-    repo.index.write()
+    # Stage and commit the files
+    index = repo.index
+    index.add_all()
+    index.write()
 
-    # create a tree for the index
-    tree = repo.index.write_tree()
-    # gather a default signature author
     author = repo.default_signature
+    tree = repo.index.write_tree()
+
     repo.create_commit(
         "refs/heads/main",
         author,
         author,
-        "Committing community health files",
+        "Initial commit with setup files",
         tree,
         [],
     )
 
-    # set the head to the main branch
+    # Set the head to the main branch
     repo.set_head("refs/heads/main")
 
-    return str(repo_path)
+    return repo