Skip to content

Commit

Permalink
Add citability measure (#120)
Browse files Browse the repository at this point in the history
* add functions for reading blobs; measure citation

* linting

* move blobs to git

* add test for the almanack itself

* Update src/almanack/metrics/metrics.yml

Co-authored-by: Gregory Way <[email protected]>

* add additional cases for citability in readmes

Co-Authored-By: Gregory Way <[email protected]>

---------

Co-authored-by: Gregory Way <[email protected]>
  • Loading branch information
d33bs and gwaybio authored Oct 23, 2024
1 parent 29c4530 commit 29ca220
Show file tree
Hide file tree
Showing 9 changed files with 422 additions and 129 deletions.
199 changes: 107 additions & 92 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pyyaml = "^6.0.1"
pygit2 = "^1.15.1"
fire = "^0.6.0"
tabulate = "^0.9.0"
charset-normalizer = "^3.4.0"

[tool.poetry.group.book.dependencies]
jupyter-book = "^1.0.0"
Expand Down
71 changes: 70 additions & 1 deletion src/almanack/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import pathlib
import tempfile
from typing import Dict, List
from typing import Dict, List, Optional

import pygit2
from charset_normalizer import from_bytes


def clone_repository(repo_url: str) -> pathlib.Path:
Expand Down Expand Up @@ -144,3 +145,71 @@ def get_most_recent_commits(repo_path: pathlib.Path) -> tuple[str, str]:
target_commit = commits[0] # Most recent

return str(source_commit.id), str(target_commit.id)


"""
Module for handling various tasks with git repo blobs.
"""


import pygit2


def detect_encoding(blob_data: bytes) -> str:
    """
    Detect the encoding of the given blob data using charset-normalizer.

    Args:
        blob_data (bytes): The raw bytes of the blob to analyze.

    Returns:
        str: The best detected encoding of the blob data.

    Raises:
        ValueError: If ``blob_data`` is empty or no encoding could be detected.
    """
    if not blob_data:
        raise ValueError("No data provided for encoding detection.")

    # Resolve the best match once and reuse it, rather than calling
    # ``best()`` a second time just to read the encoding.
    best_match = from_bytes(blob_data).best()
    if best_match is None:
        raise ValueError("Encoding could not be detected.")

    return best_match.encoding


def find_and_read_file(repo: pygit2.Repository, filename: str) -> Optional[str]:
    """
    Find a file in the root tree of the repository's HEAD commit and read it.

    Only the top-level entries of the HEAD commit's tree are inspected
    (subdirectories are not walked). The name comparison is an exact,
    case-insensitive match.

    Args:
        repo (pygit2.Repository): The repository to search.
        filename (str): The file name to look for (compared case-insensitively).

    Returns:
        Optional[str]: The decoded content of the first matching file, an
            empty string if that file is empty, or None if no matching file
            is found (including when the repository has no commits yet).

    Raises:
        ValueError: If the encoding of a non-empty file could not be detected.
    """

    # A repository without any commit has no tree to search
    if repo.head_is_unborn:
        return None

    # Get the tree associated with the latest commit
    tree = repo.head.peel().tree

    # Normalize the requested name once instead of once per tree entry
    wanted_name = filename.lower()

    # Find the first occurrence of a matching file (blobs only, no sub-trees)
    found_file = next(
        (
            entry
            for entry in tree
            if entry.type == pygit2.GIT_OBJECT_BLOB
            and entry.name.lower() == wanted_name
        ),
        None,
    )

    # If nothing matched, return early to avoid trying to read nothing
    if found_file is None:
        return None

    # Read the raw content of the matching blob
    blob_data: bytes = found_file.data

    # An empty file has nothing to decode, and detect_encoding rejects
    # empty input, so report it as empty text instead of raising.
    if not blob_data:
        return ""

    # Decode and return content as a string
    return blob_data.decode(detect_encoding(blob_data))
54 changes: 53 additions & 1 deletion src/almanack/metrics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pygit2
import yaml

from ..git import clone_repository, get_commits, get_edited_files
from ..git import clone_repository, find_and_read_file, get_commits, get_edited_files
from .entropy.calculate_entropy import (
calculate_aggregate_entropy,
calculate_normalized_entropy,
Expand Down Expand Up @@ -115,6 +115,57 @@ def file_exists_in_repo(
return False


def is_citable(repo: pygit2.Repository) -> bool:
    """
    Check if the given repository is citable.

    A repository is considered citable if it contains a CITATION.cff or
    CITATION.bib file, or if its README.md contains one of the following:

    - a markdown sub-header: "## Citation", "## Citing", "## Cite" or
      "## How to cite"
    - the equivalent RST sub-headers (title underlined with dashes)
    - a shields.io DOI badge

    Args:
        repo (pygit2.Repository): The repository to check for citation files.

    Returns:
        bool: True if the repository is citable, False otherwise.
    """

    # Check for a CITATION.cff or CITATION.bib file
    if file_exists_in_repo(
        repo=repo,
        expected_file_name="citation",
        check_extension=True,
        extensions=[".cff", ".bib"],
    ):
        return True

    # Look for a README.md file and read its content
    try:
        readme_content = find_and_read_file(repo=repo, filename="readme.md")
    except ValueError:
        # find_and_read_file raises ValueError when the README content cannot
        # be decoded; such a README cannot advertise a citation, so treat it
        # like a missing one instead of aborting the metric computation.
        readme_content = None

    if readme_content is None:
        return False

    # NOTE(review): the RST markers are only ever matched against README.md,
    # since that is the only file read above - confirm whether README.rst
    # should be inspected as well.
    citation_markers = (
        # markdown sub-headers
        "## Citation",
        "## Citing",
        "## Cite",
        "## How to cite",
        # RST sub-headers
        "Citation\n--------",
        "Citing\n------",
        "Cite\n----",
        "How to cite\n-----------",
        # DOI shield
        "[![DOI](https://img.shields.io/badge/DOI",
    )

    # Plain substring search: any single marker is enough
    return any(marker in readme_content for marker in citation_markers)


def compute_repo_data(repo_path: str) -> None:
"""
Computes comprehensive data for a GitHub repository.
Expand Down Expand Up @@ -183,6 +234,7 @@ def compute_repo_data(repo_path: str) -> None:
repo=repo,
expected_file_name="license",
),
"is-citable": is_citable(repo=repo),
"normalized_total_entropy": normalized_total_entropy,
"file_level_entropy": file_entropy,
}
Expand Down
7 changes: 7 additions & 0 deletions src/almanack/metrics/metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ metrics:
description: >-
Boolean value indicating the presence of a LICENSE file
in the repository.
- name: "is-citable"
id: "SGA-GL-0005"
result-type: "bool"
result-data-key: "is-citable"
description: >-
Boolean value indicating the presence of a CITATION file
or some other means of indicating how to cite the work.
- name: "agg-info-entropy"
id: "SGA-VS-0001"
result-type: "float"
Expand Down
53 changes: 49 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
import shutil
import subprocess

import pygit2
import pytest

from tests.data.almanack.repo_setup.create_repo import (
create_community_health_repository,
create_entropy_repositories,
repo_setup,
set_repo_user_config,
)

from .utils import check_subproc_run_for_nonzero
Expand Down Expand Up @@ -110,7 +112,50 @@ def community_health_repository_path(tmp_path_factory):
Fixture to call create_community_health_repository, create the repositories, then delete them
using the tmp_path_factory fixture to provide a temporary directory for tests.
"""
# Create a base temporary directory
base_path = tmp_path_factory.mktemp("almanack_community_health")

yield create_community_health_repository(base_path)
# Create a temporary directory for the session
temp_dir = tmp_path_factory.mktemp("community_health_repo")

yield repo_setup(
repo_path=pathlib.Path(temp_dir),
files={
"README.md": "# This is an example readme\n\nWelcome to our repo!",
"CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
"CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
"LICENSE.txt": "This is an example LICENSE file.",
},
)


@pytest.fixture
def repo_with_citation_in_readme(tmp_path):
    """
    Provide a temporary repository whose README.md holds a citation heading.

    The repository contains a single commit on ``refs/heads/main`` with one
    file, ``README.md``, whose content is ``## Citation``.

    Yields:
        pygit2.Repository: The initialized repository.
    """
    # Reuse the shared helper instead of duplicating the
    # init / stage / commit / set-head sequence here.
    yield repo_setup(
        repo_path=tmp_path,
        files={"README.md": "## Citation"},
    )
47 changes: 23 additions & 24 deletions tests/data/almanack/repo_setup/create_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,46 +155,45 @@ def create_entropy_repositories(base_path: pathlib.Path) -> None:
commit_changes(repo_path, "Commit with added lines of code")


def create_community_health_repository(base_path: pathlib.Path) -> str:
def repo_setup(repo_path: pathlib.Path, files: dict) -> pygit2.Repository:
"""
Set up a temporary repository with specified files.
filenames_and_contents = {
"README.md": "# This is an example readme\n\nWelcome to our repo!",
"CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
"CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
"LICENSE.txt": "This is an example LICENSE file.",
}
Args:
repo_path (pathlib.Path): The directory where the repo will be created.
files (dict): A dictionary where keys are filenames and values are their content.
repo_path = base_path / "community_health"
repo_path.mkdir(parents=True, exist_ok=True)
repo = pygit2.init_repository(path=str(repo_path), bare=False)
Returns:
pygit2.Repository: The initialized repository with files.
"""
# Create a new repository in the temporary path
repo = pygit2.init_repository(repo_path, bare=False)

# Set user.name and user.email in the config
set_repo_user_config(repo)

for filename, content in filenames_and_contents.items():
# add content to each file based on the filenames and contents dict
with open((repo_path / filename).resolve(), "w") as f:
f.write(content)
# Create files in the repository
for filename, content in files.items():
(repo_path / filename).write_text(content)

# add all files to the index
repo.index.add_all()
# write the files to the index
repo.index.write()
# Stage and commit the files
index = repo.index
index.add_all()
index.write()

# create a tree for the index
tree = repo.index.write_tree()
# gather a default signature author
author = repo.default_signature
tree = repo.index.write_tree()

repo.create_commit(
"refs/heads/main",
author,
author,
"Committing community health files",
"Initial commit with setup files",
tree,
[],
)

# set the head to the main branch
# Set the head to the main branch
repo.set_head("refs/heads/main")

return str(repo_path)
return repo
Loading

0 comments on commit 29ca220

Please sign in to comment.