Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add citability measure #120

Merged
merged 6 commits into from
Oct 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 107 additions & 92 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pyyaml = "^6.0.1"
pygit2 = "^1.15.1"
fire = "^0.6.0"
tabulate = "^0.9.0"
charset-normalizer = "^3.4.0"

[tool.poetry.group.book.dependencies]
jupyter-book = "^1.0.0"
Expand Down
71 changes: 70 additions & 1 deletion src/almanack/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import pathlib
import tempfile
from typing import Dict, List
from typing import Dict, List, Optional

import pygit2
from charset_normalizer import from_bytes


def clone_repository(repo_url: str) -> pathlib.Path:
Expand Down Expand Up @@ -144,3 +145,71 @@ def get_most_recent_commits(repo_path: pathlib.Path) -> tuple[str, str]:
target_commit = commits[0] # Most recent

return str(source_commit.id), str(target_commit.id)


"""
Module for handling various tasks with git repo blobs.
"""


import pygit2


def detect_encoding(blob_data: bytes) -> str:
    """
    Detect the encoding of the given blob data using charset-normalizer.

    Args:
        blob_data (bytes): The raw bytes of the blob to analyze.

    Returns:
        str: The encoding name of the best match reported by
        charset-normalizer for the blob data.

    Raises:
        ValueError: If `blob_data` is empty, or if charset-normalizer
            reports no usable match for it.
    """
    if not blob_data:
        raise ValueError("No data provided for encoding detection.")

    # Ask for the best match once and reuse it, rather than calling
    # best() a second time just to read the encoding.
    best_match = from_bytes(blob_data).best()
    if not best_match:
        raise ValueError("Encoding could not be detected.")

    return best_match.encoding


def find_and_read_file(repo: pygit2.Repository, filename: str) -> Optional[str]:
    """
    Find a file by name in the repository's HEAD tree and return its text.

    Only blob entries yielded by iterating the tree reached from HEAD are
    considered. The comparison is case-insensitive and must match the whole
    entry name; no wildcard or pattern matching is performed.

    Args:
        repo (pygit2.Repository): The repository to search.
        filename (str): The filename to look for (compared case-insensitively).

    Returns:
        Optional[str]: The decoded content of the first matching file
        (an empty string when that file is empty), or None if no entry
        matches.

    Raises:
        ValueError: Propagated from detect_encoding when the encoding of a
            non-empty file cannot be detected.
    """

    # Get the tree associated with the latest commit
    tree = repo.head.peel().tree

    # Lower-case the requested name once instead of on every comparison
    wanted_name = filename.lower()

    # Find the first occurrence of a matching file
    found_file: Optional[pygit2.Blob] = next(
        (
            entry
            for entry in tree
            if entry.type == pygit2.GIT_OBJECT_BLOB
            and entry.name.lower() == wanted_name
        ),
        None,
    )

    # Nothing matched: report that explicitly rather than trying to read
    if found_file is None:
        return None

    # Read the content of the first found blob
    blob_data: bytes = found_file.data

    # detect_encoding raises ValueError on empty input, so an empty file
    # (e.g. a zero-byte README.md) is answered with empty text directly.
    if not blob_data:
        return ""

    # Decode and return content as a string
    return blob_data.decode(detect_encoding(blob_data))
54 changes: 53 additions & 1 deletion src/almanack/metrics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pygit2
import yaml

from ..git import clone_repository, get_commits, get_edited_files
from ..git import clone_repository, find_and_read_file, get_commits, get_edited_files
from .entropy.calculate_entropy import (
calculate_aggregate_entropy,
calculate_normalized_entropy,
Expand Down Expand Up @@ -115,6 +115,57 @@ def file_exists_in_repo(
return False


def is_citable(repo: pygit2.Repository) -> bool:
    """
    Report whether the repository indicates how it should be cited.

    The repository is treated as citable when either condition holds:

    - file_exists_in_repo finds a "citation" file with a ".cff" or ".bib"
      extension, or
    - the text returned by find_and_read_file for "readme.md" contains one
      of the known citation markers: a markdown sub-header ("## Citation",
      "## Citing", "## Cite", "## How to cite"), the equivalent RST
      underlined sub-headers, or the start of a shields.io DOI badge.

    Args:
        repo (pygit2.Repository): The repository to check for citation
            information.

    Returns:
        bool: True if a citation file or marker is found, False otherwise.
    """
    citation_markers = (
        # markdown sub-headers
        "## Citation",
        "## Citing",
        "## Cite",
        "## How to cite",
        # RST sub-headers
        "Citation\n--------",
        "Citing\n------",
        "Cite\n----",
        "How to cite\n-----------",
        # DOI shield
        "[![DOI](https://img.shields.io/badge/DOI",
    )

    # A dedicated citation file settles the question immediately
    has_citation_file = file_exists_in_repo(
        repo=repo,
        expected_file_name="citation",
        check_extension=True,
        extensions=[".cff", ".bib"],
    )
    if has_citation_file:
        return True

    # Otherwise fall back to scanning the readme text, when there is one
    readme_text = find_and_read_file(repo=repo, filename="readme.md")
    if readme_text is None:
        return False

    return any(marker in readme_text for marker in citation_markers)


def compute_repo_data(repo_path: str) -> None:
"""
Computes comprehensive data for a GitHub repository.
Expand Down Expand Up @@ -183,6 +234,7 @@ def compute_repo_data(repo_path: str) -> None:
repo=repo,
expected_file_name="license",
),
"is-citable": is_citable(repo=repo),
"normalized_total_entropy": normalized_total_entropy,
"file_level_entropy": file_entropy,
}
Expand Down
7 changes: 7 additions & 0 deletions src/almanack/metrics/metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ metrics:
description: >-
Boolean value indicating the presence of a LICENSE file
in the repository.
- name: "is-citable"
id: "SGA-GL-0005"
result-type: "bool"
result-data-key: "is-citable"
description: >-
Boolean value indicating the presence of a CITATION file
or some other means of indicating how to cite the work.
- name: "agg-info-entropy"
id: "SGA-VS-0001"
result-type: "float"
Expand Down
53 changes: 49 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
import shutil
import subprocess

import pygit2
import pytest

from tests.data.almanack.repo_setup.create_repo import (
create_community_health_repository,
create_entropy_repositories,
repo_setup,
set_repo_user_config,
)

from .utils import check_subproc_run_for_nonzero
Expand Down Expand Up @@ -110,7 +112,50 @@ def community_health_repository_path(tmp_path_factory):
Fixture to call create_community_health_repository, create the repositories, then delete them
using the tmp_path_factory fixture to provide a temporary directory for tests.
"""
# Create a base temporary directory
base_path = tmp_path_factory.mktemp("almanack_community_health")

yield create_community_health_repository(base_path)
# Create a temporary directory for the session
temp_dir = tmp_path_factory.mktemp("community_health_repo")

yield repo_setup(
repo_path=pathlib.Path(temp_dir),
files={
"README.md": "# This is an example readme\n\nWelcome to our repo!",
"CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
"CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
"LICENSE.txt": "This is an example LICENSE file.",
},
)


@pytest.fixture
def repo_with_citation_in_readme(tmp_path):
    """
    Provide a repository whose only file is a README.md containing the
    text "## Citation".

    The repository is created in pytest's per-test ``tmp_path`` directory
    through the shared ``repo_setup`` helper, so the init / stage / commit
    sequence lives in one place instead of being repeated in this fixture.

    Yields:
        pygit2.Repository: The repository returned by ``repo_setup``.
    """
    yield repo_setup(
        repo_path=tmp_path,
        files={"README.md": "## Citation"},
    )
47 changes: 23 additions & 24 deletions tests/data/almanack/repo_setup/create_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,46 +155,45 @@ def create_entropy_repositories(base_path: pathlib.Path) -> None:
commit_changes(repo_path, "Commit with added lines of code")


def repo_setup(
    repo_path: pathlib.Path,
    files: dict,
    commit_message: str = "Initial commit with setup files",
) -> pygit2.Repository:
    """
    Set up a repository with specified files and a single commit.

    Args:
        repo_path (pathlib.Path): The directory where the repo will be
            created and the files written.
        files (dict): A dictionary where keys are filenames (relative to
            ``repo_path``) and values are their text content.
        commit_message (str): Message used for the commit that records
            the files. Defaults to "Initial commit with setup files".

    Returns:
        pygit2.Repository: The initialized repository with the files
        committed on ``refs/heads/main`` and HEAD set to that branch.
    """
    # Create a new repository in the given path
    repo = pygit2.init_repository(repo_path, bare=False)

    # Set user.name and user.email in the config
    set_repo_user_config(repo)

    # Create files in the repository
    for filename, content in files.items():
        (repo_path / filename).write_text(content)

    # Stage the files
    index = repo.index
    index.add_all()
    index.write()

    # Gather a default signature author and a tree for the staged index
    author = repo.default_signature
    tree = repo.index.write_tree()

    # Commit with no parents
    repo.create_commit(
        "refs/heads/main",
        author,
        author,
        commit_message,
        tree,
        [],
    )

    # Set the head to the main branch
    repo.set_head("refs/heads/main")

    return repo


def create_community_health_repository(base_path: pathlib.Path) -> str:
    """
    Create the "community_health" example repository under ``base_path``.

    Kept as a thin wrapper around ``repo_setup`` so existing imports of
    this name continue to work.

    Args:
        base_path (pathlib.Path): Directory under which the
            "community_health" repository directory is created.

    Returns:
        str: The path of the created repository.
    """
    filenames_and_contents = {
        "README.md": "# This is an example readme\n\nWelcome to our repo!",
        "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
        "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
        "LICENSE.txt": "This is an example LICENSE file.",
    }

    repo_path = base_path / "community_health"
    repo_path.mkdir(parents=True, exist_ok=True)

    repo_setup(
        repo_path=repo_path,
        files=filenames_and_contents,
        commit_message="Committing community health files",
    )

    return str(repo_path)
Loading